
refactor: Initial and dirty refactor to replace link with snapshot. Barely functional add command

This commit is contained in:
Cristian 2020-12-23 14:51:42 -05:00
parent 8e2270e21b
commit 8c4ae73d65
13 changed files with 246 additions and 233 deletions
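
For orientation, here is a summary of the main renames this commit introduces, collected from the hunks below. The dict is an editorial aid only, not code from the repository, and it lists a representative subset rather than every changed symbol.

# Old Link-based names on the left, new Snapshot-based names on the right.
RENAMES = {
    "archive_link":            "archive_snapshot",
    "archive_links":           "archive_snapshots",
    "write_link_details":      "write_snapshot_details",
    "load_link_details":       "load_snapshot_details",
    "write_link_to_sql_index": "write_snapshot_to_index",
    "write_sql_link_details":  "write_sql_snapshot_details",
    "parse_links_from_source": "parse_snapshots_from_source",
    "dedupe_links":            "filter_new_urls",
    "Link.link_dir":           "Snapshot.snapshot_dir",
}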

View file

@ -21,7 +21,7 @@ from util import htmldecode, urldecode, ansi_to_html
from logging_util import printable_filesize
from main import add, remove
from config import OUTPUT_DIR
from extractors import archive_links
from extractors import archive_snapshots
# TODO: https://stackoverflow.com/questions/40760880/add-custom-button-to-django-admin-panel

View file

@ -1,6 +1,7 @@
__package__ = 'archivebox.core'
import uuid
from pathlib import Path
from django.db import models, transaction
from django.utils.functional import cached_property
@ -9,9 +10,10 @@ from django.db.models import Case, When, Value, IntegerField
from ..util import parse_date
from ..index.schema import Link
from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE
from ..config import CONFIG
EXTRACTORS = [(extractor[0], extractor[0]) for extractor in get_default_archive_methods()]
#EXTRACTORS = [(extractor[0], extractor[0]) for extractor in get_default_archive_methods()]
EXTRACTORS = ["title", "wget"]
STATUS_CHOICES = [
("succeeded", "succeeded"),
("failed", "failed"),
@ -89,6 +91,7 @@ class Snapshot(models.Model):
title = self.title or '-'
return f'[{self.timestamp}] {self.url[:64]} ({title[:64]})'
@classmethod
def from_json(cls, info: dict):
info = {k: v for k, v in info.items() if k in cls.keys}
@ -133,8 +136,9 @@ class Snapshot(models.Model):
return self.as_link().base_url
@cached_property
def link_dir(self):
return self.as_link().link_dir
def snapshot_dir(self):
from ..config import CONFIG
return str(Path(CONFIG['ARCHIVE_DIR']) / self.timestamp)
@cached_property
def archive_path(self):
@ -158,6 +162,16 @@ class Snapshot(models.Model):
return self.history['title'][-1].output.strip()
return None
def _asdict(self):
return {
"id": str(self.id),
"url": self.url,
"timestamp": self.timestamp,
"title": self.title,
"added": self.added,
"updated": self.updated,
}
def save_tags(self, tags=()):
tags_id = []
for tag in tags:
@ -168,6 +182,7 @@ class Snapshot(models.Model):
class ArchiveResultManager(models.Manager):
def indexable(self, sorted: bool = True):
from ..extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE
INDEXABLE_METHODS = [ r[0] for r in ARCHIVE_METHODS_INDEXING_PRECEDENCE ]
qs = self.get_queryset().filter(extractor__in=INDEXABLE_METHODS,status='succeeded')
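
A standalone sketch (not ArchiveBox code) of what the two additions above provide: snapshot_dir derives the archive folder from the snapshot's timestamp via CONFIG['ARCHIVE_DIR'], and _asdict() is the plain-dict form that later gets written to the per-snapshot JSON index. The concrete values are made up for illustration.

from pathlib import Path

ARCHIVE_DIR = "/data/archive"        # stands in for CONFIG['ARCHIVE_DIR']
timestamp = "1608752502.0"           # timestamps double as directory names

snapshot_dir = str(Path(ARCHIVE_DIR) / timestamp)   # -> "/data/archive/1608752502.0"
record = {
    "id": "<uuid>",                  # str(self.id)
    "url": "https://example.com",
    "timestamp": timestamp,
    "title": "Example Domain",
    "added": "2020-12-23T14:51:42",  # datetime fields; JSON serialization happens elsewhere
    "updated": None,
}
print(snapshot_dir, record["url"])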

View file

@ -4,19 +4,20 @@ import os
from pathlib import Path
from typing import Optional, List, Iterable, Union
from datetime import datetime
from django.db.models import QuerySet
from ..index.schema import Link
from ..index.sql import write_link_to_sql_index
from datetime import datetime
from django.db.models import QuerySet, Model
from ..index.sql import write_snapshot_to_index
from ..index import (
load_link_details,
write_link_details,
load_snapshot_details,
write_snapshot_details,
)
from ..util import enforce_types
from ..logging_util import (
log_archiving_started,
log_archiving_paused,
log_archiving_finished,
log_link_archiving_started,
log_link_archiving_finished,
@ -67,15 +68,9 @@ def ignore_methods(to_ignore: List[str]):
return list(methods)
@enforce_types
def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> Link:
def archive_snapshot(snapshot: Model, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> Model:
"""download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
# TODO: Remove when the input is changed to be a snapshot. Suboptimal approach.
from core.models import Snapshot, ArchiveResult
try:
snapshot = Snapshot.objects.get(url=link.url) # TODO: This will be unnecessary once everything is a snapshot
except Snapshot.DoesNotExist:
snapshot = write_link_to_sql_index(link)
from core.models import ArchiveResult
ARCHIVE_METHODS = get_default_archive_methods()
@ -85,33 +80,34 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
if method[0] in methods
]
out_dir = out_dir or Path(link.link_dir)
out_dir = out_dir or Path(snapshot.snapshot_dir)
try:
is_new = not Path(out_dir).exists()
if is_new:
os.makedirs(out_dir)
details = {"history": {}}
write_snapshot_details(snapshot, out_dir=out_dir, skip_sql_index=False)
else:
details = list(load_snapshot_details(snapshot))
link = load_link_details(link, out_dir=out_dir)
write_link_details(link, out_dir=out_dir, skip_sql_index=False)
log_link_archiving_started(link, out_dir, is_new)
link = link.overwrite(updated=datetime.now())
#log_link_archiving_started(link, out_dir, is_new)
stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
for method_name, should_run, method_function in ARCHIVE_METHODS:
try:
if method_name not in link.history:
link.history[method_name] = []
if method_name not in details["history"]:
details["history"][method_name] = []
if should_run(link, out_dir) or overwrite:
if should_run(snapshot, out_dir) or overwrite:
log_archive_method_started(method_name)
result = method_function(link=link, out_dir=out_dir)
result = method_function(snapshot=snapshot, out_dir=out_dir)
link.history[method_name].append(result)
details["history"][method_name].append(result)
stats[result.status] += 1
log_archive_method_finished(result)
write_search_index(link=link, texts=result.index_texts)
write_search_index(snapshot=snapshot, texts=result.index_texts)
ArchiveResult.objects.create(snapshot=snapshot, extractor=method_name, cmd=result.cmd, cmd_version=result.cmd_version,
output=result.output, pwd=result.pwd, start_ts=result.start_ts, end_ts=result.end_ts, status=result.status)
@ -121,7 +117,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
except Exception as e:
raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format(
method_name,
link.url,
snapshot.url,
)) from e
# print(' ', stats)
@ -129,17 +125,17 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
try:
latest_title = link.history['title'][-1].output.strip()
if latest_title and len(latest_title) >= len(link.title or ''):
link = link.overwrite(title=latest_title)
snapshot.title = latest_title
except Exception:
pass
write_link_details(link, out_dir=out_dir, skip_sql_index=False)
write_snapshot_details(snapshot, out_dir=out_dir, skip_sql_index=False)
log_link_archiving_finished(link, link.link_dir, is_new, stats)
log_link_archiving_finished(snapshot, snapshot.snapshot_dir, is_new, stats)
except KeyboardInterrupt:
try:
write_link_details(link, out_dir=link.link_dir)
write_snapshot_details(snapshot, out_dir=link.link_dir)
except:
pass
raise
@ -148,35 +144,29 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
raise
return link
return snapshot
@enforce_types
def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> List[Link]:
def archive_snapshots(all_snapshots: Union[QuerySet, List[Model]], overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> QuerySet:
if type(all_links) is QuerySet:
num_links: int = all_links.count()
get_link = lambda x: x.as_link()
all_links = all_links.iterator()
else:
num_links: int = len(all_links)
get_link = lambda x: x
all_snapshots = list(all_snapshots)
num_snapshots: int = len(all_snapshots)
if num_links == 0:
if num_snapshots == 0:
return []
log_archiving_started(num_links)
log_archiving_started(num_snapshots)
idx: int = 0
try:
for link in all_links:
for snapshot in all_snapshots:
idx += 1
to_archive = get_link(link)
archive_link(to_archive, overwrite=overwrite, methods=methods, out_dir=Path(link.link_dir))
archive_snapshot(snapshot, overwrite=overwrite, methods=methods, out_dir=Path(snapshot.snapshot_dir))
except KeyboardInterrupt:
log_archiving_paused(num_links, idx, link.timestamp)
log_archiving_paused(num_snapshots, idx, snapshot.timestamp)
raise SystemExit(0)
except BaseException:
print()
raise
log_archiving_finished(num_links)
return all_links
log_archiving_finished(num_snapshots)
return all_snapshots
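
A usage sketch for the renamed entry points above, assuming ArchiveBox's Django setup has already run so the ORM is available; error handling is omitted and the URL is a placeholder.

from pathlib import Path
from core.models import Snapshot
from archivebox.extractors import archive_snapshot, archive_snapshots

# Re-run just the 'title' and 'wget' extractors for one snapshot, writing into
# its snapshot_dir (formerly link.link_dir):
snap = Snapshot.objects.get(url="https://example.com")
archive_snapshot(snap, overwrite=True, methods=["title", "wget"],
                 out_dir=Path(snap.snapshot_dir))

# Or walk everything currently in the index with the default methods:
archive_snapshots(Snapshot.objects.all())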

View file

@ -5,7 +5,9 @@ from html.parser import HTMLParser
from pathlib import Path
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from django.db.models import Model
from ..index.schema import ArchiveResult, ArchiveOutput, ArchiveError
from ..util import (
enforce_types,
is_static_file,
@ -61,12 +63,12 @@ class TitleParser(HTMLParser):
@enforce_types
def should_save_title(link: Link, out_dir: Optional[str]=None) -> bool:
def should_save_title(snapshot: Model, out_dir: Optional[str]=None) -> bool:
# if link already has valid title, skip it
if link.title and not link.title.lower().startswith('http'):
if snapshot.title and not snapshot.title.lower().startswith('http'):
return False
if is_static_file(link.url):
if is_static_file(snapshot.url):
return False
return SAVE_TITLE
@ -77,7 +79,7 @@ def extract_title_with_regex(html):
return output
@enforce_types
def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
def save_title(snapshot: Model, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
"""try to guess the page's title from its content"""
from core.models import Snapshot
@ -89,12 +91,12 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -
'--max-time', str(timeout),
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
link.url,
snapshot.url,
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
html = download_url(link.url, timeout=timeout)
html = download_url(snapshot.url, timeout=timeout)
try:
# try using relatively strict html parser first
parser = TitleParser()
@ -108,10 +110,11 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -
# if title is better than the one in the db, update db with new title
if isinstance(output, str) and output:
if not link.title or len(output) >= len(link.title):
Snapshot.objects.filter(url=link.url,
timestamp=link.timestamp)\
if not snapshot.title or len(output) >= len(snapshot.title):
Snapshot.objects.filter(url=snapshot.url,
timestamp=snapshot.timestamp)\
.update(title=output)
snapshot.title = output
else:
raise ArchiveError('Unable to detect page title')
except Exception as err:
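
A standalone sketch of the title bookkeeping shown above in should_save_title and save_title: only fetch a title when the stored one is missing or still looks like a bare URL, and only replace it when the newly extracted title is at least as long.

def needs_title(stored_title):
    return not stored_title or stored_title.lower().startswith('http')

def pick_title(stored_title, extracted_title):
    if extracted_title and (not stored_title or len(extracted_title) >= len(stored_title)):
        return extracted_title
    return stored_title

assert needs_title(None) and needs_title("https://example.com")
assert pick_title("Example", "Example Domain") == "Example Domain"
assert pick_title("Example Domain, full title", "Example") == "Example Domain, full title"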

View file

@ -10,7 +10,7 @@ from typing import List, Tuple, Dict, Optional, Iterable
from collections import OrderedDict
from contextlib import contextmanager
from urllib.parse import urlparse
from django.db.models import QuerySet, Q
from django.db.models import QuerySet, Q, Model
from ..util import (
scheme,
@ -39,15 +39,15 @@ from ..logging_util import (
from .schema import Link, ArchiveResult
from .html import (
write_html_link_details,
write_html_snapshot_details,
)
from .json import (
parse_json_link_details,
write_json_link_details,
parse_json_snapshot_details,
write_json_snapshot_details,
)
from .sql import (
write_sql_main_index,
write_sql_link_details,
write_sql_snapshot_details,
)
from ..search import search_backend_enabled, query_search_index
@ -55,10 +55,12 @@ from ..search import search_backend_enabled, query_search_index
### Link filtering and checking
@enforce_types
def merge_links(a: Link, b: Link) -> Link:
"""deterministially merge two links, favoring longer field values over shorter,
def merge_snapshots(a: Model, b: Model) -> Model:
"""deterministially merge two snapshots, favoring longer field values over shorter,
and "cleaner" values over worse ones.
TODO: Check if this makes sense with the new setup
"""
return a
assert a.base_url == b.base_url, f'Cannot merge two links with different URLs ({a.base_url} != {b.base_url})'
# longest url wins (because a fuzzy url will always be shorter)
@ -109,55 +111,55 @@ def merge_links(a: Link, b: Link) -> Link:
key=lambda result: result.start_ts,
)))
return Link(
return Snapshot(
url=url,
timestamp=timestamp,
title=title,
tags=tags,
sources=sources,
history=history,
#sources=sources,
#history=history,
)
@enforce_types
def validate_links(links: Iterable[Link]) -> List[Link]:
def validate_snapshots(snapshots: List[Model]) -> List[Model]:
timer = TimedProgress(TIMEOUT * 4)
try:
links = archivable_links(links) # remove chrome://, about:, mailto: etc.
links = sorted_links(links) # deterministically sort the links based on timestamp, url
links = fix_duplicate_links(links) # merge/dedupe duplicate timestamps & urls
snapshots = archivable_snapshots(snapshots) # remove chrome://, about:, mailto: etc.
snapshots = sorted_snapshots(snapshots) # deterministically sort the links based on timestamp, url
snapshots = fix_duplicate_snapshots(snapshots) # merge/dedupe duplicate timestamps & urls
finally:
timer.end()
return list(links)
return list(snapshots)
@enforce_types
def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
def archivable_snapshots(snapshots: Iterable[Model]) -> Iterable[Model]:
"""remove chrome://, about:// or other schemed links that cant be archived"""
for link in links:
for snapshot in snapshots:
try:
urlparse(link.url)
urlparse(snapshot.url)
except ValueError:
continue
if scheme(link.url) not in ('http', 'https', 'ftp'):
if scheme(snapshot.url) not in ('http', 'https', 'ftp'):
continue
if URL_BLACKLIST_PTN and URL_BLACKLIST_PTN.search(link.url):
if URL_BLACKLIST_PTN and URL_BLACKLIST_PTN.search(snapshot.url):
continue
yield link
yield snapshot
@enforce_types
def fix_duplicate_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
def fix_duplicate_snapshots(sorted_snapshots: Iterable[Model]) -> Iterable[Model]:
"""
ensures that all non-duplicate links have monotonically increasing timestamps
TODO: Review how to do this with the new snapshots refactor
"""
# from core.models import Snapshot
return sorted_snapshots
unique_urls: OrderedDict[str, Link] = OrderedDict()
for link in sorted_links:
if link.url in unique_urls:
for snapshot in sorted_snapshots:
if snapshot.url in unique_urls:
# merge with any other links that share the same url
link = merge_links(unique_urls[link.url], link)
unique_urls[link.url] = link
@ -166,9 +168,9 @@ def fix_duplicate_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
@enforce_types
def sorted_links(links: Iterable[Link]) -> Iterable[Link]:
sort_func = lambda link: (link.timestamp.split('.', 1)[0], link.url)
return sorted(links, key=sort_func, reverse=True)
def sorted_snapshots(snapshots: Iterable[Model]) -> Iterable[Model]:
sort_func = lambda snapshot: (snapshot.timestamp.split('.', 1)[0], snapshot.url)
return sorted(snapshots, key=sort_func, reverse=True)
@enforce_types
@ -222,14 +224,14 @@ def timed_index_update(out_path: Path):
@enforce_types
def write_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None:
def write_main_index(snapshots: List[Model], out_dir: Path=OUTPUT_DIR) -> None:
"""Writes links to sqlite3 file for a given list of links"""
log_indexing_process_started(len(links))
log_indexing_process_started(len(snapshots))
try:
with timed_index_update(out_dir / SQL_INDEX_FILENAME):
write_sql_main_index(links, out_dir=out_dir)
write_sql_main_index(snapshots, out_dir=out_dir)
os.chmod(out_dir / SQL_INDEX_FILENAME, int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes
except (KeyboardInterrupt, SystemExit):
@ -244,7 +246,10 @@ def write_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None:
@enforce_types
def load_main_index(out_dir: Path=OUTPUT_DIR, warn: bool=True) -> List[Link]:
"""parse and load existing index with any new links from import_path merged in"""
"""
Returns all of the snapshots currently in index
"""
setup_django(out_dir, check_db=True)
from core.models import Snapshot
try:
return Snapshot.objects.all()
@ -265,88 +270,62 @@ def load_main_index_meta(out_dir: Path=OUTPUT_DIR) -> Optional[dict]:
@enforce_types
def parse_links_from_source(source_path: str, root_url: Optional[str]=None) -> Tuple[List[Link], List[Link]]:
def parse_snapshots_from_source(source_path: str, root_url: Optional[str]=None) -> List[Model]:
from ..parsers import parse_links
from ..parsers import parse_snapshots
new_links: List[Link] = []
new_links: List[Model] = []
# parse and validate the import file
raw_links, parser_name = parse_links(source_path, root_url=root_url)
new_links = validate_links(raw_links)
raw_snapshots, parser_name = parse_snapshots(source_path, root_url=root_url)
new_snapshots = validate_snapshots(raw_snapshots)
if parser_name:
num_parsed = len(raw_links)
num_parsed = len(raw_snapshots)
log_parsing_finished(num_parsed, parser_name)
return new_links
return new_snapshots
@enforce_types
def fix_duplicate_links_in_index(snapshots: QuerySet, links: Iterable[Link]) -> Iterable[Link]:
def filter_new_urls(snapshots: QuerySet,
new_snapshots: List) -> List:
"""
Given a list of in-memory Links, dedupe and merge them with any conflicting Snapshots in the DB.
Returns a list of Snapshots corresponding to the urls that were not present in the index
"""
unique_urls: OrderedDict[str, Link] = OrderedDict()
urls = {snapshot.url: snapshot for snapshot in new_snapshots}
filtered_snapshots = snapshots.filter(url__in=urls.keys())
for link in links:
index_link = snapshots.filter(url=link.url)
if index_link:
link = merge_links(index_link[0].as_link(), link)
unique_urls[link.url] = link
return unique_urls.values()
@enforce_types
def dedupe_links(snapshots: QuerySet,
new_links: List[Link]) -> List[Link]:
"""
The validation of links happened at a different stage. This method will
focus on actual deduplication and timestamp fixing.
"""
for found_snapshot in filtered_snapshots:
urls.pop(found_snapshot.url)
# merge existing links in out_dir and new links
dedup_links = fix_duplicate_links_in_index(snapshots, new_links)
log_deduping_finished(len(urls.keys()))
new_links = [
link for link in new_links
if not snapshots.filter(url=link.url).exists()
]
dedup_links_dict = {link.url: link for link in dedup_links}
# Replace links in new_links with the dedup version
for i in range(len(new_links)):
if new_links[i].url in dedup_links_dict.keys():
new_links[i] = dedup_links_dict[new_links[i].url]
log_deduping_finished(len(new_links))
return new_links
return list(urls.values())
### Link Details Index
@enforce_types
def write_link_details(link: Link, out_dir: Optional[str]=None, skip_sql_index: bool=False) -> None:
out_dir = out_dir or link.link_dir
def write_snapshot_details(snapshot: List[Model], out_dir: Optional[str]=None, skip_sql_index: bool=False) -> None:
out_dir = out_dir or snapshot.snapshot_dir
write_json_link_details(link, out_dir=out_dir)
write_html_link_details(link, out_dir=out_dir)
write_json_snapshot_details(snapshot, out_dir=out_dir)
#write_html_snapshot_details(snapshot, out_dir=out_dir) TODO: Refactor html code too
if not skip_sql_index:
write_sql_link_details(link)
write_sql_snapshot_details(snapshot)
@enforce_types
def load_link_details(link: Link, out_dir: Optional[str]=None) -> Link:
def load_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) -> Model:
"""check for an existing link archive in the given directory,
and load+merge it into the given link dict
"""
out_dir = out_dir or link.link_dir
out_dir = out_dir or snapshot.snapshot_dir
existing_link = parse_json_link_details(out_dir)
if existing_link:
return merge_links(existing_link, link)
existing_snapshot = parse_json_snapshot_details(out_dir)
if existing_snapshot:
return merge_snapshots(existing_snapshot, snapshot)
return link
return snapshot
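
A standalone re-implementation sketch of the filter_new_urls logic above, using plain dicts instead of a Django QuerySet, to make the dedupe rule explicit: keep only incoming snapshots whose URL is not already present in the index, with the last duplicate of a URL winning.

def filter_new_urls_sketch(indexed_urls, new_snapshots):
    by_url = {s["url"]: s for s in new_snapshots}   # later duplicates overwrite earlier ones
    for url in indexed_urls:
        by_url.pop(url, None)                       # drop anything already in the index
    return list(by_url.values())

existing = {"https://example.com"}
incoming = [{"url": "https://example.com"}, {"url": "https://example.org"}]
assert filter_new_urls_sketch(existing, incoming) == [{"url": "https://example.org"}]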

View file

@ -5,6 +5,7 @@ from typing import List, Optional, Iterator, Mapping
from pathlib import Path
from django.utils.html import format_html
from django.db.models import Model
from collections import defaultdict
from .schema import Link
@ -71,8 +72,8 @@ def main_index_template(links: List[Link], template: str=MAIN_INDEX_TEMPLATE) ->
### Link Details Index
@enforce_types
def write_html_link_details(link: Link, out_dir: Optional[str]=None) -> None:
out_dir = out_dir or link.link_dir
def write_html_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) -> None:
out_dir = out_dir or snapshot.snapshot_dir
rendered_html = link_details_template(link)
atomic_write(str(Path(out_dir) / HTML_INDEX_FILENAME), rendered_html)

View file

@ -7,6 +7,7 @@ from pathlib import Path
from datetime import datetime
from typing import List, Optional, Iterator, Any, Union
from django.db.models import Model
from .schema import Link
from ..system import atomic_write
@ -81,16 +82,17 @@ def parse_json_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]:
### Link Details Index
@enforce_types
def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None:
"""write a json file with some info about the link"""
def write_json_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) -> None:
"""write a json file with some info about the snapshot"""
out_dir = out_dir or link.link_dir
out_dir = out_dir or snapshot.snapshot_dir
path = Path(out_dir) / JSON_INDEX_FILENAME
atomic_write(str(path), link._asdict(extended=True))
print(snapshot._asdict())
atomic_write(str(path), snapshot._asdict())
@enforce_types
def parse_json_link_details(out_dir: Union[Path, str], guess: Optional[bool]=False) -> Optional[Link]:
def parse_json_link_details(out_dir: Union[Path, str], guess: Optional[bool]=False) -> Optional[Model]:
"""load the json link index from a given directory"""
existing_index = Path(out_dir) / JSON_INDEX_FILENAME
if existing_index.exists():
@ -102,16 +104,31 @@ def parse_json_link_details(out_dir: Union[Path, str], guess: Optional[bool]=Fal
pass
return None
@enforce_types
def load_snapshot_details(snapshot: Model, out_dir: Path):
"""
Loads the detail from the local json index
"""
existing_index = Path(out_dir) / JSON_INDEX_FILENAME
if existing_index.exists():
with open(existing_index, 'r', encoding='utf-8') as f:
try:
return pyjson.load(f)
except pyjson.JSONDecodeError:
pass
return None
@enforce_types
def parse_json_links_details(out_dir: Union[Path, str]) -> Iterator[Link]:
def parse_json_snapshot_details(out_dir: Union[Path, str]) -> Iterator[Link]:
"""read through all the archive data folders and return the parsed links"""
for entry in os.scandir(Path(out_dir) / ARCHIVE_DIR_NAME):
if entry.is_dir(follow_symlinks=True):
if (Path(entry.path) / 'index.json').exists():
try:
link = parse_json_link_details(entry.path)
link = parse_json_snapshot_details(entry.path)
except KeyError:
link = None
if link:

View file

@ -3,8 +3,9 @@ __package__ = 'archivebox.index'
from io import StringIO
from pathlib import Path
from typing import List, Tuple, Iterator
from django.db.models import QuerySet
from django.db.models import QuerySet, Model
from django.db import transaction
from datetime import datetime
from .schema import Link
from ..util import enforce_types
@ -28,21 +29,20 @@ def remove_from_sql_main_index(snapshots: QuerySet, out_dir: Path=OUTPUT_DIR) ->
snapshots.delete()
@enforce_types
def write_link_to_sql_index(link: Link):
def write_snapshot_to_index(snapshot: Model):
from core.models import Snapshot
info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys}
tags = info.pop("tags")
if tags is None:
tags = []
try:
info["timestamp"] = Snapshot.objects.get(url=link.url).timestamp
timestamp = Snapshot.objects.get(url=snapshot.url).timestamp
except Snapshot.DoesNotExist:
while Snapshot.objects.filter(timestamp=info["timestamp"]).exists():
info["timestamp"] = str(float(info["timestamp"]) + 1.0)
timestamp = snapshot.timestamp
if not timestamp:
timestamp = str(datetime.now().timestamp())
while Snapshot.objects.filter(timestamp=timestamp).exists():
print("the timestamp is: ", timestamp)
timestamp = str(float(timestamp) + 1.0)
snapshot, _ = Snapshot.objects.update_or_create(url=link.url, defaults=info)
snapshot.save_tags(tags)
snapshot.timestamp = timestamp
snapshot.save()
return snapshot
@ -50,27 +50,29 @@ def write_link_to_sql_index(link: Link):
def write_sql_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None:
with transaction.atomic():
for link in links:
write_link_to_sql_index(link)
write_snapshot_to_index(link)
@enforce_types
def write_sql_link_details(link: Link, out_dir: Path=OUTPUT_DIR) -> None:
def write_sql_snapshot_details(snapshot: Model, out_dir: Path=OUTPUT_DIR) -> None:
from core.models import Snapshot
with transaction.atomic():
try:
snap = Snapshot.objects.get(url=link.url)
snap = Snapshot.objects.get(url=snapshot.url)
except Snapshot.DoesNotExist:
snap = write_link_to_sql_index(link)
snap.title = link.title
snap = write_snapshot_to_sql_index(snapshot)
snap.title = snapshot.title
tag_set = (
set(tag.strip() for tag in (link.tags or '').split(','))
)
tag_list = list(tag_set) or []
# TODO: If there are actual tags, this will break
#tag_set = (
# set(tag.strip() for tag in (snapshot.tags.all() or '').split(','))
#)
#tag_list = list(tag_set) or []
snap.save()
snap.save_tags(tag_list)
#snap.save_tags(tag_list)
return snap
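
A standalone sketch of the timestamp handling in write_snapshot_to_index above: reuse the timestamp of an existing row for a known URL, otherwise start from the snapshot's own timestamp (or now()), and bump by one second until it is unique, since the timestamp doubles as the archive directory name.

from datetime import datetime

def assign_timestamp(url, existing_by_url, taken_timestamps, proposed=None):
    if url in existing_by_url:
        return existing_by_url[url]                 # keep the indexed row's timestamp
    timestamp = proposed or str(datetime.now().timestamp())
    while timestamp in taken_timestamps:            # stands in for the .exists() query
        timestamp = str(float(timestamp) + 1.0)
    return timestamp

taken = {"1608750000.0"}
assert assign_timestamp("https://example.org", {}, taken, proposed="1608750000.0") == "1608750001.0"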

View file

@ -29,8 +29,9 @@ from .util import enforce_types # type: ignore
from .system import get_dir_size, dedupe_cron_jobs, CRON_COMMENT
from .index import (
load_main_index,
parse_links_from_source,
dedupe_links,
get_empty_snapshot_queryset,
parse_snapshots_from_source,
filter_new_urls,
write_main_index,
snapshot_filter,
get_indexed_folders,
@ -44,11 +45,11 @@ from .index import (
get_corrupted_folders,
get_unrecognized_folders,
fix_invalid_folder_locations,
write_link_details,
write_snapshot_details,
)
from .index.json import (
parse_json_main_index,
parse_json_links_details,
parse_json_snapshot_details,
generate_json_index_from_links,
)
from .index.sql import (
@ -60,7 +61,7 @@ from .index.html import (
generate_index_from_links,
)
from .index.csv import links_to_csv
from .extractors import archive_links, archive_link, ignore_methods
from .extractors import archive_snapshots, archive_snapshot, ignore_methods
from .config import (
stderr,
hint,
@ -538,6 +539,7 @@ def add(urls: Union[str, List[str]],
extractors: str="",
out_dir: Path=OUTPUT_DIR) -> List[Link]:
"""Add a new URL or list of URLs to your archive"""
from core.models import Snapshot
assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'
@ -549,8 +551,8 @@ def add(urls: Union[str, List[str]],
# Load list of links from the existing index
check_data_folder(out_dir=out_dir)
check_dependencies()
new_links: List[Link] = []
all_links = load_main_index(out_dir=out_dir)
new_snapshots: List[Snapshot] = []
all_snapshots = load_main_index(out_dir=out_dir)
log_importing_started(urls=urls, depth=depth, index_only=index_only)
if isinstance(urls, str):
@ -560,20 +562,21 @@ def add(urls: Union[str, List[str]],
# save verbatim args to sources
write_ahead_log = save_text_as_source('\n'.join(urls), filename='{ts}-import.txt', out_dir=out_dir)
new_links += parse_links_from_source(write_ahead_log, root_url=None)
new_snapshots += parse_snapshots_from_source(write_ahead_log, root_url=None)
# If we're going one level deeper, download each link and look for more links
new_links_depth = []
if new_links and depth == 1:
log_crawl_started(new_links)
for new_link in new_links:
downloaded_file = save_file_as_source(new_link.url, filename=f'{new_link.timestamp}-crawl-{new_link.domain}.txt', out_dir=out_dir)
new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)
new_snapshots_depth = []
if new_snapshots and depth == 1:
log_crawl_started(new_snapshots)
for new_snapshot in new_snapshots:
# TODO: Check if we need to add domain to the Snapshot model
downloaded_file = save_file_as_source(new_snapshot.url, filename=f'{new_snapshot.timestamp}-crawl-{new_snapshot.url}.txt', out_dir=out_dir)
new_snapshots_depth += parse_links_from_source(downloaded_file, root_url=new_snapshot.url)
imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values())
new_links = dedupe_links(all_links, imported_links)
imported_snapshots = [Snapshot(url=snapshot.url) for snapshot in new_snapshots + new_snapshots_depth]
new_snapshots = filter_new_urls(all_snapshots, imported_snapshots)
write_main_index(links=new_links, out_dir=out_dir)
write_main_index(snapshots=new_snapshots, out_dir=out_dir)
all_links = load_main_index(out_dir=out_dir)
if index_only:
@ -586,13 +589,13 @@ def add(urls: Union[str, List[str]],
if extractors:
archive_kwargs["methods"] = extractors
if update_all:
archive_links(all_links, overwrite=overwrite, **archive_kwargs)
archive_snapshots(all_snapshots, overwrite=overwrite, **archive_kwargs)
elif overwrite:
archive_links(imported_links, overwrite=True, **archive_kwargs)
elif new_links:
archive_links(new_links, overwrite=False, **archive_kwargs)
archive_snapshots(imported_snapshots, overwrite=True, **archive_kwargs)
elif new_snapshots:
archive_snapshots(new_snapshots, overwrite=False, **archive_kwargs)
return all_links
return all_snapshots
@enforce_types
def remove(filter_str: Optional[str]=None,
@ -711,7 +714,7 @@ def update(resume: Optional[float]=None,
if index_only:
for link in all_links:
write_link_details(link, out_dir=out_dir, skip_sql_index=True)
write_snapshot_details(link, out_dir=out_dir, skip_sql_index=True)
index_links(all_links, out_dir=out_dir)
return all_links
@ -733,7 +736,7 @@ def update(resume: Optional[float]=None,
if extractors:
archive_kwargs["methods"] = extractors
archive_links(to_archive, overwrite=overwrite, **archive_kwargs)
archive_snapshots(to_archive, overwrite=overwrite, **archive_kwargs)
# Step 4: Re-write links index with updated titles, icons, and resources
all_links = load_main_index(out_dir=out_dir)
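
A usage sketch for the refactored add(), assuming an initialized ArchiveBox data directory (the path below is a placeholder). Note that it now returns the Snapshot QuerySet from load_main_index() rather than a list of Link objects.

from pathlib import Path
from archivebox.main import add

all_snapshots = add(
    urls=["https://example.com", "https://example.org"],
    depth=0,
    out_dir=Path("/path/to/archivebox/data"),
)
print(all_snapshots.count(), "snapshots in the index")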

View file

@ -14,6 +14,8 @@ from typing import IO, Tuple, List, Optional
from datetime import datetime
from pathlib import Path
from django.db.models import Model
from ..system import atomic_write
from ..config import (
ANSI,
@ -84,7 +86,7 @@ def parse_links_memory(urls: List[str], root_url: Optional[str]=None):
@enforce_types
def parse_links(source_file: str, root_url: Optional[str]=None) -> Tuple[List[Link], str]:
def parse_snapshots(source_file: str, root_url: Optional[str]=None) -> Tuple[List[Model], str]:
"""parse a list of URLs with their metadata from an
RSS feed, bookmarks export, or text file
"""
@ -93,27 +95,27 @@ def parse_links(source_file: str, root_url: Optional[str]=None) -> Tuple[List[Li
timer = TimedProgress(TIMEOUT * 4)
with open(source_file, 'r', encoding='utf-8') as file:
links, parser = run_parser_functions(file, timer, root_url=root_url)
snapshots, parser = run_parser_functions(file, timer, root_url=root_url)
timer.end()
if parser is None:
return [], 'Failed to parse'
return links, parser
return snapshots, parser
def run_parser_functions(to_parse: IO[str], timer, root_url: Optional[str]=None) -> Tuple[List[Link], Optional[str]]:
most_links: List[Link] = []
def run_parser_functions(to_parse: IO[str], timer, root_url: Optional[str]=None) -> Tuple[List[Model], Optional[str]]:
most_snapshots: List[Model] = []
best_parser_name = None
for parser_name, parser_func in PARSERS:
try:
parsed_links = list(parser_func(to_parse, root_url=root_url))
if not parsed_links:
parsed_snapshots = list(parser_func(to_parse, root_url=root_url))
if not parsed_snapshots:
raise Exception('no links found')
# print(f'[√] Parser {parser_name} succeeded: {len(parsed_links)} links parsed')
if len(parsed_links) > len(most_links):
most_links = parsed_links
if len(parsed_snapshots) > len(most_snapshots):
most_snapshots = parsed_snapshots
best_parser_name = parser_name
except Exception as err: # noqa
@ -125,7 +127,7 @@ def run_parser_functions(to_parse: IO[str], timer, root_url: Optional[str]=None)
# raise
pass
timer.end()
return most_links, best_parser_name
return most_snapshots, best_parser_name
@enforce_types
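
A standalone sketch of the selection rule in run_parser_functions above: try every registered parser on the same input and keep the one that yields the most entries.

def best_parse(parsers, text):
    best_name, best_items = None, []
    for name, parse in parsers:
        try:
            items = list(parse(text))
        except Exception:
            continue                                  # a failing parser is simply skipped
        if len(items) > len(best_items):
            best_name, best_items = name, items
    return best_items, best_name

items, name = best_parse(
    [("lines", lambda t: t.splitlines()), ("words", lambda t: t.split())],
    "https://example.com second-token",
)
assert name == "words" and len(items) == 2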

View file

@ -31,6 +31,7 @@ class HrefParser(HTMLParser):
@enforce_types
def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None, **_kwargs) -> Iterable[Link]:
"""Parse Generic HTML for href tags and use only the url (support for title coming later)"""
from core.models import Snapshot
html_file.seek(0)
for line in html_file:
@ -44,10 +45,10 @@ def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None,
url = urljoin(root_url, url)
for archivable_url in re.findall(URL_REGEX, url):
yield Link(
yield Snapshot(
url=htmldecode(archivable_url),
timestamp=str(datetime.now().timestamp()),
title=None,
tags=None,
sources=[html_file.name],
#tags=None,
#sources=[html_file.name],
)

View file

@ -18,6 +18,8 @@ from ..util import (
@enforce_types
def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]:
"""Parse raw links from each line in a text file"""
# TODO: Check if we should add sources list to the database
from core.models import Snapshot
text_file.seek(0)
for line in text_file.readlines():
@ -40,22 +42,22 @@ def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]:
# otherwise look for anything that looks like a URL in the line
for url in re.findall(URL_REGEX, line):
yield Link(
yield Snapshot(
url=htmldecode(url),
timestamp=str(datetime.now().timestamp()),
title=None,
tags=None,
sources=[text_file.name],
#tags=None,
#sources=[text_file.name],
)
# look inside the URL for any sub-urls, e.g. for archive.org links
# https://web.archive.org/web/20200531203453/https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/
# -> https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/
for url in re.findall(URL_REGEX, line[1:]):
yield Link(
yield Snapshot(
url=htmldecode(url),
timestamp=str(datetime.now().timestamp()),
title=None,
tags=None,
sources=[text_file.name],
#tags=None,
#sources=[text_file.name],
)

View file

@ -2,7 +2,7 @@ from typing import List, Union
from pathlib import Path
from importlib import import_module
from django.db.models import QuerySet
from django.db.models import QuerySet, Model
from archivebox.index.schema import Link
from archivebox.util import enforce_types
@ -28,24 +28,22 @@ def import_backend():
return backend
@enforce_types
def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir: Path=OUTPUT_DIR, skip_text_index: bool=False) -> None:
def write_search_index(snapshot: Model, texts: Union[List[str], None]=None, out_dir: Path=OUTPUT_DIR, skip_text_index: bool=False) -> None:
if not indexing_enabled():
return
if not skip_text_index and texts:
from core.models import Snapshot
snap = Snapshot.objects.filter(url=link.url).first()
backend = import_backend()
if snap:
try:
backend.index(snapshot_id=str(snap.id), texts=texts)
except Exception as err:
stderr()
stderr(
f'[X] The search backend threw an exception={err}:',
try:
backend.index(snapshot_id=str(snapshot.id), texts=texts)
except Exception as err:
stderr()
stderr(
f'[X] The search backend threw an exception={err}:',
color='red',
)
)
@enforce_types
def query_search_index(query: str, out_dir: Path=OUTPUT_DIR) -> QuerySet: