From 8c4ae73d657055fcdef5da44461e8b126d9f0589 Mon Sep 17 00:00:00 2001 From: Cristian Date: Wed, 23 Dec 2020 14:51:42 -0500 Subject: [PATCH] refactor: Initial and dirty refactor to replace link with snapshot. Barely functional add command --- archivebox/core/admin.py | 2 +- archivebox/core/models.py | 23 ++++- archivebox/extractors/__init__.py | 84 +++++++--------- archivebox/extractors/title.py | 23 +++-- archivebox/index/__init__.py | 151 +++++++++++++---------------- archivebox/index/html.py | 5 +- archivebox/index/json.py | 31 ++++-- archivebox/index/sql.py | 46 ++++----- archivebox/main.py | 51 +++++----- archivebox/parsers/__init__.py | 22 +++-- archivebox/parsers/generic_html.py | 7 +- archivebox/parsers/generic_txt.py | 14 +-- archivebox/search/__init__.py | 20 ++-- 13 files changed, 246 insertions(+), 233 deletions(-) diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py index 832bea38..4eda8b59 100644 --- a/archivebox/core/admin.py +++ b/archivebox/core/admin.py @@ -21,7 +21,7 @@ from util import htmldecode, urldecode, ansi_to_html from logging_util import printable_filesize from main import add, remove from config import OUTPUT_DIR -from extractors import archive_links +from extractors import archive_snapshots # TODO: https://stackoverflow.com/questions/40760880/add-custom-button-to-django-admin-panel diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 13d75b66..1f799156 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -1,6 +1,7 @@ __package__ = 'archivebox.core' import uuid +from pathlib import Path from django.db import models, transaction from django.utils.functional import cached_property @@ -9,9 +10,10 @@ from django.db.models import Case, When, Value, IntegerField from ..util import parse_date from ..index.schema import Link -from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE +from ..config import CONFIG -EXTRACTORS = [(extractor[0], extractor[0]) for extractor in get_default_archive_methods()] +#EXTRACTORS = [(extractor[0], extractor[0]) for extractor in get_default_archive_methods()] +EXTRACTORS = ["title", "wget"] STATUS_CHOICES = [ ("succeeded", "succeeded"), ("failed", "failed"), @@ -89,6 +91,7 @@ class Snapshot(models.Model): title = self.title or '-' return f'[{self.timestamp}] {self.url[:64]} ({title[:64]})' + @classmethod def from_json(cls, info: dict): info = {k: v for k, v in info.items() if k in cls.keys} @@ -133,8 +136,9 @@ class Snapshot(models.Model): return self.as_link().base_url @cached_property - def link_dir(self): - return self.as_link().link_dir + def snapshot_dir(self): + from ..config import CONFIG + return str(Path(CONFIG['ARCHIVE_DIR']) / self.timestamp) @cached_property def archive_path(self): @@ -158,6 +162,16 @@ class Snapshot(models.Model): return self.history['title'][-1].output.strip() return None + def _asdict(self): + return { + "id": str(self.id), + "url": self.url, + "timestamp": self.timestamp, + "title": self.title, + "added": self.added, + "updated": self.updated, + } + def save_tags(self, tags=()): tags_id = [] for tag in tags: @@ -168,6 +182,7 @@ class Snapshot(models.Model): class ArchiveResultManager(models.Manager): def indexable(self, sorted: bool = True): + from ..extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE INDEXABLE_METHODS = [ r[0] for r in ARCHIVE_METHODS_INDEXING_PRECEDENCE ] qs = self.get_queryset().filter(extractor__in=INDEXABLE_METHODS,status='succeeded') diff --git a/archivebox/extractors/__init__.py 
b/archivebox/extractors/__init__.py index a4acef0b..120d116a 100644 --- a/archivebox/extractors/__init__.py +++ b/archivebox/extractors/__init__.py @@ -4,19 +4,20 @@ import os from pathlib import Path from typing import Optional, List, Iterable, Union -from datetime import datetime -from django.db.models import QuerySet -from ..index.schema import Link -from ..index.sql import write_link_to_sql_index +from datetime import datetime +from django.db.models import QuerySet, Model + +from ..index.sql import write_snapshot_to_index from ..index import ( - load_link_details, - write_link_details, + load_snapshot_details, + write_snapshot_details, ) from ..util import enforce_types from ..logging_util import ( log_archiving_started, log_archiving_paused, + log_archiving_finished, log_link_archiving_started, log_link_archiving_finished, @@ -67,15 +68,9 @@ def ignore_methods(to_ignore: List[str]): return list(methods) @enforce_types -def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> Link: +def archive_snapshot(snapshot: Model, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> Model: """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp""" - - # TODO: Remove when the input is changed to be a snapshot. Suboptimal approach. - from core.models import Snapshot, ArchiveResult - try: - snapshot = Snapshot.objects.get(url=link.url) # TODO: This will be unnecessary once everything is a snapshot - except Snapshot.DoesNotExist: - snapshot = write_link_to_sql_index(link) + from core.models import ArchiveResult ARCHIVE_METHODS = get_default_archive_methods() @@ -85,33 +80,34 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s if method[0] in methods ] - out_dir = out_dir or Path(link.link_dir) + out_dir = out_dir or Path(snapshot.snapshot_dir) try: is_new = not Path(out_dir).exists() if is_new: os.makedirs(out_dir) + details = {"history": {}} + write_snapshot_details(snapshot, out_dir=out_dir, skip_sql_index=False) + else: + details = list(load_snapshot_details(snapshot)) - link = load_link_details(link, out_dir=out_dir) - write_link_details(link, out_dir=out_dir, skip_sql_index=False) - log_link_archiving_started(link, out_dir, is_new) - link = link.overwrite(updated=datetime.now()) + #log_link_archiving_started(link, out_dir, is_new) stats = {'skipped': 0, 'succeeded': 0, 'failed': 0} for method_name, should_run, method_function in ARCHIVE_METHODS: try: - if method_name not in link.history: - link.history[method_name] = [] + if method_name not in details["history"]: + details["history"][method_name] = [] - if should_run(link, out_dir) or overwrite: + if should_run(snapshot, out_dir) or overwrite: log_archive_method_started(method_name) - result = method_function(link=link, out_dir=out_dir) + result = method_function(snapshot=snapshot, out_dir=out_dir) - link.history[method_name].append(result) + details["history"][method_name].append(result) stats[result.status] += 1 log_archive_method_finished(result) - write_search_index(link=link, texts=result.index_texts) + write_search_index(snapshot=snapshot, texts=result.index_texts) ArchiveResult.objects.create(snapshot=snapshot, extractor=method_name, cmd=result.cmd, cmd_version=result.cmd_version, output=result.output, pwd=result.pwd, start_ts=result.start_ts, end_ts=result.end_ts, status=result.status) @@ -121,7 +117,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: 
Optional[Iterable[s except Exception as e: raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format( method_name, - link.url, + snapshot.url, )) from e # print(' ', stats) @@ -129,17 +125,17 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s try: latest_title = link.history['title'][-1].output.strip() if latest_title and len(latest_title) >= len(link.title or ''): - link = link.overwrite(title=latest_title) + snapshot.title = latest_title except Exception: pass - write_link_details(link, out_dir=out_dir, skip_sql_index=False) + write_snapshot_details(snapshot, out_dir=out_dir, skip_sql_index=False) - log_link_archiving_finished(link, link.link_dir, is_new, stats) + log_link_archiving_finished(snapshot, snapshot.snapshot_dir, is_new, stats) except KeyboardInterrupt: try: - write_link_details(link, out_dir=link.link_dir) + write_snapshot_details(snapshot, out_dir=snapshot.snapshot_dir) except: pass raise @@ -148,35 +144,29 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err)) raise - return link + return snapshot @enforce_types -def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> List[Link]: +def archive_snapshots(all_snapshots: Union[QuerySet, List[Model]], overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> QuerySet: - if type(all_links) is QuerySet: - num_links: int = all_links.count() - get_link = lambda x: x.as_link() - all_links = all_links.iterator() - else: - num_links: int = len(all_links) - get_link = lambda x: x + all_snapshots = list(all_snapshots) + num_snapshots: int = len(all_snapshots) - if num_links == 0: + if num_snapshots == 0: return [] - log_archiving_started(num_links) + log_archiving_started(num_snapshots) idx: int = 0 try: - for link in all_links: + for snapshot in all_snapshots: idx += 1 - to_archive = get_link(link) - archive_link(to_archive, overwrite=overwrite, methods=methods, out_dir=Path(link.link_dir)) + archive_snapshot(snapshot, overwrite=overwrite, methods=methods, out_dir=Path(snapshot.snapshot_dir)) except KeyboardInterrupt: - log_archiving_paused(num_links, idx, link.timestamp) + log_archiving_paused(num_snapshots, idx, snapshot.timestamp) raise SystemExit(0) except BaseException: print() raise - log_archiving_finished(num_links) - return all_links + log_archiving_finished(num_snapshots) + return all_snapshots diff --git a/archivebox/extractors/title.py index 28cb128f..519c5961 100644 --- a/archivebox/extractors/title.py +++ b/archivebox/extractors/title.py @@ -5,7 +5,9 @@ from html.parser import HTMLParser from pathlib import Path from typing import Optional -from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError +from django.db.models import Model + +from ..index.schema import ArchiveResult, ArchiveOutput, ArchiveError from ..util import ( enforce_types, is_static_file, @@ -61,12 +63,12 @@ class TitleParser(HTMLParser): @enforce_types -def should_save_title(link: Link, out_dir: Optional[str]=None) -> bool: # if link already has valid title, skip it - if link.title and not link.title.lower().startswith('http'): +def should_save_title(snapshot: Model, out_dir: Optional[str]=None) -> bool: # if link already has valid title, skip it + if snapshot.title and not snapshot.title.lower().startswith('http'): return False - if is_static_file(link.url): + if
is_static_file(snapshot.url): return False return SAVE_TITLE @@ -77,7 +79,7 @@ def extract_title_with_regex(html): return output @enforce_types -def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult: +def save_title(snapshot: Model, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult: """try to guess the page's title from its content""" from core.models import Snapshot @@ -89,12 +91,12 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) - '--max-time', str(timeout), *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []), *([] if CHECK_SSL_VALIDITY else ['--insecure']), - link.url, + snapshot.url, ] status = 'succeeded' timer = TimedProgress(timeout, prefix=' ') try: - html = download_url(link.url, timeout=timeout) + html = download_url(snapshot.url, timeout=timeout) try: # try using relatively strict html parser first parser = TitleParser() @@ -108,10 +110,11 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) - # if title is better than the one in the db, update db with new title if isinstance(output, str) and output: - if not link.title or len(output) >= len(link.title): - Snapshot.objects.filter(url=link.url, - timestamp=link.timestamp)\ + if not snapshot.title or len(output) >= len(snapshot.title): + Snapshot.objects.filter(url=snapshot.url, + timestamp=snapshot.timestamp)\ .update(title=output) + snapshot.title = output else: raise ArchiveError('Unable to detect page title') except Exception as err: diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py index 8eab1d38..32af7c1d 100644 --- a/archivebox/index/__init__.py +++ b/archivebox/index/__init__.py @@ -10,7 +10,7 @@ from typing import List, Tuple, Dict, Optional, Iterable from collections import OrderedDict from contextlib import contextmanager from urllib.parse import urlparse -from django.db.models import QuerySet, Q +from django.db.models import QuerySet, Q, Model from ..util import ( scheme, @@ -39,15 +39,15 @@ from ..logging_util import ( from .schema import Link, ArchiveResult from .html import ( - write_html_link_details, + write_html_snapshot_details, ) from .json import ( - parse_json_link_details, - write_json_link_details, + parse_json_snapshot_details, + write_json_snapshot_details, ) from .sql import ( write_sql_main_index, - write_sql_link_details, + write_sql_snapshot_details, ) from ..search import search_backend_enabled, query_search_index @@ -55,10 +55,12 @@ from ..search import search_backend_enabled, query_search_index ### Link filtering and checking @enforce_types -def merge_links(a: Link, b: Link) -> Link: - """deterministially merge two links, favoring longer field values over shorter, +def merge_snapshots(a: Model, b: Model) -> Model: + """deterministially merge two snapshots, favoring longer field values over shorter, and "cleaner" values over worse ones. 
+ TODO: Check if this makes sense with the new setup """ + return a assert a.base_url == b.base_url, f'Cannot merge two links with different URLs ({a.base_url} != {b.base_url})' # longest url wins (because a fuzzy url will always be shorter) @@ -109,55 +111,55 @@ def merge_links(a: Link, b: Link) -> Link: key=lambda result: result.start_ts, ))) - return Link( + return Snapshot( url=url, timestamp=timestamp, title=title, tags=tags, - sources=sources, - history=history, + #sources=sources, + #history=history, ) @enforce_types -def validate_links(links: Iterable[Link]) -> List[Link]: +def validate_snapshots(snapshots: List[Model]) -> List[Model]: timer = TimedProgress(TIMEOUT * 4) try: - links = archivable_links(links) # remove chrome://, about:, mailto: etc. - links = sorted_links(links) # deterministically sort the links based on timestamp, url - links = fix_duplicate_links(links) # merge/dedupe duplicate timestamps & urls + snapshots = archivable_snapshots(snapshots) # remove chrome://, about:, mailto: etc. + snapshots = sorted_snapshots(snapshots) # deterministically sort the links based on timestamp, url + snapshots = fix_duplicate_snapshots(snapshots) # merge/dedupe duplicate timestamps & urls finally: timer.end() - return list(links) + return list(snapshots) @enforce_types -def archivable_links(links: Iterable[Link]) -> Iterable[Link]: +def archivable_snapshots(snapshots: Iterable[Model]) -> Iterable[Model]: """remove chrome://, about:// or other schemed links that cant be archived""" - for link in links: + for snapshot in snapshots: try: - urlparse(link.url) + urlparse(snapshot.url) except ValueError: continue - if scheme(link.url) not in ('http', 'https', 'ftp'): + if scheme(snapshot.url) not in ('http', 'https', 'ftp'): continue - if URL_BLACKLIST_PTN and URL_BLACKLIST_PTN.search(link.url): + if URL_BLACKLIST_PTN and URL_BLACKLIST_PTN.search(snapshot.url): continue - yield link + yield snapshot @enforce_types -def fix_duplicate_links(sorted_links: Iterable[Link]) -> Iterable[Link]: +def fix_duplicate_snapshots(sorted_snapshots: Iterable[Model]) -> Iterable[Model]: """ ensures that all non-duplicate links have monotonically increasing timestamps + TODO: Review how to do this with the new snapshots refactor """ - # from core.models import Snapshot - + return sorted_snapshots unique_urls: OrderedDict[str, Link] = OrderedDict() - for link in sorted_links: - if link.url in unique_urls: + for snapshot in sorted_snapshots: + if snapshot.url in unique_urls: # merge with any other links that share the same url link = merge_links(unique_urls[link.url], link) unique_urls[link.url] = link @@ -166,9 +168,9 @@ def fix_duplicate_links(sorted_links: Iterable[Link]) -> Iterable[Link]: @enforce_types -def sorted_links(links: Iterable[Link]) -> Iterable[Link]: - sort_func = lambda link: (link.timestamp.split('.', 1)[0], link.url) - return sorted(links, key=sort_func, reverse=True) +def sorted_snapshots(snapshots: Iterable[Model]) -> Iterable[Model]: + sort_func = lambda snapshot: (snapshot.timestamp.split('.', 1)[0], snapshot.url) + return sorted(snapshots, key=sort_func, reverse=True) @enforce_types @@ -222,14 +224,14 @@ def timed_index_update(out_path: Path): @enforce_types -def write_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None: +def write_main_index(snapshots: List[Model], out_dir: Path=OUTPUT_DIR) -> None: """Writes links to sqlite3 file for a given list of links""" - log_indexing_process_started(len(links)) + log_indexing_process_started(len(snapshots)) try: with 
timed_index_update(out_dir / SQL_INDEX_FILENAME): - write_sql_main_index(links, out_dir=out_dir) + write_sql_main_index(snapshots, out_dir=out_dir) os.chmod(out_dir / SQL_INDEX_FILENAME, int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes except (KeyboardInterrupt, SystemExit): @@ -244,7 +246,10 @@ def write_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None: @enforce_types def load_main_index(out_dir: Path=OUTPUT_DIR, warn: bool=True) -> List[Link]: - """parse and load existing index with any new links from import_path merged in""" + """ + Returns all of the snapshots currently in index + """ + setup_django(out_dir, check_db=True) from core.models import Snapshot try: return Snapshot.objects.all() @@ -265,88 +270,62 @@ def load_main_index_meta(out_dir: Path=OUTPUT_DIR) -> Optional[dict]: @enforce_types -def parse_links_from_source(source_path: str, root_url: Optional[str]=None) -> Tuple[List[Link], List[Link]]: +def parse_snapshots_from_source(source_path: str, root_url: Optional[str]=None) -> List[Model]: - from ..parsers import parse_links + from ..parsers import parse_snapshots - new_links: List[Link] = [] + new_links: List[Model] = [] # parse and validate the import file - raw_links, parser_name = parse_links(source_path, root_url=root_url) - new_links = validate_links(raw_links) + raw_snapshots, parser_name = parse_snapshots(source_path, root_url=root_url) + new_snapshots = validate_snapshots(raw_snapshots) if parser_name: - num_parsed = len(raw_links) + num_parsed = len(raw_snapshots) log_parsing_finished(num_parsed, parser_name) - return new_links + return new_snapshots @enforce_types -def fix_duplicate_links_in_index(snapshots: QuerySet, links: Iterable[Link]) -> Iterable[Link]: +def filter_new_urls(snapshots: QuerySet, + new_snapshots: List) -> List: """ - Given a list of in-memory Links, dedupe and merge them with any conflicting Snapshots in the DB. + Returns a list of Snapshots corresponding to the urls that were not present in the index """ - unique_urls: OrderedDict[str, Link] = OrderedDict() + urls = {snapshot.url: snapshot for snapshot in new_snapshots} + filtered_snapshots = snapshots.filter(url__in=urls.keys()) - for link in links: - index_link = snapshots.filter(url=link.url) - if index_link: - link = merge_links(index_link[0].as_link(), link) - - unique_urls[link.url] = link - - return unique_urls.values() - -@enforce_types -def dedupe_links(snapshots: QuerySet, - new_links: List[Link]) -> List[Link]: - """ - The validation of links happened at a different stage. This method will - focus on actual deduplication and timestamp fixing. 
- """ + for found_snapshot in filtered_snapshots: + urls.pop(found_snapshot.url) - # merge existing links in out_dir and new links - dedup_links = fix_duplicate_links_in_index(snapshots, new_links) + log_deduping_finished(len(urls.keys())) - new_links = [ - link for link in new_links - if not snapshots.filter(url=link.url).exists() - ] - - dedup_links_dict = {link.url: link for link in dedup_links} - - # Replace links in new_links with the dedup version - for i in range(len(new_links)): - if new_links[i].url in dedup_links_dict.keys(): - new_links[i] = dedup_links_dict[new_links[i].url] - log_deduping_finished(len(new_links)) - - return new_links + return list(urls.values()) ### Link Details Index @enforce_types -def write_link_details(link: Link, out_dir: Optional[str]=None, skip_sql_index: bool=False) -> None: - out_dir = out_dir or link.link_dir +def write_snapshot_details(snapshot: List[Model], out_dir: Optional[str]=None, skip_sql_index: bool=False) -> None: + out_dir = out_dir or snapshot.snapshot_dir - write_json_link_details(link, out_dir=out_dir) - write_html_link_details(link, out_dir=out_dir) + write_json_snapshot_details(snapshot, out_dir=out_dir) + #write_html_snapshot_details(snapshot, out_dir=out_dir) TODO: Refactor html code too if not skip_sql_index: - write_sql_link_details(link) + write_sql_snapshot_details(snapshot) @enforce_types -def load_link_details(link: Link, out_dir: Optional[str]=None) -> Link: +def load_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) -> Model: """check for an existing link archive in the given directory, and load+merge it into the given link dict """ - out_dir = out_dir or link.link_dir + out_dir = out_dir or snapshot.snapshot_dir - existing_link = parse_json_link_details(out_dir) - if existing_link: - return merge_links(existing_link, link) + existing_snapshot = parse_json_snapshot_details(out_dir) + if existing_snapshot: + return merge_snapshots(existing_snapshot, snapshot) - return link + return snapshot diff --git a/archivebox/index/html.py b/archivebox/index/html.py index a62e2c7e..d1bd5ee2 100644 --- a/archivebox/index/html.py +++ b/archivebox/index/html.py @@ -5,6 +5,7 @@ from typing import List, Optional, Iterator, Mapping from pathlib import Path from django.utils.html import format_html +from django.db.models import Model from collections import defaultdict from .schema import Link @@ -71,8 +72,8 @@ def main_index_template(links: List[Link], template: str=MAIN_INDEX_TEMPLATE) -> ### Link Details Index @enforce_types -def write_html_link_details(link: Link, out_dir: Optional[str]=None) -> None: - out_dir = out_dir or link.link_dir +def write_html_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) -> None: + out_dir = out_dir or snapshot.snapshot_dir rendered_html = link_details_template(link) atomic_write(str(Path(out_dir) / HTML_INDEX_FILENAME), rendered_html) diff --git a/archivebox/index/json.py b/archivebox/index/json.py index f24b969f..ed4c255d 100644 --- a/archivebox/index/json.py +++ b/archivebox/index/json.py @@ -7,6 +7,7 @@ from pathlib import Path from datetime import datetime from typing import List, Optional, Iterator, Any, Union +from django.db.models import Model from .schema import Link from ..system import atomic_write @@ -81,16 +82,17 @@ def parse_json_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]: ### Link Details Index @enforce_types -def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None: - """write a json file with some info about the link""" +def 
write_json_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) -> None: + """write a json file with some info about the snapshot""" - out_dir = out_dir or link.link_dir + out_dir = out_dir or snapshot.snapshot_dir path = Path(out_dir) / JSON_INDEX_FILENAME - atomic_write(str(path), link._asdict(extended=True)) + print(snapshot._asdict()) + atomic_write(str(path), snapshot._asdict()) @enforce_types -def parse_json_link_details(out_dir: Union[Path, str], guess: Optional[bool]=False) -> Optional[Link]: +def parse_json_link_details(out_dir: Union[Path, str], guess: Optional[bool]=False) -> Optional[Model]: """load the json link index from a given directory""" existing_index = Path(out_dir) / JSON_INDEX_FILENAME if existing_index.exists(): @@ -102,16 +104,31 @@ def parse_json_link_details(out_dir: Union[Path, str], guess: Optional[bool]=Fal pass return None +@enforce_types +def load_snapshot_details(snapshot: Model, out_dir: Path): + """ + Loads the detail from the local json index + """ + existing_index = Path(out_dir) / JSON_INDEX_FILENAME + if existing_index.exists(): + with open(existing_index, 'r', encoding='utf-8') as f: + try: + return pyjson.load(f) + except pyjson.JSONDecodeError: + pass + return None + + @enforce_types -def parse_json_links_details(out_dir: Union[Path, str]) -> Iterator[Link]: +def parse_json_snapshot_details(out_dir: Union[Path, str]) -> Iterator[Link]: """read through all the archive data folders and return the parsed links""" for entry in os.scandir(Path(out_dir) / ARCHIVE_DIR_NAME): if entry.is_dir(follow_symlinks=True): if (Path(entry.path) / 'index.json').exists(): try: - link = parse_json_link_details(entry.path) + link = parse_json_snapshot_details(entry.path) except KeyError: link = None if link: diff --git a/archivebox/index/sql.py b/archivebox/index/sql.py index 1e99f67c..d32a1468 100644 --- a/archivebox/index/sql.py +++ b/archivebox/index/sql.py @@ -3,8 +3,9 @@ __package__ = 'archivebox.index' from io import StringIO from pathlib import Path from typing import List, Tuple, Iterator -from django.db.models import QuerySet +from django.db.models import QuerySet, Model from django.db import transaction +from datetime import datetime from .schema import Link from ..util import enforce_types @@ -28,21 +29,20 @@ def remove_from_sql_main_index(snapshots: QuerySet, out_dir: Path=OUTPUT_DIR) -> snapshots.delete() @enforce_types -def write_link_to_sql_index(link: Link): +def write_snapshot_to_index(snapshot: Model): from core.models import Snapshot - info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys} - tags = info.pop("tags") - if tags is None: - tags = [] - try: - info["timestamp"] = Snapshot.objects.get(url=link.url).timestamp + timestamp = Snapshot.objects.get(url=snapshot.url).timestamp except Snapshot.DoesNotExist: - while Snapshot.objects.filter(timestamp=info["timestamp"]).exists(): - info["timestamp"] = str(float(info["timestamp"]) + 1.0) + timestamp = snapshot.timestamp + if not timestamp: + timestamp = str(datetime.now().timestamp()) + while Snapshot.objects.filter(timestamp=timestamp).exists(): + print("the timestamp is: ", timestamp) + timestamp = str(float(timestamp) + 1.0) - snapshot, _ = Snapshot.objects.update_or_create(url=link.url, defaults=info) - snapshot.save_tags(tags) + snapshot.timestamp = timestamp + snapshot.save() return snapshot @@ -50,27 +50,29 @@ def write_link_to_sql_index(link: Link): def write_sql_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None: with transaction.atomic(): for link in 
links: - write_link_to_sql_index(link) + write_snapshot_to_index(link) @enforce_types -def write_sql_link_details(link: Link, out_dir: Path=OUTPUT_DIR) -> None: +def write_sql_snapshot_details(snapshot: Model, out_dir: Path=OUTPUT_DIR) -> None: from core.models import Snapshot with transaction.atomic(): try: - snap = Snapshot.objects.get(url=link.url) + snap = Snapshot.objects.get(url=snapshot.url) except Snapshot.DoesNotExist: - snap = write_link_to_sql_index(link) + snap = write_snapshot_to_index(snapshot) - snap.title = link.title + snap.title = snapshot.title - tag_set = ( - set(tag.strip() for tag in (link.tags or '').split(',')) - ) - tag_list = list(tag_set) or [] + # TODO: If there are actual tags, this will break + #tag_set = ( + # set(tag.strip() for tag in (snapshot.tags.all() or '').split(',')) + #) + #tag_list = list(tag_set) or [] snap.save() - snap.save_tags(tag_list) + #snap.save_tags(tag_list) + return snap diff --git a/archivebox/main.py index eb8cd6a0..71147f59 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -29,8 +29,9 @@ from .util import enforce_types # type: ignore from .system import get_dir_size, dedupe_cron_jobs, CRON_COMMENT from .index import ( load_main_index, - parse_links_from_source, - dedupe_links, + get_empty_snapshot_queryset, + parse_snapshots_from_source, + filter_new_urls, write_main_index, snapshot_filter, get_indexed_folders, @@ -44,11 +45,11 @@ from .index import ( get_corrupted_folders, get_unrecognized_folders, fix_invalid_folder_locations, - write_link_details, + write_snapshot_details, ) from .index.json import ( parse_json_main_index, - parse_json_links_details, + parse_json_snapshot_details, generate_json_index_from_links, ) from .index.sql import ( @@ -60,7 +61,7 @@ from .index.html import ( generate_index_from_links, ) from .index.csv import links_to_csv -from .extractors import archive_links, archive_link, ignore_methods +from .extractors import archive_snapshots, archive_snapshot, ignore_methods from .config import ( stderr, hint, @@ -538,6 +539,7 @@ def add(urls: Union[str, List[str]], extractors: str="", out_dir: Path=OUTPUT_DIR) -> List[Link]: """Add a new URL or list of URLs to your archive""" + from core.models import Snapshot assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)' @@ -549,8 +551,8 @@ def add(urls: Union[str, List[str]], # Load list of links from the existing index check_data_folder(out_dir=out_dir) check_dependencies() - new_links: List[Link] = [] - all_links = load_main_index(out_dir=out_dir) + new_snapshots: List[Snapshot] = [] + all_snapshots = load_main_index(out_dir=out_dir) log_importing_started(urls=urls, depth=depth, index_only=index_only) if isinstance(urls, str): @@ -560,20 +562,21 @@ def add(urls: Union[str, List[str]], # save verbatim args to sources write_ahead_log = save_text_as_source('\n'.join(urls), filename='{ts}-import.txt', out_dir=out_dir) - new_links += parse_links_from_source(write_ahead_log, root_url=None) + new_snapshots += parse_snapshots_from_source(write_ahead_log, root_url=None) # If we're going one level deeper, download each link and look for more links - new_links_depth = [] - if new_links and depth == 1: - log_crawl_started(new_links) - for new_link in new_links: - downloaded_file = save_file_as_source(new_link.url, filename=f'{new_link.timestamp}-crawl-{new_link.domain}.txt', out_dir=out_dir) - new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url) + new_snapshots_depth = [] + if new_snapshots and
depth == 1: + log_crawl_started(new_snapshots) + for new_snapshot in new_snapshots: + # TODO: Check if we need to add domain to the Snapshot model + downloaded_file = save_file_as_source(new_snapshot.url, filename=f'{new_snapshot.timestamp}-crawl-{new_snapshot.url}.txt', out_dir=out_dir) + new_snapshots_depth += parse_snapshots_from_source(downloaded_file, root_url=new_snapshot.url) - imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values()) - new_links = dedupe_links(all_links, imported_links) + imported_snapshots = [Snapshot(url=snapshot.url) for snapshot in new_snapshots + new_snapshots_depth] + new_snapshots = filter_new_urls(all_snapshots, imported_snapshots) - write_main_index(links=new_links, out_dir=out_dir) + write_main_index(snapshots=new_snapshots, out_dir=out_dir) all_links = load_main_index(out_dir=out_dir) if index_only: @@ -586,13 +589,13 @@ def add(urls: Union[str, List[str]], if extractors: archive_kwargs["methods"] = extractors if update_all: - archive_links(all_links, overwrite=overwrite, **archive_kwargs) + archive_snapshots(all_snapshots, overwrite=overwrite, **archive_kwargs) elif overwrite: - archive_links(imported_links, overwrite=True, **archive_kwargs) + archive_snapshots(imported_snapshots, overwrite=True, **archive_kwargs) - elif new_links: - archive_links(new_links, overwrite=False, **archive_kwargs) + elif new_snapshots: + archive_snapshots(new_snapshots, overwrite=False, **archive_kwargs) - return all_links + return all_snapshots @enforce_types def remove(filter_str: Optional[str]=None, @@ -711,7 +714,7 @@ def update(resume: Optional[float]=None, if index_only: for link in all_links: - write_link_details(link, out_dir=out_dir, skip_sql_index=True) + write_snapshot_details(link, out_dir=out_dir, skip_sql_index=True) index_links(all_links, out_dir=out_dir) return all_links @@ -733,7 +736,7 @@ def update(resume: Optional[float]=None, if extractors: archive_kwargs["methods"] = extractors - archive_links(to_archive, overwrite=overwrite, **archive_kwargs) + archive_snapshots(to_archive, overwrite=overwrite, **archive_kwargs) # Step 4: Re-write links index with updated titles, icons, and resources all_links = load_main_index(out_dir=out_dir) diff --git a/archivebox/parsers/__init__.py index 441c08ac..8b10d794 100644 --- a/archivebox/parsers/__init__.py +++ b/archivebox/parsers/__init__.py @@ -14,6 +14,8 @@ from typing import IO, Tuple, List, Optional from datetime import datetime from pathlib import Path +from django.db.models import Model + from ..system import atomic_write from ..config import ( ANSI, @@ -84,7 +86,7 @@ def parse_links_memory(urls: List[str], root_url: Optional[str]=None): @enforce_types -def parse_links(source_file: str, root_url: Optional[str]=None) -> Tuple[List[Link], str]: +def parse_snapshots(source_file: str, root_url: Optional[str]=None) -> Tuple[List[Model], str]: """parse a list of URLs with their metadata from an RSS feed, bookmarks export, or text file """ @@ -93,27 +95,27 @@ def parse_links(source_file: str, root_url: Optional[str]=None) -> Tuple[List[Li timer = TimedProgress(TIMEOUT * 4) with open(source_file, 'r', encoding='utf-8') as file: - links, parser = run_parser_functions(file, timer, root_url=root_url) + snapshots, parser = run_parser_functions(file, timer, root_url=root_url) timer.end() if parser is None: return [], 'Failed to parse' - return links, parser + return snapshots, parser -def run_parser_functions(to_parse: IO[str], timer, root_url: Optional[str]=None)
-> Tuple[List[Link], Optional[str]]: - most_links: List[Link] = [] +def run_parser_functions(to_parse: IO[str], timer, root_url: Optional[str]=None) -> Tuple[List[Model], Optional[str]]: + most_snapshots: List[Model] = [] best_parser_name = None for parser_name, parser_func in PARSERS: try: - parsed_links = list(parser_func(to_parse, root_url=root_url)) - if not parsed_links: + parsed_snapshots = list(parser_func(to_parse, root_url=root_url)) + if not parsed_snapshots: raise Exception('no links found') # print(f'[√] Parser {parser_name} succeeded: {len(parsed_links)} links parsed') - if len(parsed_links) > len(most_links): - most_links = parsed_links + if len(parsed_snapshots) > len(most_snapshots): + most_snapshots = parsed_snapshots best_parser_name = parser_name except Exception as err: # noqa @@ -125,7 +127,7 @@ def run_parser_functions(to_parse: IO[str], timer, root_url: Optional[str]=None) # raise pass timer.end() - return most_links, best_parser_name + return most_snapshots, best_parser_name @enforce_types diff --git a/archivebox/parsers/generic_html.py b/archivebox/parsers/generic_html.py index 74b3d1fc..99aebf5c 100644 --- a/archivebox/parsers/generic_html.py +++ b/archivebox/parsers/generic_html.py @@ -31,6 +31,7 @@ class HrefParser(HTMLParser): @enforce_types def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None, **_kwargs) -> Iterable[Link]: """Parse Generic HTML for href tags and use only the url (support for title coming later)""" + from core.models import Snapshot html_file.seek(0) for line in html_file: @@ -44,10 +45,10 @@ def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None, url = urljoin(root_url, url) for archivable_url in re.findall(URL_REGEX, url): - yield Link( + yield Snapshot( url=htmldecode(archivable_url), timestamp=str(datetime.now().timestamp()), title=None, - tags=None, - sources=[html_file.name], + #tags=None, + #sources=[html_file.name], ) diff --git a/archivebox/parsers/generic_txt.py b/archivebox/parsers/generic_txt.py index e296ec7e..616f226f 100644 --- a/archivebox/parsers/generic_txt.py +++ b/archivebox/parsers/generic_txt.py @@ -18,6 +18,8 @@ from ..util import ( @enforce_types def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]: """Parse raw links from each line in a text file""" + # TODO: Check if we should add sources list to the database + from core.models import Snapshot text_file.seek(0) for line in text_file.readlines(): @@ -40,22 +42,22 @@ def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]: # otherwise look for anything that looks like a URL in the line for url in re.findall(URL_REGEX, line): - yield Link( + yield Snapshot( url=htmldecode(url), timestamp=str(datetime.now().timestamp()), title=None, - tags=None, - sources=[text_file.name], + #tags=None, + #sources=[text_file.name], ) # look inside the URL for any sub-urls, e.g. 
for archive.org links # https://web.archive.org/web/20200531203453/https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/ # -> https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/ for url in re.findall(URL_REGEX, line[1:]): - yield Link( + yield Snapshot( url=htmldecode(url), timestamp=str(datetime.now().timestamp()), title=None, - tags=None, - sources=[text_file.name], + #tags=None, + #sources=[text_file.name], ) diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py index 6191ede9..d958f324 100644 --- a/archivebox/search/__init__.py +++ b/archivebox/search/__init__.py @@ -2,7 +2,7 @@ from typing import List, Union from pathlib import Path from importlib import import_module -from django.db.models import QuerySet +from django.db.models import QuerySet, Model from archivebox.index.schema import Link from archivebox.util import enforce_types @@ -28,24 +28,22 @@ def import_backend(): return backend @enforce_types -def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir: Path=OUTPUT_DIR, skip_text_index: bool=False) -> None: +def write_search_index(snapshot: Model, texts: Union[List[str], None]=None, out_dir: Path=OUTPUT_DIR, skip_text_index: bool=False) -> None: if not indexing_enabled(): return if not skip_text_index and texts: from core.models import Snapshot - snap = Snapshot.objects.filter(url=link.url).first() backend = import_backend() - if snap: - try: - backend.index(snapshot_id=str(snap.id), texts=texts) - except Exception as err: - stderr() - stderr( - f'[X] The search backend threw an exception={err}:', + try: + backend.index(snapshot_id=str(snapshot.id), texts=texts) + except Exception as err: + stderr() + stderr( + f'[X] The search backend threw an exception={err}:', color='red', - ) + ) @enforce_types def query_search_index(query: str, out_dir: Path=OUTPUT_DIR) -> QuerySet:
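
For context, the sketch below shows how the snapshot-based pipeline introduced by this patch is meant to be driven end to end, using only names the patch itself defines (load_main_index, parse_snapshots_from_source, filter_new_urls, write_main_index, archive_snapshots, and Snapshot.snapshot_dir). It is a simplified illustration of the new add() flow in main.py rather than a drop-in replacement for it, and it assumes an already-initialized ArchiveBox collection at OUTPUT_DIR with this refactor applied.

# Rough usage sketch of the refactored snapshot pipeline (assumes this patch is applied).
from pathlib import Path

from archivebox.config import OUTPUT_DIR
from archivebox.extractors import archive_snapshots
from archivebox.index import (
    load_main_index,              # QuerySet of Snapshots already in the SQLite index
    parse_snapshots_from_source,  # parses a saved source file into unsaved Snapshot objects
    filter_new_urls,              # drops Snapshots whose URL is already indexed
    write_main_index,             # persists the new Snapshots (assigns their timestamps)
)

def add_urls_sketch(source_path: str) -> None:
    all_snapshots = load_main_index(out_dir=Path(OUTPUT_DIR))
    imported = parse_snapshots_from_source(source_path, root_url=None)
    new_snapshots = filter_new_urls(all_snapshots, imported)

    # Write the index rows first: Snapshot.snapshot_dir is derived from the
    # timestamp that write_snapshot_to_index() assigns, so the extractors can
    # only know where to write their output after the row exists.
    write_main_index(snapshots=new_snapshots, out_dir=Path(OUTPUT_DIR))
    archive_snapshots(new_snapshots, overwrite=False)

The ordering mirrors the patched add(): write_main_index() runs before archive_snapshots() because each Snapshot's on-disk folder (ARCHIVE_DIR/<timestamp>) only becomes known once its timestamp has been assigned and saved.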