From 1ce6130202bf284aa5bc90bff40ff879c046b94b Mon Sep 17 00:00:00 2001
From: Cristian
Date: Tue, 5 Jan 2021 10:12:26 -0500
Subject: [PATCH] fix: json index was missing `base_url` field

---
 archivebox/core/models.py | 17 +++++------------
 archivebox/index/json.py  |  2 +-
 archivebox/main.py        | 12 ++++++------
 tests/test_add.py         |  4 +++-
 4 files changed, 15 insertions(+), 20 deletions(-)

diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index 388e3f70..655a01c5 100644
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -83,7 +83,7 @@ class Snapshot(models.Model):
     updated = models.DateTimeField(null=True, blank=True, db_index=True)
     tags = models.ManyToManyField(Tag)
 
-    keys = ('url', 'timestamp', 'title', 'tags', 'updated')
+    keys = ('id', 'url', 'timestamp', 'title', 'tags', 'updated', 'base_url')
 
     def __repr__(self) -> str:
         title = self.title or '-'
@@ -109,11 +109,14 @@ class Snapshot(models.Model):
 
     def as_json(self, *args) -> dict:
         args = args or self.keys
-        return {
+        output = {
             key: getattr(self, key)
             if key != 'tags' else self.tags_str()
             for key in args
         }
+        if "id" in output.keys():
+            output["id"] = str(output["id"])
+        return output
 
     def as_csv(self, cols: Optional[List[str]]=None, separator: str=',', ljust: int=0) -> str:
@@ -269,16 +272,6 @@ class Snapshot(models.Model):
         })
         return canonical
 
-    def _asdict(self):
-        return {
-            "id": str(self.id),
-            "url": self.url,
-            "timestamp": self.timestamp,
-            "title": self.title,
-            "added": self.added,
-            "updated": self.updated,
-        }
-
     def save_tags(self, tags=()):
         tags_id = []
         for tag in tags:
diff --git a/archivebox/index/json.py b/archivebox/index/json.py
index 332f8132..5c4c34b5 100644
--- a/archivebox/index/json.py
+++ b/archivebox/index/json.py
@@ -87,7 +87,7 @@ def write_json_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) ->
 
     out_dir = out_dir or snapshot.snapshot_dir
     path = Path(out_dir) / JSON_INDEX_FILENAME
-    atomic_write(str(path), snapshot._asdict())
+    atomic_write(str(path), snapshot.as_json())
 
 
 @enforce_types
diff --git a/archivebox/main.py b/archivebox/main.py
index 6ab6a374..f1032181 100644
--- a/archivebox/main.py
+++ b/archivebox/main.py
@@ -353,7 +353,7 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
     orphaned_json_snapshots = {
         snapshot.url: snapshot
         for snapshot in parse_json_main_index(out_dir)
-        if not all_links.filter(url=link.url).exists()
+        if not all_snapshots.filter(url=link.url).exists()
     }
     if orphaned_json_snapshots:
         pending_snapshots.update(orphaned_json_snapshots)
@@ -381,7 +381,7 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
     }
     if invalid_folders:
         print('    {lightyellow}! Skipped adding {} invalid snapshot data directories.{reset}'.format(len(invalid_folders), **ANSI))
-        print('        X ' + '\n        X '.join(f'{folder} {link}' for folder, link in invalid_folders.items()))
+        print('        X ' + '\n        X '.join(f'{folder} {snapshot}' for folder, snapshot in invalid_folders.items()))
         print()
         print('    {lightred}Hint:{reset} For more information about the link data directories that were skipped, run:'.format(**ANSI))
         print('        archivebox status')
@@ -394,7 +394,7 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
     if existing_index:
         print('{green}[√] Done. Verified and updated the existing ArchiveBox collection.{reset}'.format(**ANSI))
     else:
-        print('{green}[√] Done. A new ArchiveBox collection was initialized ({} links).{reset}'.format(len(all_links), **ANSI))
+        print('{green}[√] Done. A new ArchiveBox collection was initialized ({} snapshots).{reset}'.format(len(all_snapshots), **ANSI))
     print()
     print('    {lightred}Hint:{reset} To view your archive index, run:'.format(**ANSI))
     print('        archivebox server  # then visit http://127.0.0.1:8000')
@@ -577,16 +577,16 @@ def add(urls: Union[str, List[str]],
         for new_snapshot in new_snapshots:
             # TODO: Check if we need to add domain to the Snapshot model
             downloaded_file = save_file_as_source(new_snapshot.url, filename=f'{new_snapshot.timestamp}-crawl-{new_snapshot.url}.txt', out_dir=out_dir)
-            new_snapshots_depth += parse_links_from_source(downloaded_file, root_url=new_snapshot.url)
+            new_snapshots_depth += parse_snapshots_from_source(downloaded_file, root_url=new_snapshot.url)
 
     imported_snapshots = [Snapshot(url=snapshot.url) for snapshot in new_snapshots + new_snapshots_depth]
     new_snapshots = filter_new_urls(all_snapshots, imported_snapshots)
 
     write_main_index(snapshots=new_snapshots, out_dir=out_dir)
-    all_links = load_main_index(out_dir=out_dir)
+    all_snapshots = load_main_index(out_dir=out_dir)
 
     if index_only:
-        return all_links
+        return all_snapshots
 
     # Run the archive methods for each link
     archive_kwargs = {
diff --git a/tests/test_add.py b/tests/test_add.py
index bb15e51b..f35269ae 100644
--- a/tests/test_add.py
+++ b/tests/test_add.py
@@ -35,6 +35,7 @@ def test_depth_flag_0_crawls_only_the_arg_page(tmp_path, process, disable_extrac
     archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
 
     with open(archived_item_path / "index.json", "r") as f:
         output_json = json.load(f)
+    assert output_json["base_url"] == "127.0.0.1:8080/static/example.com.html"
 
 
@@ -90,4 +91,5 @@ def test_extract_input_uses_only_passed_extractors(tmp_path, process):
     archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
 
     assert (archived_item_path / "warc").exists()
-    assert not (archived_item_path / "singlefile.html").exists()
\ No newline at end of file
+    assert not (archived_item_path / "singlefile.html").exists()
+
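Below is a minimal sketch (not part of the patch above) of how the updated Snapshot.as_json() is expected to behave once these changes are applied. It assumes the Snapshot model keeps a UUID `id` field and exposes a `base_url` property, as implied by the new `keys` tuple; the URL mirrors the one asserted in tests/test_add.py, everything else is illustrative:

    # Sketch only, not from the commit: exercises the new 'id' and 'base_url' keys.
    snapshot = Snapshot(url='http://127.0.0.1:8080/static/example.com.html')

    details = snapshot.as_json('id', 'url', 'base_url')   # pass explicit keys to skip 'tags'
    assert isinstance(details['id'], str)                  # UUID coerced to str for JSON output
    assert details['base_url'] == '127.0.0.1:8080/static/example.com.html'  # scheme stripped, per test_add.py

Called with no arguments, as_json() falls back to Snapshot.keys and also includes 'timestamp', 'title', 'tags', and 'updated'; that full dict is what write_json_snapshot_details() now writes into each snapshot's index.json instead of the removed _asdict() output.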