From 1ce6130202bf284aa5bc90bff40ff879c046b94b Mon Sep 17 00:00:00 2001
From: Cristian
Date: Tue, 5 Jan 2021 10:12:26 -0500
Subject: [PATCH] fix: json index was missing `base_url` field

---
 archivebox/core/models.py | 17 +++++------------
 archivebox/index/json.py  |  2 +-
 archivebox/main.py        | 12 ++++++------
 tests/test_add.py         |  4 +++-
 4 files changed, 15 insertions(+), 20 deletions(-)

diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index 388e3f70..655a01c5 100644
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -83,7 +83,7 @@ class Snapshot(models.Model):
     updated = models.DateTimeField(null=True, blank=True, db_index=True)
     tags = models.ManyToManyField(Tag)
 
-    keys = ('url', 'timestamp', 'title', 'tags', 'updated')
+    keys = ('id', 'url', 'timestamp', 'title', 'tags', 'updated', 'base_url')
 
     def __repr__(self) -> str:
         title = self.title or '-'
@@ -109,11 +109,14 @@ class Snapshot(models.Model):
 
     def as_json(self, *args) -> dict:
         args = args or self.keys
-        return {
+        output = {
             key: getattr(self, key)
             if key != 'tags' else self.tags_str()
             for key in args
         }
+        if "id" in output.keys():
+            output["id"] = str(output["id"])
+        return output
 
     def as_csv(self, cols: Optional[List[str]]=None, separator: str=',', ljust: int=0) -> str:
@@ -269,16 +272,6 @@ class Snapshot(models.Model):
         })
         return canonical
 
-    def _asdict(self):
-        return {
-            "id": str(self.id),
-            "url": self.url,
-            "timestamp": self.timestamp,
-            "title": self.title,
-            "added": self.added,
-            "updated": self.updated,
-        }
-
     def save_tags(self, tags=()):
         tags_id = []
         for tag in tags:
diff --git a/archivebox/index/json.py b/archivebox/index/json.py
index 332f8132..5c4c34b5 100644
--- a/archivebox/index/json.py
+++ b/archivebox/index/json.py
@@ -87,7 +87,7 @@ def write_json_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) ->
 
     out_dir = out_dir or snapshot.snapshot_dir
     path = Path(out_dir) / JSON_INDEX_FILENAME
-    atomic_write(str(path), snapshot._asdict())
+    atomic_write(str(path), snapshot.as_json())
 
 
 @enforce_types
diff --git a/archivebox/main.py b/archivebox/main.py
index 6ab6a374..f1032181 100644
--- a/archivebox/main.py
+++ b/archivebox/main.py
@@ -353,7 +353,7 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
     orphaned_json_snapshots = {
         snapshot.url: snapshot
         for snapshot in parse_json_main_index(out_dir)
-        if not all_links.filter(url=link.url).exists()
+        if not all_snapshots.filter(url=link.url).exists()
     }
     if orphaned_json_snapshots:
         pending_snapshots.update(orphaned_json_snapshots)
@@ -381,7 +381,7 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
     }
     if invalid_folders:
         print('    {lightyellow}! Skipped adding {} invalid snapshot data directories.{reset}'.format(len(invalid_folders), **ANSI))
-        print('        X ' + '\n        X '.join(f'{folder} {link}' for folder, link in invalid_folders.items()))
+        print('        X ' + '\n        X '.join(f'{folder} {snapshot}' for folder, snapshot in invalid_folders.items()))
         print()
         print('    {lightred}Hint:{reset} For more information about the link data directories that were skipped, run:'.format(**ANSI))
         print('        archivebox status')
@@ -394,7 +394,7 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
     if existing_index:
         print('{green}[√] Done. Verified and updated the existing ArchiveBox collection.{reset}'.format(**ANSI))
     else:
-        print('{green}[√] Done. A new ArchiveBox collection was initialized ({} links).{reset}'.format(len(all_links), **ANSI))
+        print('{green}[√] Done. A new ArchiveBox collection was initialized ({} snapshots).{reset}'.format(len(all_snapshots), **ANSI))
     print()
     print('    {lightred}Hint:{reset} To view your archive index, run:'.format(**ANSI))
     print('        archivebox server  # then visit http://127.0.0.1:8000')
@@ -577,16 +577,16 @@ def add(urls: Union[str, List[str]],
         for new_snapshot in new_snapshots:
             # TODO: Check if we need to add domain to the Snapshot model
             downloaded_file = save_file_as_source(new_snapshot.url, filename=f'{new_snapshot.timestamp}-crawl-{new_snapshot.url}.txt', out_dir=out_dir)
-            new_snapshots_depth += parse_links_from_source(downloaded_file, root_url=new_snapshot.url)
+            new_snapshots_depth += parse_snapshots_from_source(downloaded_file, root_url=new_snapshot.url)
 
     imported_snapshots = [Snapshot(url=snapshot.url) for snapshot in new_snapshots + new_snapshots_depth]
     new_snapshots = filter_new_urls(all_snapshots, imported_snapshots)
 
     write_main_index(snapshots=new_snapshots, out_dir=out_dir)
-    all_links = load_main_index(out_dir=out_dir)
+    all_snapshots = load_main_index(out_dir=out_dir)
 
     if index_only:
-        return all_links
+        return all_snapshots
 
     # Run the archive methods for each link
     archive_kwargs = {
diff --git a/tests/test_add.py b/tests/test_add.py
index bb15e51b..f35269ae 100644
--- a/tests/test_add.py
+++ b/tests/test_add.py
@@ -35,6 +35,7 @@ def test_depth_flag_0_crawls_only_the_arg_page(tmp_path, process, disable_extrac
     archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
 
     with open(archived_item_path / "index.json", "r") as f:
         output_json = json.load(f)
+    assert output_json["base_url"] == "127.0.0.1:8080/static/example.com.html"
 
 
@@ -90,4 +91,5 @@ def test_extract_input_uses_only_passed_extractors(tmp_path, process):
     archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
 
     assert (archived_item_path / "warc").exists()
-    assert not (archived_item_path / "singlefile.html").exists()
\ No newline at end of file
+    assert not (archived_item_path / "singlefile.html").exists()
+
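Below is a minimal sketch (not part of the patch above) of how the updated Snapshot.as_json() is expected to behave once these changes are applied. It assumes the Snapshot model keeps a UUID `id` field and exposes a `base_url` property, as implied by the new `keys` tuple; the URL mirrors the one asserted in tests/test_add.py, everything else is illustrative:

    # Sketch only, not from the commit: exercises the new 'id' and 'base_url' keys.
    snapshot = Snapshot(url='http://127.0.0.1:8080/static/example.com.html')

    details = snapshot.as_json('id', 'url', 'base_url')   # pass explicit keys to skip 'tags'
    assert isinstance(details['id'], str)                  # UUID coerced to str for JSON output
    assert details['base_url'] == '127.0.0.1:8080/static/example.com.html'  # scheme stripped, per test_add.py

Called with no arguments, as_json() falls back to Snapshot.keys and also includes 'timestamp', 'title', 'tags', and 'updated'; that full dict is what write_json_snapshot_details() now writes into each snapshot's index.json instead of the removed _asdict() output.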