From 30c886d4d46617e5f16175edcf795043110c5051 Mon Sep 17 00:00:00 2001 From: Cristian Date: Sat, 16 Jan 2021 14:11:45 -0500 Subject: [PATCH] fix: Overwrite(add command) was using snapshots without timestamps (and in memory), causing issues with some extractors --- archivebox/core/models.py | 1 + archivebox/extractors/__init__.py | 2 +- archivebox/main.py | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 5bf2a25b..a225f4d7 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -106,6 +106,7 @@ class Snapshot(models.Model): if "tags" in info: # TODO: Handle tags info.pop("tags") + info.pop("base_url", None) return cls(**info) def get_history(self) -> dict: diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py index 4c6e6a67..ea12faec 100644 --- a/archivebox/extractors/__init__.py +++ b/archivebox/extractors/__init__.py @@ -114,7 +114,7 @@ def archive_snapshot(snapshot: Model, overwrite: bool=False, methods: Optional[I # print('{black} X {}{reset}'.format(method_name, **ANSI)) stats['skipped'] += 1 except Exception as e: - raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format( + raise Exception('Exception in archive_methods.save_{}(Snapshot(url={}))'.format( method_name, snapshot.url, )) from e diff --git a/archivebox/main.py b/archivebox/main.py index f2ac951c..1daf64cf 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -597,6 +597,7 @@ def add(urls: Union[str, List[str]], if update_all: archive_snapshots(all_snapshots, overwrite=overwrite, **archive_kwargs) elif overwrite: + imported_snapshots = Snapshot.objects.filter(url__in=[imported_snapshot.url for imported_snapshot in imported_snapshots]) archive_snapshots(imported_snapshots, overwrite=True, **archive_kwargs) elif new_snapshots: archive_snapshots(new_snapshots, overwrite=False, **archive_kwargs)