diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index d0fac3e8..b34c0212 100644
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -138,7 +138,7 @@ class Snapshot(models.Model):
     @cached_property
     def snapshot_dir(self):
         from ..config import CONFIG
-        return str(Path(CONFIG['ARCHIVE_DIR']) / self.timestamp)
+        return Path(CONFIG['ARCHIVE_DIR']) / self.timestamp
 
     @cached_property
     def archive_path(self):
@@ -173,6 +173,12 @@ class Snapshot(models.Model):
         from ..util import is_static_file
         return is_static_file(self.url)
 
+    @cached_property
+    def details(self) -> Dict:
+        # TODO: Define what details are, and return them accordingly
+        return {"history": {}}
+
+
     def canonical_outputs(self) -> Dict[str, Optional[str]]:
         """predict the expected output paths that should be present after archiving"""
 
diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py
index bab4a315..278706eb 100644
--- a/archivebox/extractors/__init__.py
+++ b/archivebox/extractors/__init__.py
@@ -88,7 +88,7 @@ def archive_snapshot(snapshot: Model, overwrite: bool=False, methods: Optional[I
             details = {"history": {}}
             write_snapshot_details(snapshot, out_dir=out_dir, skip_sql_index=False)
         else:
-            details = load_snapshot_details(snapshot)
+            details = snapshot.details
 
         #log_link_archiving_started(link, out_dir, is_new)
         stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py
index be5ae783..fb6ffe7d 100644
--- a/archivebox/index/__init__.py
+++ b/archivebox/index/__init__.py
@@ -42,7 +42,7 @@ from .html import (
     write_html_snapshot_details,
 )
 from .json import (
-    load_json_snapshot_details,
+    load_json_snapshot,
     parse_json_snapshot_details,
     write_json_snapshot_details,
 )
@@ -441,7 +441,7 @@ def get_duplicate_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Opti
             path = path.snapshot_dir
 
         try:
-            snapshot = parse_json_snapshot_details(path)
+            snapshot = load_json_snapshot(path)
         except Exception:
             pass
 
@@ -530,7 +530,7 @@ def is_valid(snapshot: Model) -> bool:
     if dir_exists and index_exists:
         try:
             # TODO: review if the `guess` was necessary here
-            parsed_snapshot = parse_json_snapshot_details(snapshot.snapshot_dir)
+            parsed_snapshot = load_json_snapshot(snapshot.snapshot_dir)
             return snapshot.url == parsed_snapshot.url
         except Exception:
             pass
diff --git a/archivebox/index/json.py b/archivebox/index/json.py
index 135e68f8..f6e54372 100644
--- a/archivebox/index/json.py
+++ b/archivebox/index/json.py
@@ -91,17 +91,18 @@ def write_json_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) ->
 
 
 @enforce_types
-def load_json_snapshot_details(out_dir: Path) -> Optional[Model]:
+def load_json_snapshot(out_dir: Path) -> Optional[Model]:
     """
     Loads the detail from the local json index
     """
+    from core.models import Snapshot
+
     existing_index = Path(out_dir) / JSON_INDEX_FILENAME
     if existing_index.exists():
         with open(existing_index, 'r', encoding='utf-8') as f:
             try:
                 output = pyjson.load(f)
-                if "history" not in output.keys():
-                    output["history"] = {}
+                output = Snapshot.from_json(output)
                 return output
             except pyjson.JSONDecodeError:
                 pass
@@ -110,13 +111,13 @@ def parse_json_snapshot_details(out_dir: Union[Path, str]) -> Iterator[dict]:
-    """read through all the archive data folders and return the parsed links"""
+    """read through all the archive data folders and return the parsed snapshots"""
 
     for entry in os.scandir(Path(out_dir)):
         if entry.is_dir(follow_symlinks=True):
             if (Path(entry.path) / 'index.json').exists():
                 try:
-                    snapshot_details = load_snapshot_details(entry.path)
+                    snapshot_details = load_json_snapshot(entry.path)
                 except KeyError:
                     snapshot_details = None
                 if snapshot_details:
diff --git a/archivebox/main.py b/archivebox/main.py
index bb6d348d..443a5d0a 100644
--- a/archivebox/main.py
+++ b/archivebox/main.py
@@ -427,11 +427,11 @@ def status(out_dir: Path=OUTPUT_DIR) -> None:
     print(f'    Index size: {size} across {num_files} files')
     print()
 
-    links = load_main_index(out_dir=out_dir)
-    num_sql_links = links.count()
-    num_link_details = sum(1 for link in parse_json_links_details(out_dir=out_dir))
-    print(f'    > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {SQL_INDEX_FILENAME})')
-    print(f'    > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR_NAME}/*/index.json)')
+    snapshots = load_main_index(out_dir=out_dir)
+    num_sql_snapshots = snapshots.count()
+    num_snapshot_details = sum(1 for snapshot in parse_json_snapshot_details(out_dir=out_dir))
+    print(f'    > SQL Main Index: {num_sql_snapshots} snapshots'.ljust(36), f'(found in {SQL_INDEX_FILENAME})')
+    print(f'    > JSON Link Details: {num_snapshot_details} snapshots'.ljust(36), f'(found in {ARCHIVE_DIR_NAME}/*/index.json)')
     print()
     print('{green}[*] Scanning archive data directories...{reset}'.format(**ANSI))
     print(ANSI['lightyellow'], f'   {ARCHIVE_DIR}/*', ANSI['reset'])
@@ -439,23 +439,23 @@
     size = printable_filesize(num_bytes)
     print(f'    Size: {size} across {num_files} files in {num_dirs} directories')
     print(ANSI['black'])
-    num_indexed = len(get_indexed_folders(links, out_dir=out_dir))
-    num_archived = len(get_archived_folders(links, out_dir=out_dir))
-    num_unarchived = len(get_unarchived_folders(links, out_dir=out_dir))
+    num_indexed = len(get_indexed_folders(snapshots, out_dir=out_dir))
+    num_archived = len(get_archived_folders(snapshots, out_dir=out_dir))
+    num_unarchived = len(get_unarchived_folders(snapshots, out_dir=out_dir))
     print(f'    > indexed: {num_indexed}'.ljust(36), f'({get_indexed_folders.__doc__})')
     print(f'    > archived: {num_archived}'.ljust(36), f'({get_archived_folders.__doc__})')
     print(f'    > unarchived: {num_unarchived}'.ljust(36), f'({get_unarchived_folders.__doc__})')
 
-    num_present = len(get_present_folders(links, out_dir=out_dir))
-    num_valid = len(get_valid_folders(links, out_dir=out_dir))
+    num_present = len(get_present_folders(snapshots, out_dir=out_dir))
+    num_valid = len(get_valid_folders(snapshots, out_dir=out_dir))
     print()
     print(f'    > present: {num_present}'.ljust(36), f'({get_present_folders.__doc__})')
     print(f'    > valid: {num_valid}'.ljust(36), f'({get_valid_folders.__doc__})')
 
-    duplicate = get_duplicate_folders(links, out_dir=out_dir)
-    orphaned = get_orphaned_folders(links, out_dir=out_dir)
-    corrupted = get_corrupted_folders(links, out_dir=out_dir)
-    unrecognized = get_unrecognized_folders(links, out_dir=out_dir)
+    duplicate = get_duplicate_folders(snapshots, out_dir=out_dir)
+    orphaned = get_orphaned_folders(snapshots, out_dir=out_dir)
+    corrupted = get_corrupted_folders(snapshots, out_dir=out_dir)
+    unrecognized = get_unrecognized_folders(snapshots, out_dir=out_dir)
     num_invalid = len({**duplicate, **orphaned, **corrupted, **unrecognized})
     print(f'    > invalid: {num_invalid}'.ljust(36), f'({get_invalid_folders.__doc__})')
     print(f'    > duplicate: {len(duplicate)}'.ljust(36), f'({get_duplicate_folders.__doc__})')
@@ -466,7 +466,7 @@ def status(out_dir: Path=OUTPUT_DIR) -> None:
     print(ANSI['reset'])
 
     if num_indexed:
-        print('    {lightred}Hint:{reset} You can list link data directories by status like so:'.format(**ANSI))
+        print('    {lightred}Hint:{reset} You can list snapshot data directories by status like so:'.format(**ANSI))
         print('        archivebox list --status=<status>  (e.g. indexed, corrupted, archived, etc.)')
 
     if orphaned:
@@ -495,7 +495,7 @@ def status(out_dir: Path=OUTPUT_DIR) -> None:
             print('        archivebox manage createsuperuser')
 
     print()
-    for snapshot in links.order_by('-updated')[:10]:
+    for snapshot in snapshots.order_by('-updated')[:10]:
         if not snapshot.updated:
             continue
         print(