From ea84607b471f095f43e01cd95c33899345f96b4d Mon Sep 17 00:00:00 2001 From: Cristian Date: Fri, 1 Jan 2021 13:58:55 -0500 Subject: [PATCH] fix: Init and status commands now are able to navigate the right archive folder --- archivebox/index/__init__.py | 2 +- archivebox/index/json.py | 2 +- archivebox/main.py | 21 +++++++++++++-------- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py index aff1d36b..8a637fdb 100644 --- a/archivebox/index/__init__.py +++ b/archivebox/index/__init__.py @@ -464,7 +464,7 @@ def get_orphaned_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optio if entry.is_dir(): snapshot = None try: - snapshot = load_json_snapshot(str(entry)) + snapshot = load_json_snapshot(entry) except Exception: pass diff --git a/archivebox/index/json.py b/archivebox/index/json.py index 799320e7..332f8132 100644 --- a/archivebox/index/json.py +++ b/archivebox/index/json.py @@ -113,7 +113,7 @@ def load_json_snapshot(out_dir: Path) -> Optional[Model]: def parse_json_snapshot_details(out_dir: Union[Path, str]) -> Iterator[dict]: """read through all the archive data folders and return the parsed snapshots""" - for entry in os.scandir(Path(out_dir)): + for entry in os.scandir(Path(out_dir) / ARCHIVE_DIR_NAME): if entry.is_dir(follow_symlinks=True): if (Path(entry.path) / 'index.json').exists(): try: diff --git a/archivebox/main.py b/archivebox/main.py index 5dc9484f..6ab6a374 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -335,8 +335,8 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None: print() print('{green}[*] Collecting links from any existing indexes and archive folders...{reset}'.format(**ANSI)) - all_links = Snapshot.objects.none() - pending_snapshots: Dict[str, Link] = {} + all_snapshots = Snapshot.objects.none() + pending_snapshots: Dict[str, Snapshot] = {} if existing_index: all_snapshots = load_main_index(out_dir=out_dir, warn=False) @@ -350,14 
+350,14 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None: print(' {lightyellow}! Could not fix {} data directory locations due to conflicts with existing folders.{reset}'.format(len(cant_fix), **ANSI)) # Links in JSON index but not in main index - orphaned_json_links = { - link.url: link - for link in parse_json_main_index(out_dir) + orphaned_json_snapshots = { + snapshot.url: snapshot + for snapshot in parse_json_main_index(out_dir) - if not all_links.filter(url=link.url).exists() + if not all_snapshots.filter(url=snapshot.url).exists() } - if orphaned_json_links: - pending_links.update(orphaned_json_links) - print(' {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI)) + if orphaned_json_snapshots: + pending_snapshots.update(orphaned_json_snapshots) + print(' {lightyellow}√ Added {} orphaned snapshots from deprecated JSON index...{reset}'.format(len(orphaned_json_snapshots), **ANSI)) # Links in data dir indexes but not in main index orphaned_data_dir_snapshots = { @@ -369,6 +369,11 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None: pending_snapshots.update(orphaned_data_dir_snapshots) print(' {lightyellow}√ Added {} orphaned snapshots from existing archive directories.{reset}'.format(len(orphaned_data_dir_snapshots), **ANSI)) + + # TODO: Should we remove orphaned folders from the invalid list? With init they are being imported, but the same links that were + # listed as just imported are listed as skipped because they are invalid. At the very least I think we should improve this message, + # because it makes this command a little more confusing. + # Links in invalid/duplicate data dirs invalid_folders = { folder: snapshot