diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py
index 32af7c1d..b4c37845 100644
--- a/archivebox/index/__init__.py
+++ b/archivebox/index/__init__.py
@@ -249,7 +249,6 @@ def load_main_index(out_dir: Path=OUTPUT_DIR, warn: bool=True) -> List[Link]:
     """
     Returns all of the snapshots currently in index
     """
-    setup_django(out_dir, check_db=True)
     from core.models import Snapshot
     try:
         return Snapshot.objects.all()
diff --git a/archivebox/index/json.py b/archivebox/index/json.py
index ed4c255d..2527944d 100644
--- a/archivebox/index/json.py
+++ b/archivebox/index/json.py
@@ -92,20 +92,7 @@ def write_json_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) ->
 
 @enforce_types
-def parse_json_link_details(out_dir: Union[Path, str], guess: Optional[bool]=False) -> Optional[Model]:
-    """load the json link index from a given directory"""
-    existing_index = Path(out_dir) / JSON_INDEX_FILENAME
-    if existing_index.exists():
-        with open(existing_index, 'r', encoding='utf-8') as f:
-            try:
-                link_json = pyjson.load(f)
-                return Link.from_json(link_json, guess)
-            except pyjson.JSONDecodeError:
-                pass
-    return None
-
-@enforce_types
-def load_snapshot_details(snapshot: Model, out_dir: Path):
+def load_snapshot_details(out_dir: Path) -> Optional[Model]:
     """
     Loads the detail from the local json index
     """
@@ -119,20 +106,19 @@ def load_snapshot_details(snapshot: Model, out_dir: Path):
 
     return None
 
-
 @enforce_types
-def parse_json_snapshot_details(out_dir: Union[Path, str]) -> Iterator[Link]:
+def parse_json_snapshot_details(out_dir: Union[Path, str]) -> Iterator[Model]:
     """read through all the archive data folders and return the parsed links"""
     for entry in os.scandir(Path(out_dir) / ARCHIVE_DIR_NAME):
         if entry.is_dir(follow_symlinks=True):
             if (Path(entry.path) / 'index.json').exists():
                 try:
-                    link = parse_json_snapshot_details(entry.path)
+                    snapshot_details = load_snapshot_details(entry.path)
                 except KeyError:
-                    link = None
-                if link:
-                    yield link
+                    snapshot_details = None
+                if snapshot_details:
+                    yield snapshot_details
diff --git a/archivebox/main.py b/archivebox/main.py
index 71147f59..f7dafb5d 100644
--- a/archivebox/main.py
+++ b/archivebox/main.py
@@ -29,7 +29,6 @@ from .util import enforce_types # type: ignore
 from .system import get_dir_size, dedupe_cron_jobs, CRON_COMMENT
 from .index import (
     load_main_index,
-    get_empty_snapshot_queryset,
     parse_snapshots_from_source,
     filter_new_urls,
     write_main_index,
@@ -340,8 +339,8 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
 
     pending_links: Dict[str, Link] = {}
     if existing_index:
-        all_links = load_main_index(out_dir=out_dir, warn=False)
-        print('    √ Loaded {} links from existing main index.'.format(all_links.count()))
+        all_snapshots = load_main_index(out_dir=out_dir, warn=False)
+        print('    √ Loaded {} snapshots from existing main index.'.format(all_snapshots.count()))
 
     # Links in data folders that dont match their timestamp
     fixed, cant_fix = fix_invalid_folder_locations(out_dir=out_dir)
@@ -361,22 +360,22 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
         print('    {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI))
 
     # Links in data dir indexes but not in main index
-    orphaned_data_dir_links = {
-        link.url: link
-        for link in parse_json_links_details(out_dir)
-        if not all_links.filter(url=link.url).exists()
+    orphaned_data_dir_snapshots = {
+        snapshot.url: snapshot
+        for snapshot in parse_json_snapshot_details(out_dir)
+        if not all_snapshots.filter(url=snapshot.url).exists()
     }
-    if orphaned_data_dir_links:
-        pending_links.update(orphaned_data_dir_links)
-        print('    {lightyellow}√ Added {} orphaned links from existing archive directories.{reset}'.format(len(orphaned_data_dir_links), **ANSI))
+    if orphaned_data_dir_snapshots:
+        pending_links.update(orphaned_data_dir_snapshots)
+        print('    {lightyellow}√ Added {} orphaned snapshots from existing archive directories.{reset}'.format(len(orphaned_data_dir_snapshots), **ANSI))
 
     # Links in invalid/duplicate data dirs
     invalid_folders = {
-        folder: link
-        for folder, link in get_invalid_folders(all_links, out_dir=out_dir).items()
+        folder: snapshot
+        for folder, snapshot in get_invalid_folders(all_snapshots, out_dir=out_dir).items()
     }
     if invalid_folders:
-        print('    {lightyellow}! Skipped adding {} invalid link data directories.{reset}'.format(len(invalid_folders), **ANSI))
+        print('    {lightyellow}! Skipped adding {} invalid snapshot data directories.{reset}'.format(len(invalid_folders), **ANSI))
         print('        X ' + '\n        X '.join(f'{folder} {link}' for folder, link in invalid_folders.items()))
         print()
         print('    {lightred}Hint:{reset} For more information about the link data directories that were skipped, run:'.format(**ANSI))
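
For reviewers, the read path after this patch is: parse_json_snapshot_details() scans each folder under archive/ and delegates each one to load_snapshot_details(), which parses that folder's index.json; init() then consumes the iterator to collect orphaned snapshots. Below is a minimal standalone sketch of that flow, assuming load_snapshot_details() keeps the JSON-loading logic of the removed parse_json_link_details() (its body is outside these hunks), and returning plain dicts where the real code returns Snapshot model objects:

    # Reviewer sketch only -- not part of the patch. Assumes load_snapshot_details()
    # reuses the JSON-loading logic of the removed parse_json_link_details(), and
    # returns plain dicts here where the real code builds Snapshot model objects.
    import json as pyjson
    import os
    from pathlib import Path
    from typing import Iterator, Optional, Union

    JSON_INDEX_FILENAME = 'index.json'  # same constant name as archivebox/index/json.py
    ARCHIVE_DIR_NAME = 'archive'        # same constant name as archivebox/index/json.py

    def load_snapshot_details(out_dir: Union[Path, str]) -> Optional[dict]:
        """Load one snapshot's details from <out_dir>/index.json, or None if unreadable."""
        existing_index = Path(out_dir) / JSON_INDEX_FILENAME
        if existing_index.exists():
            with open(existing_index, 'r', encoding='utf-8') as f:
                try:
                    return pyjson.load(f)
                except pyjson.JSONDecodeError:
                    pass
        return None

    def parse_json_snapshot_details(out_dir: Union[Path, str]) -> Iterator[dict]:
        """Yield details for every archive subfolder with a parseable index.json."""
        for entry in os.scandir(Path(out_dir) / ARCHIVE_DIR_NAME):
            if entry.is_dir(follow_symlinks=True):
                if (Path(entry.path) / JSON_INDEX_FILENAME).exists():
                    try:
                        snapshot_details = load_snapshot_details(entry.path)
                    except KeyError:
                        snapshot_details = None
                    if snapshot_details:
                        yield snapshot_details

Note the per-folder try/except plus the None check: one corrupt or incomplete index.json skips that folder rather than aborting the whole scan, which is what init() relies on when it builds orphaned_data_dir_snapshots from this iterator.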