From d92083b928173ccf75918ff2c7ace2471b8d9cd3 Mon Sep 17 00:00:00 2001
From: Cristian
Date: Wed, 30 Dec 2020 12:25:32 -0500
Subject: [PATCH] refactor: update command is functional

---
 archivebox/extractors/__init__.py |   2 +-
 archivebox/index/__init__.py      | 158 ++++++++++++++----------------
 archivebox/index/json.py          |   9 +-
 archivebox/main.py                |  21 ++--
 4 files changed, 91 insertions(+), 99 deletions(-)

diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py
index 120d116a..bab4a315 100644
--- a/archivebox/extractors/__init__.py
+++ b/archivebox/extractors/__init__.py
@@ -88,7 +88,7 @@ def archive_snapshot(snapshot: Model, overwrite: bool=False, methods: Optional[I
             details = {"history": {}}
             write_snapshot_details(snapshot, out_dir=out_dir, skip_sql_index=False)
         else:
-            details = list(load_snapshot_details(snapshot))
+            details = load_snapshot_details(snapshot)
 
         #log_link_archiving_started(link, out_dir, is_new)
         stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py
index b4c37845..be5ae783 100644
--- a/archivebox/index/__init__.py
+++ b/archivebox/index/__init__.py
@@ -42,6 +42,7 @@ from .html import (
     write_html_snapshot_details,
 )
 from .json import (
+    load_json_snapshot_details,
     parse_json_snapshot_details,
     write_json_snapshot_details,
 )
@@ -318,9 +319,9 @@ def load_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) -> Model
     """check for an existing link archive in the given directory,
     and load+merge it into the given link dict
     """
-    out_dir = out_dir or snapshot.snapshot_dir
+    out_dir = out_dir or Path(snapshot.snapshot_dir)
 
-    existing_snapshot = parse_json_snapshot_details(out_dir)
+    existing_snapshot = load_json_snapshot_details(out_dir)
     if existing_snapshot:
         return merge_snapshots(existing_snapshot, snapshot)
 
@@ -379,56 +380,41 @@ def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type
     return search_filter(snapshots, filter_patterns, filter_type)
 
 
-def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
     """indexed links without checking archive status or data directory validity"""
-    links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
-    return {
-        link.link_dir: link
-        for link in links
-    }
+    return {snapshot.snapshot_dir: snapshot for snapshot in snapshots}
 
-def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
     """indexed links that are archived with a valid data directory"""
-    links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
-    return {
-        link.link_dir: link
-        for link in filter(is_archived, links)
-    }
+    return {snapshot.snapshot_dir: snapshot for snapshot in filter(is_archived, snapshots)}
 
-def get_unarchived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_unarchived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
     """indexed links that are unarchived with no data directory or an empty data directory"""
-    links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
-    return {
-        link.link_dir: link
-        for link in filter(is_unarchived, links)
-    }
+    return {snapshot.snapshot_dir: snapshot for snapshot in filter(is_unarchived, snapshots)}
 
-def get_present_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_present_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
     """dirs that actually exist in the archive/ folder"""
+    from core.models import Snapshot
 
     all_folders = {}
 
     for entry in (out_dir / ARCHIVE_DIR_NAME).iterdir():
         if entry.is_dir():
-            link = None
+            snapshot = None
             try:
-                link = parse_json_link_details(entry.path)
+                snapshot = parse_json_snapshot_details(entry.path)
             except Exception:
                 pass
 
-            all_folders[entry.name] = link
+            all_folders[entry.name] = snapshot
 
     return all_folders
 
-def get_valid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_valid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
     """dirs with a valid index matched to the main index and archived content"""
-    links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
-    return {
-        link.link_dir: link
-        for link in filter(is_valid, links)
-    }
+    return {snapshot.snapshot_dir: snapshot for snapshot in filter(is_valid, snapshots)}
 
-def get_invalid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_invalid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
     """dirs that are invalid for any reason: corrupted/duplicate/orphaned/unrecognized"""
     duplicate = get_duplicate_folders(snapshots, out_dir=OUTPUT_DIR)
     orphaned = get_orphaned_folders(snapshots, out_dir=OUTPUT_DIR)
@@ -437,7 +423,7 @@ def get_invalid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option
     return {**duplicate, **orphaned, **corrupted, **unrecognized}
 
 
-def get_duplicate_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_duplicate_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
     """dirs that conflict with other directories that have the same link URL or timestamp"""
     by_url = {}
     by_timestamp = {}
@@ -450,91 +436,92 @@ def get_duplicate_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Opti
     )
 
     for path in chain(snapshots.iterator(), data_folders):
-        link = None
+        snapshot = None
         if type(path) is not str:
-            path = path.as_link().link_dir
+            path = path.snapshot_dir
 
         try:
-            link = parse_json_link_details(path)
+            snapshot = parse_json_snapshot_details(path)
         except Exception:
             pass
 
-        if link:
-            # link folder has same timestamp as different link folder
-            by_timestamp[link.timestamp] = by_timestamp.get(link.timestamp, 0) + 1
-            if by_timestamp[link.timestamp] > 1:
-                duplicate_folders[path] = link
+        if snapshot:
+            # snapshot folder has same timestamp as different snapshot folder
+            by_timestamp[snapshot.timestamp] = by_timestamp.get(snapshot.timestamp, 0) + 1
+            if by_timestamp[snapshot.timestamp] > 1:
+                duplicate_folders[path] = snapshot
 
             # link folder has same url as different link folder
-            by_url[link.url] = by_url.get(link.url, 0) + 1
-            if by_url[link.url] > 1:
-                duplicate_folders[path] = link
+            by_url[snapshot.url] = by_url.get(snapshot.url, 0) + 1
+            if by_url[snapshot.url] > 1:
+                duplicate_folders[path] = snapshot
 
     return duplicate_folders
 
-def get_orphaned_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_orphaned_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
    """dirs that contain a valid index but aren't listed in the main index"""
     orphaned_folders = {}
 
     for entry in (Path(out_dir) / ARCHIVE_DIR_NAME).iterdir():
         if entry.is_dir():
-            link = None
+            snapshot = None
             try:
-                link = parse_json_link_details(str(entry))
+                snapshot = parse_json_snapshot_details(str(entry))
             except Exception:
                 pass
 
-            if link and not snapshots.filter(timestamp=entry.name).exists():
+            if snapshot and not snapshots.filter(timestamp=entry.name).exists():
                 # folder is a valid link data dir with index details, but it's not in the main index
-                orphaned_folders[str(entry)] = link
+                orphaned_folders[str(entry)] = snapshot
 
     return orphaned_folders
 
-def get_corrupted_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_corrupted_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
     """dirs that don't contain a valid index and aren't listed in the main index"""
     corrupted = {}
     for snapshot in snapshots.iterator():
-        link = snapshot.as_link()
-        if is_corrupt(link):
-            corrupted[link.link_dir] = link
+        if is_corrupt(snapshot):
+            corrupted[snapshot.snapshot_dir] = snapshot
     return corrupted
 
-def get_unrecognized_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_unrecognized_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
     """dirs that don't contain recognizable archive data and aren't listed in the main index"""
-    unrecognized_folders: Dict[str, Optional[Link]] = {}
+    unrecognized_folders: Dict[str, Optional[Model]] = {}
 
     for entry in (Path(out_dir) / ARCHIVE_DIR_NAME).iterdir():
         if entry.is_dir():
             index_exists = (entry / "index.json").exists()
-            link = None
+            snapshot = None
             try:
-                link = parse_json_link_details(str(entry))
+                snapshot = parse_json_snapshot_details(str(entry))
             except KeyError:
                 # Try to fix index
                 if index_exists:
-                    try:
+                    pass
+                    # TODO: Implement the `guess` bit for snapshots
+                    # try:
                         # Last attempt to repair the detail index
-                        link_guessed = parse_json_link_details(str(entry), guess=True)
-                        write_json_link_details(link_guessed, out_dir=str(entry))
-                        link = parse_json_link_details(str(entry))
-                    except Exception:
-                        pass
+                    #     link_guessed = parse_json_snapshot_details(str(entry), guess=True)
+                    #     write_json_snapshot_details(link_guessed, out_dir=str(entry))
+                    #     link = parse_json_link_details(str(entry))
+                    # except Exception:
+                    #     pass
 
-            if index_exists and link is None:
+            if index_exists and snapshot is None:
                 # index exists but it's corrupted or unparseable
-                unrecognized_folders[str(entry)] = link
+                unrecognized_folders[str(entry)] = snapshot
 
             elif not index_exists:
                 # link details index doesn't exist and the folder isn't in the main index
                 timestamp = entry.name
                 if not snapshots.filter(timestamp=timestamp).exists():
-                    unrecognized_folders[str(entry)] = link
+                    unrecognized_folders[str(entry)] = snapshot
 
     return unrecognized_folders
 
-def is_valid(link: Link) -> bool:
-    dir_exists = Path(link.link_dir).exists()
-    index_exists = (Path(link.link_dir) / "index.json").exists()
+def is_valid(snapshot: Model) -> bool:
+    dir_exists = Path(snapshot.snapshot_dir).exists()
+    index_exists = (Path(snapshot.snapshot_dir) / "index.json").exists()
     if not dir_exists:
         # unarchived links are not included in the valid list
         return False
@@ -542,29 +529,30 @@ def is_valid(link: Link) -> bool:
         return False
     if dir_exists and index_exists:
         try:
-            parsed_link = parse_json_link_details(link.link_dir, guess=True)
-            return link.url == parsed_link.url
+            # TODO: review if the `guess` was necessary here
+            parsed_snapshot = parse_json_snapshot_details(snapshot.snapshot_dir)
+            return snapshot.url == parsed_snapshot.url
         except Exception:
             pass
     return False
 
-def is_corrupt(link: Link) -> bool:
-    if not Path(link.link_dir).exists():
+def is_corrupt(snapshot: Model) -> bool:
+    if not Path(snapshot.snapshot_dir).exists():
         # unarchived links are not considered corrupt
         return False
 
-    if is_valid(link):
+    if is_valid(snapshot):
         return False
 
     return True
 
-def is_archived(link: Link) -> bool:
-    return is_valid(link) and link.is_archived
+def is_archived(snapshot: Model) -> bool:
    return is_valid(snapshot) and snapshot.is_archived
 
-def is_unarchived(link: Link) -> bool:
-    if not Path(link.link_dir).exists():
+def is_unarchived(snapshot: Model) -> bool:
+    if not Path(snapshot.snapshot_dir).exists():
         return True
-    return not link.is_archived
+    return not snapshot.is_archived
 
 
 def fix_invalid_folder_locations(out_dir: Path=OUTPUT_DIR) -> Tuple[List[str], List[str]]:
@@ -574,22 +562,22 @@ def fix_invalid_folder_locations(out_dir: Path=OUTPUT_DIR) -> Tuple[List[str], L
         if entry.is_dir(follow_symlinks=True):
             if (Path(entry.path) / 'index.json').exists():
                 try:
-                    link = parse_json_link_details(entry.path)
+                    snapshot = parse_json_snapshot_details(entry.path)
                 except KeyError:
-                    link = None
-                if not link:
+                    snapshot = None
+                if not snapshot:
                     continue
 
                 if not entry.path.endswith(f'/{link.timestamp}'):
-                    dest = out_dir / ARCHIVE_DIR_NAME / link.timestamp
+                    dest = out_dir / ARCHIVE_DIR_NAME / snapshot.timestamp
                     if dest.exists():
                         cant_fix.append(entry.path)
                     else:
                         shutil.move(entry.path, dest)
                         fixed.append(dest)
                         timestamp = entry.path.rsplit('/', 1)[-1]
-                        assert link.link_dir == entry.path
-                        assert link.timestamp == timestamp
-                        write_json_link_details(link, out_dir=entry.path)
+                        assert snapshot.snapshot_dir == entry.path
+                        assert snapshot.timestamp == timestamp
+                        write_json_snapshot_details(snapshot, out_dir=entry.path)
 
     return fixed, cant_fix
diff --git a/archivebox/index/json.py b/archivebox/index/json.py
index 6e988b54..135e68f8 100644
--- a/archivebox/index/json.py
+++ b/archivebox/index/json.py
@@ -91,7 +91,7 @@ def write_json_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) ->
 
 
 @enforce_types
-def load_snapshot_details(out_Dir: Path) -> Optional[Model]:
+def load_json_snapshot_details(out_dir: Path) -> Optional[Model]:
     """
     Loads the detail from the local json index
     """
@@ -99,7 +99,10 @@ def load_snapshot_details(out_Dir: Path) -> Optional[Model]:
     if existing_index.exists():
         with open(existing_index, 'r', encoding='utf-8') as f:
             try:
-                return pyjson.load(f)
+                output = pyjson.load(f)
+                if "history" not in output.keys():
+                    output["history"] = {}
+                return output
             except pyjson.JSONDecodeError:
                 pass
     return None
@@ -109,7 +112,7 @@ def load_snapshot_details(out_Dir: Path) -> Optional[Model]:
 def parse_json_snapshot_details(out_dir: Union[Path, str]) -> Iterator[dict]:
     """read through all the archive data folders and return the parsed links"""
 
-    for entry in os.scandir(Path(out_dir) / ARCHIVE_DIR_NAME):
+    for entry in os.scandir(Path(out_dir)):
         if entry.is_dir(follow_symlinks=True):
             if (Path(entry.path) / 'index.json').exists():
                 try:
diff --git a/archivebox/main.py b/archivebox/main.py
index f7dafb5d..bb6d348d 100644
--- a/archivebox/main.py
+++ b/archivebox/main.py
@@ -9,7 +9,7 @@ from datetime import date
 from typing import Dict, List, Optional, Iterable, IO, Union
 
 from crontab import CronTab, CronSlices
-from django.db.models import QuerySet
+from django.db.models import QuerySet, Model
 
 from .cli import (
     list_subcommands,
@@ -689,15 +689,16 @@ def update(resume: Optional[float]=None,
            extractors: str="",
            out_dir: Path=OUTPUT_DIR) -> List[Link]:
     """Import any new links from subscriptions and retry any previously failed/skipped links"""
+    from core.models import Snapshot
 
     check_data_folder(out_dir=out_dir)
     check_dependencies()
-    new_links: List[Link] = [] # TODO: Remove input argument: only_new
+    new_links: List[Snapshot] = [] # TODO: Remove input argument: only_new
 
     extractors = extractors.split(",") if extractors else []
 
     # Step 1: Filter for selected_links
-    matching_snapshots = list_links(
+    matching_snapshots = list_snapshots(
         filter_patterns=filter_patterns,
         filter_type=filter_type,
         before=before,
@@ -705,15 +706,15 @@ def update(resume: Optional[float]=None,
     )
 
     matching_folders = list_folders(
-        links=matching_snapshots,
+        snapshots=matching_snapshots,
         status=status,
         out_dir=out_dir,
     )
     all_links = [link for link in matching_folders.values() if link]
 
     if index_only:
-        for link in all_links:
-            write_snapshot_details(link, out_dir=out_dir, skip_sql_index=True)
+        for snapshot in all_links:
+            write_snapshot_details(snapshot, out_dir=out_dir, skip_sql_index=True)
         index_links(all_links, out_dir=out_dir)
         return all_links
 
@@ -797,7 +798,7 @@ def list_all(filter_patterns_str: Optional[str]=None,
 
 
 @enforce_types
-def list_links(snapshots: Optional[QuerySet]=None,
+def list_snapshots(snapshots: Optional[QuerySet]=None,
                filter_patterns: Optional[List[str]]=None,
               filter_type: str='exact',
               after: Optional[float]=None,
@@ -820,9 +821,9 @@ def list_links(snapshots: Optional[QuerySet]=None,
     return all_snapshots
 
 @enforce_types
-def list_folders(links: List[Link],
+def list_folders(snapshots: List[Model],
                  status: str,
-                 out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+                 out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
 
     check_data_folder(out_dir=out_dir)
 
@@ -840,7 +841,7 @@ def list_folders(links: List[Link],
     }
 
     try:
-        return STATUS_FUNCTIONS[status](links, out_dir=out_dir)
+        return STATUS_FUNCTIONS[status](snapshots, out_dir=out_dir)
     except KeyError:
         raise ValueError('Status not recognized.')
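
Reviewer note: below is a minimal sketch (not part of the patch) of how the renamed detail-index helpers are expected to fit together after this change. It assumes an initialized ArchiveBox data directory with at least one snapshot and a configured Django environment (e.g. run inside `archivebox shell`); the variable names are illustrative only, and the import paths simply follow the package layout touched by this diff.

    from pathlib import Path

    from core.models import Snapshot
    from archivebox.index import load_snapshot_details
    from archivebox.index.json import load_json_snapshot_details

    snapshot = Snapshot.objects.first()      # any snapshot already in the SQL index
    out_dir = Path(snapshot.snapshot_dir)    # its data dir under archive/<timestamp>/

    # Reads index.json from the data dir; with this patch the result is either
    # None or a dict that is guaranteed to contain a "history" key.
    details = load_json_snapshot_details(out_dir)
    if details is not None:
        assert "history" in details

    # Higher-level helper from archivebox/index/__init__.py: loads any on-disk
    # details and merges them into the given snapshot.
    merged = load_snapshot_details(snapshot, out_dir=out_dir)

This mirrors the call sequence that archive_snapshot() in archivebox/extractors/__init__.py now uses, where the pre-existing detail index is loaded with load_snapshot_details(snapshot) instead of being wrapped in list(...).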