From 9fdcb9857ec58aa309be74aef0e7d723f92d3527 Mon Sep 17 00:00:00 2001
From: Cristian
Date: Thu, 31 Dec 2020 13:21:40 -0500
Subject: [PATCH] refactor: remove command functional

---
 archivebox/index/__init__.py | 15 +++++++--------
 archivebox/logging_util.py   | 30 +++++++++++++++---------------
 archivebox/main.py           | 18 +++++++++---------
 3 files changed, 31 insertions(+), 32 deletions(-)

diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py
index fb6ffe7d..aff1d36b 100644
--- a/archivebox/index/__init__.py
+++ b/archivebox/index/__init__.py
@@ -43,7 +43,6 @@ from .html import (
 )
 from .json import (
     load_json_snapshot,
-    parse_json_snapshot_details,
     write_json_snapshot_details,
 )
 from .sql import (
@@ -321,7 +320,7 @@ def load_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) -> Model
     """
     out_dir = out_dir or Path(snapshot.snapshot_dir)
 
-    existing_snapshot = load_json_snapshot_details(out_dir)
+    existing_snapshot = load_json_snapshot_details(Path(out_dir))
     if existing_snapshot:
         return merge_snapshots(existing_snapshot, snapshot)
 
@@ -402,7 +401,7 @@ def get_present_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option
         if entry.is_dir():
             snapshot = None
             try:
-                snapshot = parse_json_snapshot_details(entry.path)
+                snapshot = load_json_snapshot(Path(entry.path))
             except Exception:
                 pass
 
@@ -441,7 +440,7 @@ def get_duplicate_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Opti
             path = path.snapshot_dir
 
         try:
-            snapshot = load_json_snapshot_details(path)
+            snapshot = load_json_snapshot(Path(path))
         except Exception:
             pass
 
@@ -465,7 +464,7 @@ def get_orphaned_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optio
         if entry.is_dir():
             snapshot = None
             try:
-                snapshot = parse_json_snapshot_details(str(entry))
+                snapshot = load_json_snapshot(Path(entry))
             except Exception:
                 pass
 
@@ -492,7 +491,7 @@ def get_unrecognized_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, O
         index_exists = (entry / "index.json").exists()
         snapshot = None
         try:
-            snapshot = parse_json_snapshot_details(str(entry))
+            snapshot = load_json_snapshot(entry)
         except KeyError:
             # Try to fix index
             if index_exists:
@@ -562,13 +561,13 @@ def fix_invalid_folder_locations(out_dir: Path=OUTPUT_DIR) -> Tuple[List[str], L
         if entry.is_dir(follow_symlinks=True):
             if (Path(entry.path) / 'index.json').exists():
                 try:
-                    snapshot = parse_json_snapshot_details(entry.path)
+                    snapshot = load_json_snapshot(Path(entry.path))
                 except KeyError:
                     snapshot = None
                 if not snapshot:
                     continue
 
-                if not entry.path.endswith(f'/{link.timestamp}'):
+                if not entry.path.endswith(f'/{snapshot.timestamp}'):
                     dest = out_dir / ARCHIVE_DIR_NAME / snapshot.timestamp
                     if dest.exists():
                         cant_fix.append(entry.path)
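Note on the Path(...) wrapping introduced above: every call site that previously passed a plain str (entry.path, str(entry)) now hands load_json_snapshot a pathlib.Path. A minimal sketch of the shape this assumes for load_json_snapshot (hypothetical body; the real implementation lives in archivebox/index/json.py and is not part of this patch):

    import json
    from pathlib import Path
    from typing import Any, Optional

    def load_json_snapshot(out_dir: Path) -> Optional[Any]:
        # Sketch only: the index path is composed with the `/` operator,
        # which exists on Path but not on str -- hence Path(...) at call sites.
        index_path = out_dir / 'index.json'
        if not index_path.exists():
            return None
        with index_path.open() as f:
            return json.load(f)

Path(entry) also stays safe when entry is an os.DirEntry rather than a Path, since DirEntry implements the fspath protocol.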
diff --git a/archivebox/logging_util.py b/archivebox/logging_util.py
index f2b86735..2f564e6b 100644
--- a/archivebox/logging_util.py
+++ b/archivebox/logging_util.py
@@ -395,49 +395,49 @@ def log_list_started(filter_patterns: Optional[List[str]], filter_type: str):
     ))
     print('    {}'.format(' '.join(filter_patterns or ())))
 
-def log_list_finished(links):
-    from .index.csv import links_to_csv
+def log_list_finished(snapshots):
+    from .index.csv import snapshots_to_csv
     print()
     print('---------------------------------------------------------------------------------------------------')
-    print(links_to_csv(links, cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | '))
+    print(snapshots_to_csv(snapshots, cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | '))
     print('---------------------------------------------------------------------------------------------------')
     print()
 
-def log_removal_started(links: List["Link"], yes: bool, delete: bool):
-    print('{lightyellow}[i] Found {} matching URLs to remove.{reset}'.format(len(links), **ANSI))
+def log_removal_started(snapshots: List["Snapshot"], yes: bool, delete: bool):
+    print('{lightyellow}[i] Found {} matching URLs to remove.{reset}'.format(len(snapshots), **ANSI))
     if delete:
-        file_counts = [link.num_outputs for link in links if Path(link.link_dir).exists()]
+        file_counts = [snapshot.num_outputs for snapshot in snapshots if Path(snapshot.snapshot_dir).exists()]
         print(
-            f'    {len(links)} Links will be de-listed from the main index, and their archived content folders will be deleted from disk.\n'
+            f'    {len(snapshots)} Snapshots will be de-listed from the main index, and their archived content folders will be deleted from disk.\n'
             f'    ({len(file_counts)} data folders with {sum(file_counts)} archived files will be deleted!)'
         )
     else:
         print(
-            '    Matching links will be de-listed from the main index, but their archived content folders will remain in place on disk.\n'
+            '    Matching snapshots will be de-listed from the main index, but their archived content folders will remain in place on disk.\n'
             '    (Pass --delete if you also want to permanently delete the data folders)'
         )
 
     if not yes:
         print()
-        print('{lightyellow}[?] Do you want to proceed with removing these {} links?{reset}'.format(len(links), **ANSI))
+        print('{lightyellow}[?] Do you want to proceed with removing these {} snapshots?{reset}'.format(len(snapshots), **ANSI))
         try:
             assert input('    y/[n]: ').lower() == 'y'
         except (KeyboardInterrupt, EOFError, AssertionError):
             raise SystemExit(0)
 
-def log_removal_finished(all_links: int, to_remove: int):
-    if all_links == 0:
+def log_removal_finished(all_snapshots: int, to_remove: int):
+    if to_remove == 0:
         print()
-        print('{red}[X] No matching links found.{reset}'.format(**ANSI))
+        print('{red}[X] No matching snapshots found.{reset}'.format(**ANSI))
     else:
         print()
-        print('{red}[√] Removed {} out of {} links from the archive index.{reset}'.format(
+        print('{red}[√] Removed {} out of {} snapshots from the archive index.{reset}'.format(
             to_remove,
-            all_links,
+            all_snapshots,
             **ANSI,
         ))
-        print('    Index now contains {} links.'.format(all_links - to_remove))
+        print('    Index now contains {} snapshots.'.format(all_snapshots - to_remove))
 
 
 def log_shell_welcome_msg():
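The log_removal_finished change above is behavioral, not just a rename: the empty-result branch now keys off to_remove instead of the total index size, so filtering a non-empty index with zero matches prints the "[X] No matching snapshots" message instead of falling through to the removal summary. A rough standalone equivalent of the corrected branching (illustrative names, not archivebox APIs):

    def removal_summary(all_snapshots: int, to_remove: int) -> str:
        # Branch on the number of matches, not on the index size.
        if to_remove == 0:
            return '[X] No matching snapshots found.'
        return ('[√] Removed {} out of {} snapshots; index now contains {}.'
                .format(to_remove, all_snapshots, all_snapshots - to_remove))

    assert removal_summary(all_snapshots=100, to_remove=0).startswith('[X]')
    assert 'contains 97' in removal_summary(all_snapshots=100, to_remove=3)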
diff --git a/archivebox/main.py b/archivebox/main.py
index f33fabd8..5dc9484f 100644
--- a/archivebox/main.py
+++ b/archivebox/main.py
@@ -336,7 +336,7 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
     print('{green}[*] Collecting links from any existing indexes and archive folders...{reset}'.format(**ANSI))
 
     all_links = Snapshot.objects.none()
-    pending_links: Dict[str, Link] = {}
+    pending_snapshots: Dict[str, Snapshot] = {}
 
     if existing_index:
         all_snapshots = load_main_index(out_dir=out_dir, warn=False)
@@ -363,10 +363,10 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
         orphaned_data_dir_snapshots = {
             snapshot.url: snapshot
             for snapshot in parse_json_snapshot_details(out_dir)
-            if not all_snapshots.filter(url=link.url).exists()
+            if not all_snapshots.filter(url=snapshot.url).exists()
         }
         if orphaned_data_dir_snapshots:
-            pending_snapshots.update(orphaned_data_dir_links)
+            pending_snapshots.update(orphaned_data_dir_snapshots)
             print('    {lightyellow}√ Added {} orphaned snapshots from existing archive directories.{reset}'.format(len(orphaned_data_dir_snapshots), **ANSI))
 
     # Links in invalid/duplicate data dirs
@@ -383,7 +383,7 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
             print('        archivebox list --status=invalid')
 
 
-    write_main_index(list(pending_links.values()), out_dir=out_dir)
+    write_main_index(list(pending_snapshots.values()), out_dir=out_dir)
 
     print('\n{green}------------------------------------------------------------------{reset}'.format(**ANSI))
     if existing_index:
@@ -656,24 +656,24 @@ def remove(filter_str: Optional[str]=None,
         raise SystemExit(1)
 
-    log_links = [link.as_link() for link in snapshots]
-    log_list_finished(log_links)
-    log_removal_started(log_links, yes=yes, delete=delete)
+    log_list_finished(snapshots)
+    log_removal_started(snapshots, yes=yes, delete=delete)
 
     timer = TimedProgress(360, prefix='      ')
     try:
         for snapshot in snapshots:
             if delete:
-                shutil.rmtree(snapshot.as_link().link_dir, ignore_errors=True)
+                shutil.rmtree(snapshot.snapshot_dir, ignore_errors=True)
     finally:
         timer.end()
 
     to_remove = snapshots.count()
+    all_snapshots_count = load_main_index(out_dir=out_dir).count()
 
     flush_search_index(snapshots=snapshots)
     remove_from_sql_main_index(snapshots=snapshots, out_dir=out_dir)
     all_snapshots = load_main_index(out_dir=out_dir)
-    log_removal_finished(all_snapshots.count(), to_remove)
+    log_removal_finished(all_snapshots_count, to_remove)
 
     return all_snapshots
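The reordering at the end of remove() above matters for the arithmetic in log_removal_finished: the index must be counted before rows are deleted, otherwise "Removed X out of Y" reports the already-reduced total as Y. A rough standalone model of the corrected flow, with a plain list standing in for the SQL index (illustrative names, not archivebox APIs):

    def remove_flow(index: list, matching: list) -> tuple:
        to_remove = len(matching)
        all_snapshots_count = len(index)   # count BEFORE mutating the index
        for snapshot in matching:
            index.remove(snapshot)
        # these two numbers feed log_removal_finished(...)
        return all_snapshots_count, to_remove

    index = ['a', 'b', 'c', 'd']
    assert remove_flow(index, ['b', 'c']) == (4, 2)   # "Removed 2 out of 4 snapshots"
    assert len(index) == 2                            # "Index now contains 2 snapshots."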