
refactor: status command is functional

Cristian 2020-12-30 12:53:20 -05:00
parent d92083b928
commit 973f8b6abc
5 changed files with 33 additions and 26 deletions

View file

@@ -138,7 +138,7 @@ class Snapshot(models.Model):
     @cached_property
     def snapshot_dir(self):
         from ..config import CONFIG
-        return str(Path(CONFIG['ARCHIVE_DIR']) / self.timestamp)
+        return Path(CONFIG['ARCHIVE_DIR']) / self.timestamp

     @cached_property
     def archive_path(self):
@@ -173,6 +173,12 @@ class Snapshot(models.Model):
         from ..util import is_static_file
         return is_static_file(self.url)

+    @cached_property
+    def details(self) -> Dict:
+        # TODO: Define what details are, and return them accordingly
+        return {"history": {}}
+
     def canonical_outputs(self) -> Dict[str, Optional[str]]:
         """predict the expected output paths that should be present after archiving"""

View file

@@ -88,7 +88,7 @@ def archive_snapshot(snapshot: Model, overwrite: bool=False, methods: Optional[I
         details = {"history": {}}
         write_snapshot_details(snapshot, out_dir=out_dir, skip_sql_index=False)
     else:
-        details = load_snapshot_details(snapshot)
+        details = snapshot.details

     #log_link_archiving_started(link, out_dir, is_new)
     stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
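
The effect of the one-line change above: the else branch no longer re-reads and re-parses index.json on every call, it reads the model's cached property instead. A toy, self-contained comparison of the two access patterns (all names here are assumed, for illustration only):

    import json
    from functools import cached_property
    from pathlib import Path
    from tempfile import mkdtemp

    out_dir = Path(mkdtemp())
    (out_dir / 'index.json').write_text(json.dumps({"history": {}}))


    def load_snapshot_details_old(snapshot_dir: Path) -> dict:
        # old path: every call pays for an open() plus a JSON parse
        return json.loads((snapshot_dir / 'index.json').read_text())


    class Snap:
        # toy stand-in for core.models.Snapshot
        @cached_property
        def details(self) -> dict:
            # new path: placeholder details, cached after first access, no disk read
            return {"history": {}}


    assert load_snapshot_details_old(out_dir) == Snap().details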

View file

@@ -42,7 +42,7 @@ from .html import (
     write_html_snapshot_details,
 )
 from .json import (
-    load_json_snapshot_details,
+    load_json_snapshot,
     parse_json_snapshot_details,
     write_json_snapshot_details,
 )
@@ -441,7 +441,7 @@ def get_duplicate_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Opti
             path = path.snapshot_dir
         try:
-            snapshot = parse_json_snapshot_details(path)
+            snapshot = load_json_snapshot_details(path)
         except Exception:
             pass
@@ -530,7 +530,7 @@ def is_valid(snapshot: Model) -> bool:
     if dir_exists and index_exists:
         try:
             # TODO: review if the `guess` was necessary here
-            parsed_snapshot = parse_json_snapshot_details(snapshot.snapshot_dir)
+            parsed_snapshot = load_json_snapshot(snapshot.snapshot_dir)
             return snapshot.url == parsed_snapshot.url
         except Exception:
             pass
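
is_valid() now round-trips through the per-folder JSON index: a data directory only counts as valid if its index.json loads into a Snapshot whose URL matches the database record. A condensed, self-contained restatement of that check (the stub loader stands in for the real load_json_snapshot):

    from types import SimpleNamespace


    def load_json_snapshot(snapshot_dir):
        # stub standing in for the real loader; returns a model or None
        return SimpleNamespace(url='https://example.com')


    def is_valid_sketch(snapshot) -> bool:
        # restatement of the is_valid() logic from the diff above
        try:
            parsed = load_json_snapshot(snapshot.snapshot_dir)
            return parsed is not None and snapshot.url == parsed.url
        except Exception:
            return False


    snap = SimpleNamespace(url='https://example.com', snapshot_dir='archive/1609346000')
    assert is_valid_sketch(snap)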

View file

@@ -91,17 +91,18 @@ def write_json_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) ->
 @enforce_types
-def load_json_snapshot_details(out_dir: Path) -> Optional[Model]:
+def load_json_snapshot(out_dir: Path) -> Optional[Model]:
     """
     Loads the detail from the local json index
     """
+    from core.models import Snapshot
+
     existing_index = Path(out_dir) / JSON_INDEX_FILENAME
     if existing_index.exists():
         with open(existing_index, 'r', encoding='utf-8') as f:
             try:
                 output = pyjson.load(f)
-                if "history" not in output.keys():
-                    output["history"] = {}
+                output = Snapshot.from_json(output)
                 return output
             except pyjson.JSONDecodeError:
                 pass
@@ -110,13 +111,13 @@ def load_json_snapshot_details(out_dir: Path) -> Optional[Model]:
 @enforce_types
 def parse_json_snapshot_details(out_dir: Union[Path, str]) -> Iterator[dict]:
-    """read through all the archive data folders and return the parsed links"""
+    """read through all the archive data folders and return the parsed snapshots"""

     for entry in os.scandir(Path(out_dir)):
         if entry.is_dir(follow_symlinks=True):
             if (Path(entry.path) / 'index.json').exists():
                 try:
-                    snapshot_details = load_snapshot_details(entry.path)
+                    snapshot_details = load_json_snapshot_details(entry.path)
                 except KeyError:
                     snapshot_details = None
                 if snapshot_details:
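
load_json_snapshot now delegates all normalization to Snapshot.from_json, which this commit calls but does not define here. A plausible minimal shape for it, assuming dict-to-model hydration plus the history defaulting that the deleted lines used to do (field names and behavior are guesses, not confirmed by the commit):

    from typing import Dict


    class Snapshot:
        # minimal stand-in; the real Snapshot is a Django model in core.models
        def __init__(self, **kwargs):
            self.__dict__.update(kwargs)

        @classmethod
        def from_json(cls, info: Dict) -> 'Snapshot':
            # hypothetical: keep only known fields, preserve the old history defaulting
            info.setdefault('history', {})
            known = {'url', 'timestamp', 'history'}  # assumed field set
            return cls(**{k: v for k, v in info.items() if k in known})


    snap = Snapshot.from_json({'url': 'https://example.com', 'timestamp': '1609346000'})
    assert snap.history == {}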

View file

@@ -427,11 +427,11 @@ def status(out_dir: Path=OUTPUT_DIR) -> None:
     print(f'    Index size: {size} across {num_files} files')
     print()

-    links = load_main_index(out_dir=out_dir)
-    num_sql_links = links.count()
-    num_link_details = sum(1 for link in parse_json_links_details(out_dir=out_dir))
-    print(f'    > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {SQL_INDEX_FILENAME})')
-    print(f'    > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR_NAME}/*/index.json)')
+    snapshots = load_main_index(out_dir=out_dir)
+    num_sql_snapshots = snapshots.count()
+    num_snapshot_details = sum(1 for snapshot in parse_json_snapshot_details(out_dir=out_dir))
+    print(f'    > SQL Main Index: {num_sql_snapshots} snapshots'.ljust(36), f'(found in {SQL_INDEX_FILENAME})')
+    print(f'    > JSON Link Details: {num_snapshot_details} snapshots'.ljust(36), f'(found in {ARCHIVE_DIR_NAME}/*/index.json)')
     print()
     print('{green}[*] Scanning archive data directories...{reset}'.format(**ANSI))
     print(ANSI['lightyellow'], f'   {ARCHIVE_DIR}/*', ANSI['reset'])
@@ -439,23 +439,23 @@ def status(out_dir: Path=OUTPUT_DIR) -> None:
     size = printable_filesize(num_bytes)
     print(f'    Size: {size} across {num_files} files in {num_dirs} directories')
     print(ANSI['black'])
-    num_indexed = len(get_indexed_folders(links, out_dir=out_dir))
-    num_archived = len(get_archived_folders(links, out_dir=out_dir))
-    num_unarchived = len(get_unarchived_folders(links, out_dir=out_dir))
+    num_indexed = len(get_indexed_folders(snapshots, out_dir=out_dir))
+    num_archived = len(get_archived_folders(snapshots, out_dir=out_dir))
+    num_unarchived = len(get_unarchived_folders(snapshots, out_dir=out_dir))
     print(f'    > indexed: {num_indexed}'.ljust(36), f'({get_indexed_folders.__doc__})')
     print(f'    > archived: {num_archived}'.ljust(36), f'({get_archived_folders.__doc__})')
     print(f'    > unarchived: {num_unarchived}'.ljust(36), f'({get_unarchived_folders.__doc__})')

-    num_present = len(get_present_folders(links, out_dir=out_dir))
-    num_valid = len(get_valid_folders(links, out_dir=out_dir))
+    num_present = len(get_present_folders(snapshots, out_dir=out_dir))
+    num_valid = len(get_valid_folders(snapshots, out_dir=out_dir))
     print()
     print(f'    > present: {num_present}'.ljust(36), f'({get_present_folders.__doc__})')
     print(f'    > valid: {num_valid}'.ljust(36), f'({get_valid_folders.__doc__})')

-    duplicate = get_duplicate_folders(links, out_dir=out_dir)
-    orphaned = get_orphaned_folders(links, out_dir=out_dir)
-    corrupted = get_corrupted_folders(links, out_dir=out_dir)
-    unrecognized = get_unrecognized_folders(links, out_dir=out_dir)
+    duplicate = get_duplicate_folders(snapshots, out_dir=out_dir)
+    orphaned = get_orphaned_folders(snapshots, out_dir=out_dir)
+    corrupted = get_corrupted_folders(snapshots, out_dir=out_dir)
+    unrecognized = get_unrecognized_folders(snapshots, out_dir=out_dir)
     num_invalid = len({**duplicate, **orphaned, **corrupted, **unrecognized})
     print(f'    > invalid: {num_invalid}'.ljust(36), f'({get_invalid_folders.__doc__})')
     print(f'    > duplicate: {len(duplicate)}'.ljust(36), f'({get_duplicate_folders.__doc__})')
@@ -466,7 +466,7 @@ def status(out_dir: Path=OUTPUT_DIR) -> None:
     print(ANSI['reset'])

     if num_indexed:
-        print('    {lightred}Hint:{reset} You can list link data directories by status like so:'.format(**ANSI))
+        print('    {lightred}Hint:{reset} You can list snapshot data directories by status like so:'.format(**ANSI))
         print('        archivebox list --status=<status>   (e.g. indexed, corrupted, archived, etc.)')

     if orphaned:
@@ -495,7 +495,7 @@ def status(out_dir: Path=OUTPUT_DIR) -> None:
         print('        archivebox manage createsuperuser')

     print()
-    for snapshot in links.order_by('-updated')[:10]:
+    for snapshot in snapshots.order_by('-updated')[:10]:
         if not snapshot.updated:
             continue
         print(
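
One detail in the status() body worth calling out: invalid is not a fifth filesystem scan, it is the deduplicated union of the four problem dicts, merged by folder path. A standalone illustration of that counting trick, using placeholder paths:

    # toy folder-status dicts (folder path -> snapshot or None), placeholder data
    duplicate    = {'archive/111': None, 'archive/222': None}
    orphaned     = {'archive/222': None, 'archive/333': None}  # overlaps duplicate
    corrupted    = {}
    unrecognized = {'archive/444': None}

    # merging the dicts dedupes on folder path, so an overlapping folder counts once
    num_invalid = len({**duplicate, **orphaned, **corrupted, **unrecognized})
    assert num_invalid == 4  # 111, 222, 333, 444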