From ab6881933286a38f28043fe284d0cc53be0773ab Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 22 Apr 2019 14:34:30 -0400 Subject: [PATCH] add archivebox info command to scan data dir --- archivebox/cli/archivebox_info.py | 28 ++++++++++++++++++ archivebox/legacy/main.py | 48 ++++++++++++++++++++++++++++++- archivebox/legacy/storage/json.py | 2 +- archivebox/legacy/util.py | 30 ++++++++++++++++++- 4 files changed, 105 insertions(+), 3 deletions(-) create mode 100644 archivebox/cli/archivebox_info.py diff --git a/archivebox/cli/archivebox_info.py b/archivebox/cli/archivebox_info.py new file mode 100644 index 00000000..38d7eb48 --- /dev/null +++ b/archivebox/cli/archivebox_info.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python3 + +__package__ = 'archivebox.cli' +__command__ = 'archivebox info' +__description__ = 'Print out some info and statistics about the archive collection' + +import sys +import argparse + +from ..legacy.main import info +from ..legacy.util import reject_stdin + + +def main(args=None): + args = sys.argv[1:] if args is None else args + + parser = argparse.ArgumentParser( + prog=__command__, + description=__description__, + add_help=True, + ) + parser.parse_args(args) + reject_stdin(__command__) + + info() + +if __name__ == '__main__': + main() diff --git a/archivebox/legacy/main.py b/archivebox/legacy/main.py index 7296add0..49e4903b 100644 --- a/archivebox/legacy/main.py +++ b/archivebox/legacy/main.py @@ -5,7 +5,12 @@ import shutil from typing import List, Optional, Iterable from .schema import Link -from .util import enforce_types, TimedProgress +from .util import ( + enforce_types, + TimedProgress, + get_dir_size, + human_readable_size, +) from .index import ( links_after_timestamp, load_main_index, @@ -119,6 +124,47 @@ def init(): print(' archivebox help') +@enforce_types +def info(): + all_links = load_main_index(out_dir=OUTPUT_DIR) + + print('{green}[*] Scanning archive collection main index with {} links:{reset}'.format(len(all_links), 
**ANSI)) + print(f' {OUTPUT_DIR}') + + num_bytes, num_dirs, num_files = get_dir_size(OUTPUT_DIR, recursive=False) + size = human_readable_size(num_bytes) + print(f' > Index Size: {size} across {num_files} files in the index') + print() + + print('{green}[*] Scanning archive collection data directory with {} entries:{reset}'.format(len(all_links), **ANSI)) + print(f' {ARCHIVE_DIR}') + + num_bytes, num_dirs, num_files = get_dir_size(ARCHIVE_DIR) + size = human_readable_size(num_bytes) + print(f' > Total Size: {size} across {num_files} files in {num_dirs} directories') + print() + + link_data_dirs = {link.link_dir for link in all_links} + valid_archive_dirs = set() + num_invalid = 0 + for entry in os.scandir(ARCHIVE_DIR): + if entry.is_dir(follow_symlinks=True): + if os.path.exists(os.path.join(entry.path, 'index.json')): + valid_archive_dirs.add(entry.path) + else: + num_invalid += 1 + + print(f' > {len(valid_archive_dirs)} valid archive data directories (valid directories matched to links in the index)') + + num_unarchived = sum(1 for link in all_links if link.link_dir not in valid_archive_dirs) + print(f' > {num_unarchived} missing data directories (directories missing for links in the index)') + + print(f' > {num_invalid} invalid data directories (directories present that don\'t contain an index file)') + + num_orphaned = sum(1 for data_dir in valid_archive_dirs if data_dir not in link_data_dirs) + print(f' > {num_orphaned} orphaned data directories (directories present for links that don\'t exist in the index)') + + @enforce_types def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]=None, only_new: bool=False) -> List[Link]: diff --git a/archivebox/legacy/storage/json.py b/archivebox/legacy/storage/json.py index 183f3975..a6027628 100644 --- a/archivebox/legacy/storage/json.py +++ b/archivebox/legacy/storage/json.py @@ -27,7 +27,6 @@ MAIN_INDEX_HEADER = { 'copyright_info': FOOTER_INFO, 'meta': { 'project': 'ArchiveBox', - 'cmd': sys.argv, 
'version': VERSION, 'git_sha': GIT_SHA, 'website': 'https://ArchiveBox.io', @@ -72,6 +71,7 @@ def write_json_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None: **MAIN_INDEX_HEADER, 'num_links': len(links), 'updated': datetime.now(), + 'last_run_cmd': sys.argv, 'links': links, } atomic_write(main_index_json, os.path.join(out_dir, JSON_INDEX_FILENAME)) diff --git a/archivebox/legacy/util.py b/archivebox/legacy/util.py index c4f14328..e30782fa 100644 --- a/archivebox/legacy/util.py +++ b/archivebox/legacy/util.py @@ -7,7 +7,7 @@ import shutil from string import Template from json import JSONEncoder -from typing import List, Optional, Any, Union, IO, Mapping +from typing import List, Optional, Any, Union, IO, Mapping, Tuple from inspect import signature from functools import wraps from hashlib import sha256 @@ -561,6 +561,34 @@ def copy_and_overwrite(from_path: str, to_path: str): with open(from_path, 'rb') as src: atomic_write(src.read(), to_path) + +@enforce_types +def get_dir_size(path: str, recursive: bool=True) -> Tuple[int, int, int]: + num_bytes, num_dirs, num_files = 0, 0, 0 + for entry in os.scandir(path): + if entry.is_dir(follow_symlinks=False): + if not recursive: + continue + num_dirs += 1 + bytes_inside, dirs_inside, files_inside = get_dir_size(entry.path) + num_bytes += bytes_inside + num_dirs += dirs_inside + num_files += files_inside + else: + num_bytes += entry.stat(follow_symlinks=False).st_size + num_files += 1 + return num_bytes, num_dirs, num_files + + +@enforce_types +def human_readable_size(num_bytes: Union[int, float]) -> str: + for count in ['Bytes','KB','MB','GB']: + if num_bytes > -1024.0 and num_bytes < 1024.0: + return '%3.1f%s' % (num_bytes, count) + num_bytes /= 1024.0 + return '%3.1f%s' % (num_bytes, 'TB') + + @enforce_types def chrome_args(**options) -> List[str]: """helper to build up a chrome shell command with arguments"""