
add archivebox info command to scan data dir

Nick Sweeting 2019-04-22 14:34:30 -04:00
parent 50b947f41d
commit ab68819332
4 changed files with 105 additions and 3 deletions

View file

@@ -0,0 +1,28 @@
#!/usr/bin/env python3

__package__ = 'archivebox.cli'
__command__ = 'archivebox info'
__description__ = 'Print out some info and statistics about the archive collection'

import sys
import argparse

from ..legacy.main import info
from ..legacy.util import reject_stdin


def main(args=None):
    args = sys.argv[1:] if args is None else args

    parser = argparse.ArgumentParser(
        prog=__command__,
        description=__description__,
        add_help=True,
    )
    parser.parse_args(args)
    reject_stdin(__command__)

    info()


if __name__ == '__main__':
    main()
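A quick way to sanity-check the new entrypoint is to call main() directly; this is a hypothetical smoke test (not part of the commit), with the module path inferred from __package__ and __command__ above:

# hypothetical smoke test, assumes archivebox is importable and a collection exists
from archivebox.cli.archivebox_info import main

main([])          # same as running `archivebox info`: no flags are defined, prints collection stats
main(['--help'])  # argparse prints the usage text, then raises SystemExit(0)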

View file

@@ -5,7 +5,12 @@ import shutil
from typing import List, Optional, Iterable

from .schema import Link
-from .util import enforce_types, TimedProgress
+from .util import (
+    enforce_types,
+    TimedProgress,
+    get_dir_size,
+    human_readable_size,
+)
from .index import (
    links_after_timestamp,
    load_main_index,
@@ -119,6 +124,47 @@ def init():
    print('        archivebox help')


@enforce_types
def info():
    all_links = load_main_index(out_dir=OUTPUT_DIR)

    print('{green}[*] Scanning archive collection main index with {} links:{reset}'.format(len(all_links), **ANSI))
    print(f'    {OUTPUT_DIR}')
    num_bytes, num_dirs, num_files = get_dir_size(OUTPUT_DIR, recursive=False)
    size = human_readable_size(num_bytes)
    print(f'    > Index Size: {size} across {num_files} files')
    print()

    print('{green}[*] Scanning archive collection data directory with {} entries:{reset}'.format(len(all_links), **ANSI))
    print(f'    {ARCHIVE_DIR}')
    num_bytes, num_dirs, num_files = get_dir_size(ARCHIVE_DIR)
    size = human_readable_size(num_bytes)
    print(f'    > Total Size: {size} across {num_files} files in {num_dirs} directories')
    print()

    link_data_dirs = {link.link_dir for link in all_links}
    valid_archive_dirs = set()
    num_invalid = 0
    for entry in os.scandir(ARCHIVE_DIR):
        if entry.is_dir(follow_symlinks=True):
            if os.path.exists(os.path.join(entry.path, 'index.json')):
                valid_archive_dirs.add(entry.path)
            else:
                num_invalid += 1

    print(f'    > {len(valid_archive_dirs)} valid archive data directories (valid directories matched to links in the index)')

    num_unarchived = sum(1 for link in all_links if link.link_dir not in valid_archive_dirs)
    print(f'    > {num_unarchived} missing data directories (directories missing for links in the index)')

    print(f'    > {num_invalid} invalid data directories (directories present that don\'t contain an index file)')

    num_orphaned = sum(1 for data_dir in valid_archive_dirs if data_dir not in link_data_dirs)
    print(f'    > {num_orphaned} orphaned data directories (directories present for links that don\'t exist in the index)')


@enforce_types
def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]=None, only_new: bool=False) -> List[Link]:
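The missing/orphaned counts in info() above boil down to set differences between the index and the filesystem; a toy illustration with hypothetical paths:

indexed  = {'/data/archive/1555000000', '/data/archive/1555000001'}  # link.link_dir for each link
on_disk  = {'/data/archive/1555000001', '/data/archive/1555000002'}  # subdirs containing index.json

missing  = indexed - on_disk   # in the index, no data dir on disk  -> {'.../1555000000'}
orphaned = on_disk - indexed   # on disk, not in the index          -> {'.../1555000002'}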

View file

@@ -27,7 +27,6 @@ MAIN_INDEX_HEADER = {
    'copyright_info': FOOTER_INFO,
    'meta': {
        'project': 'ArchiveBox',
-        'cmd': sys.argv,
        'version': VERSION,
        'git_sha': GIT_SHA,
        'website': 'https://ArchiveBox.io',
@@ -72,6 +71,7 @@ def write_json_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
        **MAIN_INDEX_HEADER,
        'num_links': len(links),
        'updated': datetime.now(),
+        'last_run_cmd': sys.argv,
        'links': links,
    }
    atomic_write(main_index_json, os.path.join(out_dir, JSON_INDEX_FILENAME))
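The net effect of this pair of hunks: MAIN_INDEX_HEADER is a module-level constant, so a 'cmd' key inside it would capture sys.argv once at import time, while setting it inside write_json_main_index records the command of each individual run. A minimal sketch of the difference (illustrative names, not the real module):

import sys

HEADER = {'cmd': sys.argv}         # evaluated once, at import time

def write_index(links):
    return {
        **HEADER,
        'last_run_cmd': sys.argv,  # evaluated on every write
        'links': links,
    }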

View file

@@ -7,7 +7,7 @@ import shutil
from string import Template
from json import JSONEncoder
-from typing import List, Optional, Any, Union, IO, Mapping
+from typing import List, Optional, Any, Union, IO, Mapping, Tuple
from inspect import signature
from functools import wraps
from hashlib import sha256
@@ -561,6 +561,34 @@ def copy_and_overwrite(from_path: str, to_path: str):
    with open(from_path, 'rb') as src:
        atomic_write(src.read(), to_path)


@enforce_types
def get_dir_size(path: str, recursive: bool=True) -> Tuple[int, int, int]:
    num_bytes, num_dirs, num_files = 0, 0, 0
    for entry in os.scandir(path):
        if entry.is_dir(follow_symlinks=False):
            if not recursive:
                continue
            num_dirs += 1
            bytes_inside, dirs_inside, files_inside = get_dir_size(entry.path)
            num_bytes += bytes_inside
            num_dirs += dirs_inside
            num_files += files_inside
        else:
            num_bytes += entry.stat(follow_symlinks=False).st_size
            num_files += 1
    return num_bytes, num_dirs, num_files


@enforce_types
def human_readable_size(num_bytes: Union[int, float]) -> str:
    for count in ['Bytes', 'KB', 'MB', 'GB']:
        if num_bytes > -1024.0 and num_bytes < 1024.0:
            return '%3.1f%s' % (num_bytes, count)
        num_bytes /= 1024.0
    return '%3.1f%s' % (num_bytes, 'TB')


@enforce_types
def chrome_args(**options) -> List[str]:
    """helper to build up a chrome shell command with arguments"""