#!/usr/bin/env python3

__package__ = 'archivebox.cli'
__command__ = 'archivebox list'
__description__ = 'List all the URLs currently in the archive.'

import sys
import argparse

from ..legacy.util import SmartFormatter, reject_stdin, to_json, to_csv
from ..legacy.config import check_data_folder, OUTPUT_DIR
from ..legacy.main import (
    list_archive_data,
    get_indexed_folders,
    get_archived_folders,
    get_unarchived_folders,
    get_present_folders,
    get_valid_folders,
    get_invalid_folders,
    get_duplicate_folders,
    get_orphaned_folders,
    get_corrupted_folders,
    get_unrecognized_folders,
)

# Maps each accepted --status value to the function that selects the matching
# folders. Insertion order is the order shown in the CLI usage line, and the
# same table drives both the argparse `choices` and the dispatch in main().
_FOLDER_GETTERS_BY_STATUS = {
    'indexed': get_indexed_folders,
    'archived': get_archived_folders,
    'unarchived': get_unarchived_folders,
    'present': get_present_folders,
    'valid': get_valid_folders,
    'invalid': get_invalid_folders,
    'duplicate': get_duplicate_folders,
    'orphaned': get_orphaned_folders,
    'corrupted': get_corrupted_folders,
    'unrecognized': get_unrecognized_folders,
}


def main(args=None):
    """Entry point for the `archivebox list` subcommand.

    Parses the command line, loads the links matching the given filter
    patterns / time bounds, optionally sorts them, narrows them down to the
    folders with the requested status, and prints the result as CSV, JSON or
    plain `folder link` lines.

    Args:
        args: Argument list to parse; defaults to ``sys.argv[1:]`` when None.

    Raises:
        SystemExit: always. The exit status is 1 when no folders matched and
            0 otherwise (argparse itself exits earlier on --help or on a
            usage error).
    """
    args = sys.argv[1:] if args is None else args

    parser = argparse.ArgumentParser(
        prog=__command__,
        description=__description__,
        add_help=True,
        formatter_class=SmartFormatter,
    )

    # --csv and --json select the output format and cannot be combined.
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        '--csv', #'-c',
        type=str,
        help="Print the output in CSV format with the given columns, e.g.: timestamp,url,extension",
        default=None,
    )
    group.add_argument(
        '--json', #'-j',
        action='store_true',
        help="Print the output in JSON format with all columns included.",
    )
    parser.add_argument(
        '--sort', #'-s',
        type=str,
        help="List the links sorted using the given key, e.g. timestamp or updated.",
        default=None,
    )
    parser.add_argument(
        '--before', #'-b',
        type=float,
        help="List only links bookmarked before the given timestamp.",
        default=None,
    )
    parser.add_argument(
        '--after', #'-a',
        type=float,
        help="List only links bookmarked after the given timestamp.",
        default=None,
    )
    parser.add_argument(
        '--status',
        type=str,
        choices=tuple(_FOLDER_GETTERS_BY_STATUS),
        default='indexed',
        help=(
            'List only links or data directories that have the given status\n'
            f' indexed {get_indexed_folders.__doc__} (the default)\n'
            f' archived {get_archived_folders.__doc__}\n'
            f' unarchived {get_unarchived_folders.__doc__}\n'
            '\n'
            f' present {get_present_folders.__doc__}\n'
            f' valid {get_valid_folders.__doc__}\n'
            f' invalid {get_invalid_folders.__doc__}\n'
            '\n'
            f' duplicate {get_duplicate_folders.__doc__}\n'
            f' orphaned {get_orphaned_folders.__doc__}\n'
            f' corrupted {get_corrupted_folders.__doc__}\n'
            f' unrecognized {get_unrecognized_folders.__doc__}\n'
        )
    )
    parser.add_argument(
        '--filter-type',
        type=str,
        choices=('exact', 'substring', 'domain', 'regex'),
        default='exact',
        help='Type of pattern matching to use when filtering URLs',
    )
    parser.add_argument(
        'patterns',
        nargs='*',
        type=str,
        default=None,
        help='List only URLs matching these filter patterns.'
    )

    # Parse first so that --help and usage errors are always reported by
    # argparse, even when the data-folder check below would abort.
    command = parser.parse_args(args)

    # NOTE(review): both helpers are defined outside this file; presumably
    # check_data_folder() aborts when the working directory is not a valid
    # archive and reject_stdin() refuses piped input -- confirm in ..legacy.
    check_data_folder()
    reject_stdin(__command__)

    links = list_archive_data(
        filter_patterns=command.patterns,
        filter_type=command.filter_type,
        before=command.before,
        after=command.after,
    )

    if command.sort:
        # getattr() has no default here, so a sort key that the link objects
        # do not have raises AttributeError.
        links = sorted(links, key=lambda link: getattr(link, command.sort))
    else:
        links = list(links)

    # argparse has already restricted command.status to the table's keys.
    get_folders = _FOLDER_GETTERS_BY_STATUS[command.status]
    folders = get_folders(links, out_dir=OUTPUT_DIR)

    if command.csv:
        print(to_csv(folders.values(), csv_cols=command.csv.split(','), header=True))
    elif command.json:
        print(to_json(folders.values(), indent=4, sort_keys=True))
    else:
        print('\n'.join(f'{folder} {link}' for folder, link in folders.items()))

    # Exit status 1 signals "nothing matched" to calling scripts, 0 otherwise.
    raise SystemExit(not folders)


if __name__ == '__main__':
    main()