
working consistent list and remove with filtering

This commit is contained in:
Nick Sweeting 2019-04-11 07:00:26 -04:00
parent 4ca9a0beac
commit d8d8f7c2cc
6 changed files with 242 additions and 31 deletions

View file

@@ -5,12 +5,11 @@ __command__ = 'archivebox list'
 __description__ = 'List all the URLs currently in the archive.'
 
 import sys
-import json
 import argparse
 
-from ..legacy.util import reject_stdin, ExtendedEncoder
-from ..legacy.main import list_archive_data, csv_format
+from ..legacy.util import reject_stdin, to_json, to_csv
+from ..legacy.main import list_archive_data
 
 
 def main(args=None):
@@ -33,16 +32,10 @@ def main(args=None):
         action='store_true',
         help="Print the output in JSON format with all columns included.",
     )
-    parser.add_argument(
-        '--filter', #'-f',
-        type=str,
-        help="List only URLs matching the given regex pattern.",
-        default=None,
-    )
     parser.add_argument(
         '--sort', #'-s',
         type=str,
-        help="List the links sorted using the given key, e.g. timestamp or updated",
+        help="List the links sorted using the given key, e.g. timestamp or updated.",
         default=None,
     )
     parser.add_argument(
@@ -57,11 +50,26 @@ def main(args=None):
         help="List only URLs bookmarked after the given timestamp.",
         default=None,
     )
+    parser.add_argument(
+        '--filter-type',
+        type=str,
+        choices=('exact', 'substring', 'domain', 'regex'),
+        default='exact',
+        help='Type of pattern matching to use when filtering URLs',
+    )
+    parser.add_argument(
+        'patterns',
+        nargs='*',
+        type=str,
+        default=None,
+        help='List only URLs matching these filter patterns.'
+    )
     command = parser.parse_args(args)
     reject_stdin(__command__)
 
     links = list_archive_data(
-        filter_regex=command.filter,
+        filter_patterns=command.patterns,
+        filter_type=command.filter_type,
         before=command.before,
         after=command.after,
     )
@@ -69,10 +77,9 @@ def main(args=None):
         links = sorted(links, key=lambda link: getattr(link, command.sort))
 
     if command.csv:
-        print(command.csv)
-        print('\n'.join(csv_format(link, command.csv) for link in links))
+        print(to_csv(links, csv_cols=command.csv.split(','), header=True))
     elif command.json:
-        print(json.dumps(list(links), indent=4, cls=ExtendedEncoder))
+        print(to_json(links, indent=4, sort_keys=True))
     else:
         print('\n'.join(link.url for link in links))
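
For context, the new flags boil down to a plain library call. A minimal sketch of the equivalent Python, assuming it runs inside an initialized archive (OUTPUT_DIR) with the package layout used in this commit; the example.com pattern is hypothetical:

# Library-level equivalent of: archivebox list --filter-type=domain --sort=timestamp --csv=timestamp,url example.com
from archivebox.legacy.main import list_archive_data
from archivebox.legacy.util import to_csv

links = list(list_archive_data(
    filter_patterns=['example.com'],   # hypothetical pattern
    filter_type='domain',
    after=None,
    before=None,
))
links = sorted(links, key=lambda link: link.timestamp)   # same as --sort=timestamp

# --csv=timestamp,url output, with a header row:
print(to_csv(links, csv_cols=['timestamp', 'url'], header=True))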

View file

@@ -0,0 +1,87 @@
+#!/usr/bin/env python3
+
+__package__ = 'archivebox.cli'
+__command__ = 'archivebox remove'
+__description__ = 'Remove the specified URLs from the archive.'
+
+import sys
+import argparse
+
+from ..legacy.main import list_archive_data, remove_archive_links
+from ..legacy.util import reject_stdin, to_csv, TimedProgress
+from ..legacy.config import ANSI
+
+
+def main(args=None):
+    args = sys.argv[1:] if args is None else args
+
+    parser = argparse.ArgumentParser(
+        prog=__command__,
+        description=__description__,
+        add_help=True,
+    )
+    parser.add_argument(
+        '--yes', # '-y',
+        action='store_true',
+        help='Remove links instantly without prompting to confirm.',
+    )
+    parser.add_argument(
+        '--delete', # '-r',
+        action='store_true',
+        help=(
+            "In addition to removing the link from the index, "
+            "also delete its archived content and metadata folder."
+        ),
+    )
+    parser.add_argument(
+        '--before', #'-b',
+        type=float,
+        help="List only URLs bookmarked before the given timestamp.",
+        default=None,
+    )
+    parser.add_argument(
+        '--after', #'-a',
+        type=float,
+        help="List only URLs bookmarked after the given timestamp.",
+        default=None,
+    )
+    parser.add_argument(
+        '--filter-type',
+        type=str,
+        choices=('exact', 'substring', 'domain', 'regex'),
+        default='exact',
+        help='Type of pattern matching to use when filtering URLs',
+    )
+    parser.add_argument(
+        'pattern',
+        nargs='?',
+        type=str,
+        default=None,
+        help='URLs matching this filter pattern will be removed from the index.'
+    )
+    command = parser.parse_args(args)
+    reject_stdin(__command__)
+
+    if not sys.stdin.isatty():
+        stdin_raw_text = sys.stdin.read()
+        if stdin_raw_text and command.pattern:
+            print(
+                '[X] You should pass either a pattern as an argument, '
+                'or pass a list of patterns via stdin, but not both.\n'
+            )
+            raise SystemExit(1)
+        patterns = [pattern.strip() for pattern in stdin_raw_text.split('\n')]
+    else:
+        patterns = [command.pattern]
+
+    remove_archive_links(
+        filter_patterns=patterns, filter_type=command.filter_type,
+        before=command.before, after=command.after,
+        yes=command.yes, delete=command.delete,
+    )
+
+
+if __name__ == '__main__':
+    main()
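
A minimal sketch of the call the new `archivebox remove` command boils down to, assuming it runs inside an initialized archive; the URL below is a hypothetical example:

from archivebox.legacy.main import remove_archive_links

remove_archive_links(
    filter_patterns=['https://example.com/page'],  # hypothetical URL to drop from the index
    filter_type='exact',
    before=None,
    after=None,
    yes=False,     # keep the interactive confirmation prompt
    delete=False,  # leave the archived content folder on disk, only remove the index entry
)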

View file

@@ -15,6 +15,8 @@ from .config import (
     FOOTER_INFO,
     TIMEOUT,
     URL_BLACKLIST_PTN,
+    ANSI,
+    stderr,
 )
 from .util import (
     scheme,

View file

@@ -1,10 +1,10 @@
 import re
-import json
+import shutil
 
 from typing import List, Optional, Iterable
 
 from .schema import Link
-from .util import enforce_types, ExtendedEncoder
+from .util import enforce_types, TimedProgress, to_csv
 from .index import (
     links_after_timestamp,
     load_links_index,
@@ -12,6 +12,7 @@ from .index import (
 )
 from .archive_methods import archive_link
 from .config import (
+    ANSI,
     ONLY_NEW,
     OUTPUT_DIR,
     check_dependencies,
@@ -61,23 +62,91 @@ def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]
     return all_links
 
 
+LINK_FILTERS = {
+    'exact': lambda link, pattern: (link.url == pattern) or (link.base_url == pattern),
+    'substring': lambda link, pattern: pattern in link.url,
+    'regex': lambda link, pattern: bool(re.match(pattern, link.url)),
+    'domain': lambda link, pattern: link.domain == pattern,
+}
+
+def link_matches_filter(link: Link, filter_patterns: List[str], filter_type: str='exact') -> bool:
+    for pattern in filter_patterns:
+        if LINK_FILTERS[filter_type](link, pattern):
+            return True
+    return False
+
+
 @enforce_types
-def list_archive_data(filter_regex: Optional[str]=None, after: Optional[float]=None, before: Optional[float]=None) -> Iterable[Link]:
+def list_archive_data(filter_patterns: Optional[List[str]]=None, filter_type: str='exact',
+                      after: Optional[float]=None, before: Optional[float]=None) -> Iterable[Link]:
 
     all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
 
-    pattern = re.compile(filter_regex, re.IGNORECASE) if filter_regex else None
-
     for link in all_links:
-        if pattern and not pattern.match(link.url):
-            continue
         if after is not None and float(link.timestamp) < after:
             continue
         if before is not None and float(link.timestamp) > before:
             continue
-        yield link
+
+        if filter_patterns:
+            if link_matches_filter(link, filter_patterns, filter_type):
+                yield link
+        else:
+            yield link
 
 
-def csv_format(link: Link, csv_cols: str) -> str:
-    return ','.join(json.dumps(getattr(link, col), cls=ExtendedEncoder) for col in csv_cols.split(','))
+@enforce_types
+def remove_archive_links(filter_patterns: List[str], filter_type: str='exact',
+                         after: Optional[float]=None, before: Optional[float]=None,
+                         yes: bool=False, delete: bool=False):
+
+    check_dependencies()
+
+    print('[*] Finding links in the archive index matching these {} patterns:'.format(filter_type))
+    print(' {}'.format(' '.join(filter_patterns)))
+    timer = TimedProgress(360, prefix=' ')
+    try:
+        links = list(list_archive_data(
+            filter_patterns=filter_patterns,
+            filter_type=filter_type,
+            after=after,
+            before=before,
+        ))
+    finally:
+        timer.end()
+
+    if not len(links):
+        print()
+        print('{red}[X] No matching links found.{reset}'.format(**ANSI))
+        raise SystemExit(1)
+
+    print()
+    print('-------------------------------------------------------------------')
+    print(to_csv(links, csv_cols=['link_dir', 'url', 'is_archived', 'num_outputs']))
+    print('-------------------------------------------------------------------')
+    print()
+    if not yes:
+        resp = input('{lightyellow}[?] Are you sure you want to permanently remove these {} archived links? N/y: {reset}'.format(len(links), **ANSI))
+        if not resp.lower() == 'y':
+            raise SystemExit(0)
+
+    all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
+    to_keep = []
+
+    for link in all_links:
+        should_remove = (
+            (after is not None and float(link.timestamp) < after)
+            or (before is not None and float(link.timestamp) > before)
+            or link_matches_filter(link, filter_patterns, filter_type)
+        )
+        if not should_remove:
+            to_keep.append(link)
+        elif should_remove and delete:
+            shutil.rmtree(link.link_dir)
+
+    num_removed = len(all_links) - len(to_keep)
+    write_links_index(links=to_keep, out_dir=OUTPUT_DIR, finished=True)
+
+    print()
+    print('{red}[√] Removed {} out of {} links from the archive index.{reset}'.format(num_removed, len(all_links), **ANSI))
+    print(' Index now contains {} links.'.format(len(to_keep)))
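
Purely for illustration, a minimal sketch of how the four filter types added above behave; the link object is a hypothetical SimpleNamespace stand-in exposing only the url, base_url, and domain attributes that the LINK_FILTERS lambdas read (the real Link dataclass derives these from the stored URL):

import re
from types import SimpleNamespace

LINK_FILTERS = {
    'exact': lambda link, pattern: (link.url == pattern) or (link.base_url == pattern),
    'substring': lambda link, pattern: pattern in link.url,
    'regex': lambda link, pattern: bool(re.match(pattern, link.url)),
    'domain': lambda link, pattern: link.domain == pattern,
}

link = SimpleNamespace(
    url='https://example.com/blog/post?id=1',
    base_url='example.com/blog/post?id=1',
    domain='example.com',
)

assert LINK_FILTERS['exact'](link, 'https://example.com/blog/post?id=1')   # full URL matches
assert LINK_FILTERS['substring'](link, '/blog/')                           # anywhere in the URL
assert LINK_FILTERS['domain'](link, 'example.com')                         # hostname only
assert LINK_FILTERS['regex'](link, r'^https://example\.com/blog/')         # re.match anchors at the start
assert not LINK_FILTERS['exact'](link, 'example.com')                      # a bare domain is not an exact match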

View file

@@ -50,16 +50,33 @@ class ArchiveResult:
     def from_json(cls, json_info):
         from .util import parse_date
 
-        allowed_fields = {f.name for f in fields(cls)}
         info = {
             key: val
             for key, val in json_info.items()
-            if key in allowed_fields
+            if key in cls.field_names()
         }
         info['start_ts'] = parse_date(info['start_ts'])
         info['end_ts'] = parse_date(info['end_ts'])
         return cls(**info)
 
+    def to_json(self, indent=4, sort_keys=True):
+        from .util import to_json
+        return to_json(self, indent=indent, sort_keys=sort_keys)
+
+    def to_csv(self, cols=None):
+        from .util import to_json
+        cols = cols or self.field_names()
+        return ','.join(
+            to_json(getattr(self, col), indent=False)
+            for col in cols
+        )
+
+    @classmethod
+    def field_names(cls):
+        return [f.name for f in fields(cls)]
+
     @property
     def duration(self) -> int:
         return (self.end_ts - self.start_ts).seconds
@@ -145,11 +162,10 @@ class Link:
     def from_json(cls, json_info):
         from .util import parse_date
 
-        allowed_fields = {f.name for f in fields(cls)}
         info = {
             key: val
             for key, val in json_info.items()
-            if key in allowed_fields
+            if key in cls.field_names()
         }
 
         info['updated'] = parse_date(info['updated'])
@@ -166,6 +182,22 @@ class Link:
         info['history'] = cast_history
         return cls(**info)
 
+    def to_json(self, indent=4, sort_keys=True):
+        from .util import to_json
+        return to_json(self, indent=indent, sort_keys=sort_keys)
+
+    def to_csv(self, csv_cols: List[str]):
+        from .util import to_json
+        return ','.join(
+            to_json(getattr(self, col), indent=None)
+            for col in csv_cols
+        )
+
+    @classmethod
+    def field_names(cls):
+        return [f.name for f in fields(cls)]
+
     @property
     def link_dir(self) -> str:

View file

@@ -6,7 +6,7 @@ import time
 import shutil
 
 from json import JSONEncoder
-from typing import List, Optional, Any, Union
+from typing import List, Optional, Any, Union, IO
 from inspect import signature
 from functools import wraps
 from hashlib import sha256
@@ -616,13 +616,27 @@ class ExtendedEncoder(JSONEncoder):
         return JSONEncoder.default(self, obj)
 
 
+def to_json(obj: Any, file: IO=None, indent: Optional[int]=4, sort_keys: bool=True, cls=ExtendedEncoder) -> Optional[str]:
+    if file:
+        json.dump(obj, file, indent=indent, sort_keys=sort_keys, cls=ExtendedEncoder)
+        return None
+    else:
+        return json.dumps(obj, indent=indent, sort_keys=sort_keys, cls=ExtendedEncoder)
+
+
+def to_csv(links: List[Link], csv_cols: Optional[List[str]]=None, header: bool=True) -> str:
+    csv_cols = csv_cols or ['timestamp', 'is_archived', 'url']
+    header_str = '{}\n'.format(','.join(csv_cols)) if header else ''
+    return header_str + '\n'.join(link.to_csv(csv_cols=csv_cols) for link in links)
+
+
 def atomic_write(contents: Union[dict, str], path: str) -> None:
     """Safe atomic write to filesystem by writing to temp file + atomic rename"""
     try:
         tmp_file = '{}.tmp'.format(path)
         with open(tmp_file, 'w+', encoding='utf-8') as f:
             if isinstance(contents, dict):
-                json.dump(contents, f, indent=4, cls=ExtendedEncoder)
+                to_json(contents, file=f)
             else:
                 f.write(contents)
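
A standalone sketch of the new to_json helper's two modes (return a string vs. write to an open file handle); the encoder here is a simplified stand-in for util.py's ExtendedEncoder, and record.json is a hypothetical output path:

import json
from datetime import datetime
from typing import IO, Any, Optional

class ExtendedEncoder(json.JSONEncoder):
    """Minimal stand-in: serialize datetimes as ISO strings."""
    def default(self, obj):
        if isinstance(obj, datetime):
            return obj.isoformat()
        return json.JSONEncoder.default(self, obj)

def to_json(obj: Any, file: Optional[IO]=None, indent: Optional[int]=4,
            sort_keys: bool=True, cls=ExtendedEncoder) -> Optional[str]:
    if file:
        json.dump(obj, file, indent=indent, sort_keys=sort_keys, cls=cls)
        return None
    return json.dumps(obj, indent=indent, sort_keys=sort_keys, cls=cls)

record = {'url': 'https://example.com', 'updated': datetime(2019, 4, 11)}
print(to_json(record))              # returns the serialized JSON string
with open('record.json', 'w') as f:
    to_json(record, file=f)         # writes to the open file and returns None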