
working consistent list and remove with filtering

This commit is contained in:
Nick Sweeting 2019-04-11 07:00:26 -04:00
parent 4ca9a0beac
commit d8d8f7c2cc
6 changed files with 242 additions and 31 deletions

View file

@@ -5,12 +5,11 @@ __command__ = 'archivebox list'
 __description__ = 'List all the URLs currently in the archive.'
 
 import sys
-import json
 import argparse
 
-from ..legacy.util import reject_stdin, ExtendedEncoder
-from ..legacy.main import list_archive_data, csv_format
+from ..legacy.util import reject_stdin, to_json, to_csv
+from ..legacy.main import list_archive_data
 
 
 def main(args=None):
@@ -33,16 +32,10 @@ def main(args=None):
         action='store_true',
         help="Print the output in JSON format with all columns included.",
     )
-    parser.add_argument(
-        '--filter', #'-f',
-        type=str,
-        help="List only URLs matching the given regex pattern.",
-        default=None,
-    )
     parser.add_argument(
         '--sort', #'-s',
         type=str,
-        help="List the links sorted using the given key, e.g. timestamp or updated",
+        help="List the links sorted using the given key, e.g. timestamp or updated.",
         default=None,
     )
     parser.add_argument(
@@ -57,11 +50,26 @@ def main(args=None):
         help="List only URLs bookmarked after the given timestamp.",
         default=None,
     )
+    parser.add_argument(
+        '--filter-type',
+        type=str,
+        choices=('exact', 'substring', 'domain', 'regex'),
+        default='exact',
+        help='Type of pattern matching to use when filtering URLs',
+    )
+    parser.add_argument(
+        'patterns',
+        nargs='*',
+        type=str,
+        default=None,
+        help='List only URLs matching these filter patterns.'
+    )
     command = parser.parse_args(args)
     reject_stdin(__command__)
 
     links = list_archive_data(
-        filter_regex=command.filter,
+        filter_patterns=command.patterns,
+        filter_type=command.filter_type,
         before=command.before,
         after=command.after,
     )
@@ -69,10 +77,9 @@ def main(args=None):
         links = sorted(links, key=lambda link: getattr(link, command.sort))
 
     if command.csv:
-        print(command.csv)
-        print('\n'.join(csv_format(link, command.csv) for link in links))
+        print(to_csv(links, csv_cols=command.csv.split(','), header=True))
     elif command.json:
-        print(json.dumps(list(links), indent=4, cls=ExtendedEncoder))
+        print(to_json(links, indent=4, sort_keys=True))
     else:
         print('\n'.join(link.url for link in links))
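
For context, the new flags boil down to a plain library call. A minimal sketch of the equivalent Python, assuming it runs inside an initialized archive (OUTPUT_DIR) with the package layout used in this commit; the example.com pattern is hypothetical:

# Library-level equivalent of: archivebox list --filter-type=domain --sort=timestamp --csv=timestamp,url example.com
from archivebox.legacy.main import list_archive_data
from archivebox.legacy.util import to_csv

links = list(list_archive_data(
    filter_patterns=['example.com'],   # hypothetical pattern
    filter_type='domain',
    after=None,
    before=None,
))
links = sorted(links, key=lambda link: link.timestamp)   # same as --sort=timestamp

# --csv=timestamp,url output, with a header row:
print(to_csv(links, csv_cols=['timestamp', 'url'], header=True))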

View file

@@ -0,0 +1,87 @@
+#!/usr/bin/env python3
+
+__package__ = 'archivebox.cli'
+__command__ = 'archivebox remove'
+__description__ = 'Remove the specified URLs from the archive.'
+
+import sys
+import argparse
+
+from ..legacy.main import list_archive_data, remove_archive_links
+from ..legacy.util import reject_stdin, to_csv, TimedProgress
+from ..legacy.config import ANSI
+
+
+def main(args=None):
+    args = sys.argv[1:] if args is None else args
+
+    parser = argparse.ArgumentParser(
+        prog=__command__,
+        description=__description__,
+        add_help=True,
+    )
+    parser.add_argument(
+        '--yes', # '-y',
+        action='store_true',
+        help='Remove links instantly without prompting to confirm.',
+    )
+    parser.add_argument(
+        '--delete', # '-r',
+        action='store_true',
+        help=(
+            "In addition to removing the link from the index, "
+            "also delete its archived content and metadata folder."
+        ),
+    )
+    parser.add_argument(
+        '--before', #'-b',
+        type=float,
+        help="List only URLs bookmarked before the given timestamp.",
+        default=None,
+    )
+    parser.add_argument(
+        '--after', #'-a',
+        type=float,
+        help="List only URLs bookmarked after the given timestamp.",
+        default=None,
+    )
+    parser.add_argument(
+        '--filter-type',
+        type=str,
+        choices=('exact', 'substring', 'domain', 'regex'),
+        default='exact',
+        help='Type of pattern matching to use when filtering URLs',
+    )
+    parser.add_argument(
+        'pattern',
+        nargs='?',
+        type=str,
+        default=None,
+        help='URLs matching this filter pattern will be removed from the index.'
+    )
+    command = parser.parse_args(args)
+    reject_stdin(__command__)
+
+    if not sys.stdin.isatty():
+        stdin_raw_text = sys.stdin.read()
+        if stdin_raw_text and command.pattern:
+            print(
+                '[X] You should pass either a pattern as an argument, '
+                'or pass a list of patterns via stdin, but not both.\n'
+            )
+            raise SystemExit(1)
+        patterns = [pattern.strip() for pattern in stdin_raw_text.split('\n')]
+    else:
+        patterns = [command.pattern]
+
+    remove_archive_links(
+        filter_patterns=patterns, filter_type=command.filter_type,
+        before=command.before, after=command.after,
+        yes=command.yes, delete=command.delete,
+    )
+
+
+if __name__ == '__main__':
+    main()
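
A minimal sketch of the call the new `archivebox remove` command boils down to, assuming it runs inside an initialized archive; the URL below is a hypothetical example:

from archivebox.legacy.main import remove_archive_links

remove_archive_links(
    filter_patterns=['https://example.com/page'],  # hypothetical URL to drop from the index
    filter_type='exact',
    before=None,
    after=None,
    yes=False,     # keep the interactive confirmation prompt
    delete=False,  # leave the archived content folder on disk, only remove the index entry
)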

View file

@@ -15,6 +15,8 @@ from .config import (
     FOOTER_INFO,
     TIMEOUT,
     URL_BLACKLIST_PTN,
+    ANSI,
+    stderr,
 )
 from .util import (
     scheme,

View file

@@ -1,10 +1,10 @@
 import re
-import json
+import shutil
 
 from typing import List, Optional, Iterable
 
 from .schema import Link
-from .util import enforce_types, ExtendedEncoder
+from .util import enforce_types, TimedProgress, to_csv
 from .index import (
     links_after_timestamp,
     load_links_index,
@@ -12,6 +12,7 @@ from .index import (
 )
 from .archive_methods import archive_link
 from .config import (
+    ANSI,
     ONLY_NEW,
     OUTPUT_DIR,
     check_dependencies,
@@ -61,23 +62,91 @@ def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]
     return all_links
 
 
+LINK_FILTERS = {
+    'exact': lambda link, pattern: (link.url == pattern) or (link.base_url == pattern),
+    'substring': lambda link, pattern: pattern in link.url,
+    'regex': lambda link, pattern: bool(re.match(pattern, link.url)),
+    'domain': lambda link, pattern: link.domain == pattern,
+}
+
+def link_matches_filter(link: Link, filter_patterns: List[str], filter_type: str='exact') -> bool:
+    for pattern in filter_patterns:
+        if LINK_FILTERS[filter_type](link, pattern):
+            return True
+    return False
+
+
 @enforce_types
-def list_archive_data(filter_regex: Optional[str]=None, after: Optional[float]=None, before: Optional[float]=None) -> Iterable[Link]:
+def list_archive_data(filter_patterns: Optional[List[str]]=None, filter_type: str='exact',
+                      after: Optional[float]=None, before: Optional[float]=None) -> Iterable[Link]:
 
     all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
 
-    pattern = re.compile(filter_regex, re.IGNORECASE) if filter_regex else None
-
     for link in all_links:
-        if pattern and not pattern.match(link.url):
-            continue
         if after is not None and float(link.timestamp) < after:
             continue
         if before is not None and float(link.timestamp) > before:
             continue
-        yield link
+
+        if filter_patterns:
+            if link_matches_filter(link, filter_patterns, filter_type):
+                yield link
+        else:
+            yield link
 
 
-def csv_format(link: Link, csv_cols: str) -> str:
-    return ','.join(json.dumps(getattr(link, col), cls=ExtendedEncoder) for col in csv_cols.split(','))
+@enforce_types
+def remove_archive_links(filter_patterns: List[str], filter_type: str='exact',
+                         after: Optional[float]=None, before: Optional[float]=None,
+                         yes: bool=False, delete: bool=False):
+
+    check_dependencies()
+
+    print('[*] Finding links in the archive index matching these {} patterns:'.format(filter_type))
+    print(' {}'.format(' '.join(filter_patterns)))
+    timer = TimedProgress(360, prefix=' ')
+    try:
+        links = list(list_archive_data(
+            filter_patterns=filter_patterns,
+            filter_type=filter_type,
+            after=after,
+            before=before,
+        ))
+    finally:
+        timer.end()
+
+    if not len(links):
+        print()
+        print('{red}[X] No matching links found.{reset}'.format(**ANSI))
+        raise SystemExit(1)
+
+    print()
+    print('-------------------------------------------------------------------')
+    print(to_csv(links, csv_cols=['link_dir', 'url', 'is_archived', 'num_outputs']))
+    print('-------------------------------------------------------------------')
+    print()
+    if not yes:
+        resp = input('{lightyellow}[?] Are you sure you want to permanently remove these {} archived links? N/y: {reset}'.format(len(links), **ANSI))
+        if not resp.lower() == 'y':
+            raise SystemExit(0)
+
+    all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
+    to_keep = []
+
+    for link in all_links:
+        should_remove = (
+            (after is not None and float(link.timestamp) < after)
+            or (before is not None and float(link.timestamp) > before)
+            or link_matches_filter(link, filter_patterns, filter_type)
+        )
+        if not should_remove:
+            to_keep.append(link)
+        elif should_remove and delete:
+            shutil.rmtree(link.link_dir)
+
+    num_removed = len(all_links) - len(to_keep)
+    write_links_index(links=to_keep, out_dir=OUTPUT_DIR, finished=True)
+
+    print()
+    print('{red}[√] Removed {} out of {} links from the archive index.{reset}'.format(num_removed, len(all_links), **ANSI))
+    print(' Index now contains {} links.'.format(len(to_keep)))
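
Purely for illustration, a minimal sketch of how the four filter types added above behave; the link object is a hypothetical SimpleNamespace stand-in exposing only the url, base_url, and domain attributes that the LINK_FILTERS lambdas read (the real Link dataclass derives these from the stored URL):

import re
from types import SimpleNamespace

LINK_FILTERS = {
    'exact': lambda link, pattern: (link.url == pattern) or (link.base_url == pattern),
    'substring': lambda link, pattern: pattern in link.url,
    'regex': lambda link, pattern: bool(re.match(pattern, link.url)),
    'domain': lambda link, pattern: link.domain == pattern,
}

link = SimpleNamespace(
    url='https://example.com/blog/post?id=1',
    base_url='example.com/blog/post?id=1',
    domain='example.com',
)

assert LINK_FILTERS['exact'](link, 'https://example.com/blog/post?id=1')   # full URL matches
assert LINK_FILTERS['substring'](link, '/blog/')                           # anywhere in the URL
assert LINK_FILTERS['domain'](link, 'example.com')                         # hostname only
assert LINK_FILTERS['regex'](link, r'^https://example\.com/blog/')         # re.match anchors at the start
assert not LINK_FILTERS['exact'](link, 'example.com')                      # a bare domain is not an exact match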

View file

@@ -50,16 +50,33 @@ class ArchiveResult:
     def from_json(cls, json_info):
         from .util import parse_date
 
-        allowed_fields = {f.name for f in fields(cls)}
         info = {
             key: val
             for key, val in json_info.items()
-            if key in allowed_fields
+            if key in cls.field_names()
         }
         info['start_ts'] = parse_date(info['start_ts'])
         info['end_ts'] = parse_date(info['end_ts'])
         return cls(**info)
 
+    def to_json(self, indent=4, sort_keys=True):
+        from .util import to_json
+        return to_json(self, indent=indent, sort_keys=sort_keys)
+
+    def to_csv(self, cols=None):
+        from .util import to_json
+        cols = cols or self.field_names()
+        return ','.join(
+            to_json(getattr(self, col), indent=False)
+            for col in cols
+        )
+
+    @classmethod
+    def field_names(cls):
+        return [f.name for f in fields(cls)]
+
     @property
     def duration(self) -> int:
         return (self.end_ts - self.start_ts).seconds
@@ -145,11 +162,10 @@ class Link:
     def from_json(cls, json_info):
         from .util import parse_date
 
-        allowed_fields = {f.name for f in fields(cls)}
         info = {
             key: val
             for key, val in json_info.items()
-            if key in allowed_fields
+            if key in cls.field_names()
         }
 
         info['updated'] = parse_date(info['updated'])
@@ -166,6 +182,22 @@ class Link:
         info['history'] = cast_history
         return cls(**info)
 
+    def to_json(self, indent=4, sort_keys=True):
+        from .util import to_json
+        return to_json(self, indent=indent, sort_keys=sort_keys)
+
+    def to_csv(self, csv_cols: List[str]):
+        from .util import to_json
+        return ','.join(
+            to_json(getattr(self, col), indent=None)
+            for col in csv_cols
+        )
+
+    @classmethod
+    def field_names(cls):
+        return [f.name for f in fields(cls)]
+
     @property
     def link_dir(self) -> str:

View file

@@ -6,7 +6,7 @@ import time
 import shutil
 
 from json import JSONEncoder
-from typing import List, Optional, Any, Union
+from typing import List, Optional, Any, Union, IO
 from inspect import signature
 from functools import wraps
 from hashlib import sha256
@@ -616,13 +616,27 @@ class ExtendedEncoder(JSONEncoder):
         return JSONEncoder.default(self, obj)
 
 
+def to_json(obj: Any, file: IO=None, indent: Optional[int]=4, sort_keys: bool=True, cls=ExtendedEncoder) -> Optional[str]:
+    if file:
+        json.dump(obj, file, indent=indent, sort_keys=sort_keys, cls=ExtendedEncoder)
+        return None
+    else:
+        return json.dumps(obj, indent=indent, sort_keys=sort_keys, cls=ExtendedEncoder)
+
+
+def to_csv(links: List[Link], csv_cols: Optional[List[str]]=None, header: bool=True) -> str:
+    csv_cols = csv_cols or ['timestamp', 'is_archived', 'url']
+    header_str = '{}\n'.format(','.join(csv_cols)) if header else ''
+    return header_str + '\n'.join(link.to_csv(csv_cols=csv_cols) for link in links)
+
+
 def atomic_write(contents: Union[dict, str], path: str) -> None:
     """Safe atomic write to filesystem by writing to temp file + atomic rename"""
     try:
         tmp_file = '{}.tmp'.format(path)
         with open(tmp_file, 'w+', encoding='utf-8') as f:
             if isinstance(contents, dict):
-                json.dump(contents, f, indent=4, cls=ExtendedEncoder)
+                to_json(contents, file=f)
             else:
                 f.write(contents)
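
A standalone sketch of the new to_json helper's two modes (return a string vs. write to an open file handle); the encoder here is a simplified stand-in for util.py's ExtendedEncoder, and record.json is a hypothetical output path:

import json
from datetime import datetime
from typing import IO, Any, Optional

class ExtendedEncoder(json.JSONEncoder):
    """Minimal stand-in: serialize datetimes as ISO strings."""
    def default(self, obj):
        if isinstance(obj, datetime):
            return obj.isoformat()
        return json.JSONEncoder.default(self, obj)

def to_json(obj: Any, file: Optional[IO]=None, indent: Optional[int]=4,
            sort_keys: bool=True, cls=ExtendedEncoder) -> Optional[str]:
    if file:
        json.dump(obj, file, indent=indent, sort_keys=sort_keys, cls=cls)
        return None
    return json.dumps(obj, indent=indent, sort_keys=sort_keys, cls=cls)

record = {'url': 'https://example.com', 'updated': datetime(2019, 4, 11)}
print(to_json(record))              # returns the serialized JSON string
with open('record.json', 'w') as f:
    to_json(record, file=f)         # writes to the open file and returns None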