
working consistent list and remove with filtering

Nick Sweeting 2019-04-11 07:00:26 -04:00
parent 4ca9a0beac
commit d8d8f7c2cc
6 changed files with 242 additions and 31 deletions

View file

@@ -5,12 +5,11 @@ __command__ = 'archivebox list'
__description__ = 'List all the URLs currently in the archive.'
import sys
import json
import argparse
from ..legacy.util import reject_stdin, ExtendedEncoder
from ..legacy.main import list_archive_data, csv_format
from ..legacy.util import reject_stdin, to_json, to_csv
from ..legacy.main import list_archive_data
def main(args=None):
@@ -33,16 +32,10 @@ def main(args=None):
action='store_true',
help="Print the output in JSON format with all columns included.",
)
parser.add_argument(
'--filter', #'-f',
type=str,
help="List only URLs matching the given regex pattern.",
default=None,
)
parser.add_argument(
'--sort', #'-s',
type=str,
help="List the links sorted using the given key, e.g. timestamp or updated",
help="List the links sorted using the given key, e.g. timestamp or updated.",
default=None,
)
parser.add_argument(
@@ -57,11 +50,26 @@
help="List only URLs bookmarked after the given timestamp.",
default=None,
)
parser.add_argument(
'--filter-type',
type=str,
choices=('exact', 'substring', 'domain', 'regex'),
default='exact',
help='Type of pattern matching to use when filtering URLs',
)
parser.add_argument(
'patterns',
nargs='*',
type=str,
default=None,
help='List only URLs matching these filter patterns.'
)
command = parser.parse_args(args)
reject_stdin(__command__)
links = list_archive_data(
filter_regex=command.filter,
filter_patterns=command.patterns,
filter_type=command.filter_type,
before=command.before,
after=command.after,
)
@@ -69,10 +77,9 @@
links = sorted(links, key=lambda link: getattr(link, command.sort))
if command.csv:
print(command.csv)
print('\n'.join(csv_format(link, command.csv) for link in links))
print(to_csv(links, csv_cols=command.csv.split(','), header=True))
elif command.json:
print(json.dumps(list(links), indent=4, cls=ExtendedEncoder))
print(to_json(links, indent=4, sort_keys=True))
else:
print('\n'.join(link.url for link in links))
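For illustration (not part of the commit), a minimal sketch of the filtering path this command now drives, assuming the archivebox package is importable and an index already exists in OUTPUT_DIR; the pattern, filter type, and CSV columns below are example values:

from archivebox.legacy.main import list_archive_data
from archivebox.legacy.util import to_csv

# Roughly what `archivebox list` does after argument parsing, with
# example values standing in for the parsed CLI flags.
links = list(list_archive_data(
    filter_patterns=['example.com'],  # the positional `patterns` argument
    filter_type='domain',             # --filter-type
    after=None,                       # --after
    before=None,                      # --before
))
print(to_csv(links, csv_cols=['timestamp', 'url'], header=True))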

View file

@@ -0,0 +1,87 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox remove'
__description__ = 'Remove the specified URLs from the archive.'
import sys
import argparse
from ..legacy.main import list_archive_data, remove_archive_links
from ..legacy.util import reject_stdin, to_csv, TimedProgress
from ..legacy.config import ANSI
def main(args=None):
args = sys.argv[1:] if args is None else args
parser = argparse.ArgumentParser(
prog=__command__,
description=__description__,
add_help=True,
)
parser.add_argument(
'--yes', # '-y',
action='store_true',
help='Remove links instantly without prompting to confirm.',
)
parser.add_argument(
'--delete', # '-r',
action='store_true',
help=(
"In addition to removing the link from the index, "
"also delete its archived content and metadata folder."
),
)
parser.add_argument(
'--before', #'-b',
type=float,
help="List only URLs bookmarked before the given timestamp.",
default=None,
)
parser.add_argument(
'--after', #'-a',
type=float,
help="List only URLs bookmarked after the given timestamp.",
default=None,
)
parser.add_argument(
'--filter-type',
type=str,
choices=('exact', 'substring', 'domain', 'regex'),
default='exact',
help='Type of pattern matching to use when filtering URLs',
)
parser.add_argument(
'pattern',
nargs='?',
type=str,
default=None,
help='URLs matching this filter pattern will be removed from the index.'
)
command = parser.parse_args(args)
reject_stdin(__command__)
if not sys.stdin.isatty():
stdin_raw_text = sys.stdin.read()
if stdin_raw_text and command.pattern:
print(
'[X] You should pass either a pattern as an argument, '
'or pass a list of patterns via stdin, but not both.\n'
)
raise SystemExit(1)
patterns = [pattern.strip() for pattern in stdin_raw_text.split('\n')]
else:
patterns = [command.pattern]
remove_archive_links(
filter_patterns=patterns, filter_type=command.filter_type,
before=command.before, after=command.after,
yes=command.yes, delete=command.delete,
)
if __name__ == '__main__':
main()
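Likewise, a hedged sketch of the removal path, calling remove_archive_links the same way this new entrypoint does after parsing; the URL and option values are illustrative only:

from archivebox.legacy.main import remove_archive_links

# Equivalent of `archivebox remove <pattern>` with confirmation skipped;
# assumes an existing archive index in OUTPUT_DIR.
remove_archive_links(
    filter_patterns=['https://example.com/page'],
    filter_type='exact',   # one of: exact, substring, domain, regex
    before=None,
    after=None,
    yes=True,              # skip the interactive N/y confirmation prompt
    delete=False,          # drop the index entries but keep archived folders
)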

View file

@@ -15,6 +15,8 @@ from .config import (
FOOTER_INFO,
TIMEOUT,
URL_BLACKLIST_PTN,
ANSI,
stderr,
)
from .util import (
scheme,

View file

@@ -1,10 +1,10 @@
import re
import json
import shutil
from typing import List, Optional, Iterable
from .schema import Link
from .util import enforce_types, ExtendedEncoder
from .util import enforce_types, TimedProgress, to_csv
from .index import (
links_after_timestamp,
load_links_index,
@@ -12,6 +12,7 @@ from .index import (
)
from .archive_methods import archive_link
from .config import (
ANSI,
ONLY_NEW,
OUTPUT_DIR,
check_dependencies,
@@ -61,23 +62,91 @@ def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]
return all_links
LINK_FILTERS = {
'exact': lambda link, pattern: (link.url == pattern) or (link.base_url == pattern),
'substring': lambda link, pattern: pattern in link.url,
'regex': lambda link, pattern: bool(re.match(pattern, link.url)),
'domain': lambda link, pattern: link.domain == pattern,
}
def link_matches_filter(link: Link, filter_patterns: List[str], filter_type: str='exact') -> bool:
for pattern in filter_patterns:
if LINK_FILTERS[filter_type](link, pattern):
return True
return False
@enforce_types
def list_archive_data(filter_regex: Optional[str]=None, after: Optional[float]=None, before: Optional[float]=None) -> Iterable[Link]:
def list_archive_data(filter_patterns: Optional[List[str]]=None, filter_type: str='exact',
after: Optional[float]=None, before: Optional[float]=None) -> Iterable[Link]:
all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
pattern = re.compile(filter_regex, re.IGNORECASE) if filter_regex else None
for link in all_links:
if pattern and not pattern.match(link.url):
continue
if after is not None and float(link.timestamp) < after:
continue
if before is not None and float(link.timestamp) > before:
continue
yield link
if filter_patterns:
if link_matches_filter(link, filter_patterns, filter_type):
yield link
else:
yield link
def csv_format(link: Link, csv_cols: str) -> str:
return ','.join(json.dumps(getattr(link, col), cls=ExtendedEncoder) for col in csv_cols.split(','))
@enforce_types
def remove_archive_links(filter_patterns: List[str], filter_type: str='exact',
after: Optional[float]=None, before: Optional[float]=None,
yes: bool=False, delete: bool=False):
check_dependencies()
print('[*] Finding links in the archive index matching these {} patterns:'.format(filter_type))
print(' {}'.format(' '.join(filter_patterns)))
timer = TimedProgress(360, prefix=' ')
try:
links = list(list_archive_data(
filter_patterns=filter_patterns,
filter_type=filter_type,
after=after,
before=before,
))
finally:
timer.end()
if not len(links):
print()
print('{red}[X] No matching links found.{reset}'.format(**ANSI))
raise SystemExit(1)
print()
print('-------------------------------------------------------------------')
print(to_csv(links, csv_cols=['link_dir', 'url', 'is_archived', 'num_outputs']))
print('-------------------------------------------------------------------')
print()
if not yes:
resp = input('{lightyellow}[?] Are you sure you want to permanently remove these {} archived links? N/y: {reset}'.format(len(links), **ANSI))
if not resp.lower() == 'y':
raise SystemExit(0)
all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
to_keep = []
for link in all_links:
should_remove = (
(after is not None and float(link.timestamp) < after)
or (before is not None and float(link.timestamp) > before)
or link_matches_filter(link, filter_patterns, filter_type)
)
if not should_remove:
to_keep.append(link)
elif should_remove and delete:
shutil.rmtree(link.link_dir)
num_removed = len(all_links) - len(to_keep)
write_links_index(links=to_keep, out_dir=OUTPUT_DIR, finished=True)
print()
print('{red}[√] Removed {} out of {} links from the archive index.{reset}'.format(num_removed, len(all_links), **ANSI))
print(' Index now contains {} links.'.format(len(to_keep)))
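To make the four filter types concrete, here is a standalone illustration of the LINK_FILTERS table above; the SimpleNamespace stub is hypothetical and only carries the three attributes the lambdas read:

import re
from types import SimpleNamespace

# Same table as in the hunk above, reproduced for a self-contained demo.
LINK_FILTERS = {
    'exact': lambda link, pattern: (link.url == pattern) or (link.base_url == pattern),
    'substring': lambda link, pattern: pattern in link.url,
    'regex': lambda link, pattern: bool(re.match(pattern, link.url)),
    'domain': lambda link, pattern: link.domain == pattern,
}

# Hypothetical Link-like stub; a real Link object carries many more fields.
link = SimpleNamespace(
    url='https://example.com/posts/1',
    base_url='example.com/posts/1',
    domain='example.com',
)

assert LINK_FILTERS['substring'](link, '/posts/')
assert LINK_FILTERS['domain'](link, 'example.com')
assert LINK_FILTERS['regex'](link, r'https://example\.com/posts/.*')
assert not LINK_FILTERS['exact'](link, 'https://example.com')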

View file

@@ -50,16 +50,33 @@ class ArchiveResult:
def from_json(cls, json_info):
from .util import parse_date
allowed_fields = {f.name for f in fields(cls)}
info = {
key: val
for key, val in json_info.items()
if key in allowed_fields
if key in cls.field_names()
}
info['start_ts'] = parse_date(info['start_ts'])
info['end_ts'] = parse_date(info['end_ts'])
return cls(**info)
def to_json(self, indent=4, sort_keys=True):
from .util import to_json
return to_json(self, indent=indent, sort_keys=sort_keys)
def to_csv(self, cols=None):
from .util import to_json
cols = cols or self.field_names()
return ','.join(
to_json(getattr(self, col), indent=False)
for col in cols
)
@classmethod
def field_names(cls):
return [f.name for f in fields(cls)]
@property
def duration(self) -> int:
return (self.end_ts - self.start_ts).seconds
@@ -145,11 +162,10 @@ class Link:
def from_json(cls, json_info):
from .util import parse_date
allowed_fields = {f.name for f in fields(cls)}
info = {
key: val
for key, val in json_info.items()
if key in allowed_fields
if key in cls.field_names()
}
info['updated'] = parse_date(info['updated'])
@@ -166,6 +182,22 @@ class Link:
info['history'] = cast_history
return cls(**info)
def to_json(self, indent=4, sort_keys=True):
from .util import to_json
return to_json(self, indent=indent, sort_keys=sort_keys)
def to_csv(self, csv_cols: List[str]):
from .util import to_json
return ','.join(
to_json(getattr(self, col), indent=None)
for col in csv_cols
)
@classmethod
def field_names(cls):
return [f.name for f in fields(cls)]
@property
def link_dir(self) -> str:
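The field_names/to_csv pattern added to ArchiveResult and Link can be shown in isolation; the Example dataclass below is hypothetical and only mimics the shape of those new methods:

from dataclasses import dataclass, fields
import json

@dataclass(frozen=True)
class Example:
    url: str
    is_archived: bool

    @classmethod
    def field_names(cls):
        # mirrors Link.field_names(): every dataclass field name, in order
        return [f.name for f in fields(cls)]

    def to_csv(self, csv_cols=None):
        csv_cols = csv_cols or self.field_names()
        # json.dumps stands in for to_json(..., indent=None): strings stay
        # quoted and booleans serialize as lowercase true/false
        return ','.join(json.dumps(getattr(self, col)) for col in csv_cols)

print(Example(url='https://example.com', is_archived=True).to_csv())
# -> "https://example.com",true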

View file

@@ -6,7 +6,7 @@ import time
import shutil
from json import JSONEncoder
from typing import List, Optional, Any, Union
from typing import List, Optional, Any, Union, IO
from inspect import signature
from functools import wraps
from hashlib import sha256
@@ -616,13 +616,27 @@ class ExtendedEncoder(JSONEncoder):
return JSONEncoder.default(self, obj)
def to_json(obj: Any, file: IO=None, indent: Optional[int]=4, sort_keys: bool=True, cls=ExtendedEncoder) -> Optional[str]:
if file:
json.dump(obj, file, indent=indent, sort_keys=sort_keys, cls=ExtendedEncoder)
return None
else:
return json.dumps(obj, indent=indent, sort_keys=sort_keys, cls=ExtendedEncoder)
def to_csv(links: List[Link], csv_cols: Optional[List[str]]=None, header: bool=True) -> str:
csv_cols = csv_cols or ['timestamp', 'is_archived', 'url']
header_str = '{}\n'.format(','.join(csv_cols)) if header else ''
return header_str + '\n'.join(link.to_csv(csv_cols=csv_cols) for link in links)
def atomic_write(contents: Union[dict, str], path: str) -> None:
"""Safe atomic write to filesystem by writing to temp file + atomic rename"""
try:
tmp_file = '{}.tmp'.format(path)
with open(tmp_file, 'w+', encoding='utf-8') as f:
if isinstance(contents, dict):
json.dump(contents, f, indent=4, cls=ExtendedEncoder)
to_json(contents, file=f)
else:
f.write(contents)
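A short sketch of the two modes of the new to_json helper, assuming the package is importable: it returns a string when no file handle is given and writes into the handle (returning None) otherwise. The output path is an arbitrary example:

from archivebox.legacy.util import to_json

# No file handle: the helper returns the serialized JSON string.
print(to_json({'url': 'https://example.com', 'is_archived': True}))

# With a file handle: it writes into the file and returns None.
with open('/tmp/example.json', 'w', encoding='utf-8') as f:
    to_json({'url': 'https://example.com', 'is_archived': True}, file=f)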