ArchiveBox/archivebox/legacy/purge.py

#!/usr/bin/env python3
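"""Purge links whose URLs match the given regexes from the ArchiveBox
JSON/HTML link index, optionally deleting their snapshot data too.

Illustrative invocation (an assumption: this file uses relative imports,
so it is presumably run as a module from the repository root):

    python -m archivebox.legacy.purge --regex 'example.org' --delete
"""
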
import re

from argparse import ArgumentParser
from os.path import exists, join
from shutil import rmtree
from typing import List

from .config import ARCHIVE_DIR, OUTPUT_DIR
from .index import (
    parse_json_links_index,
    write_html_links_index,
    write_json_links_index,
)


def cleanup_index(regexes: List[str], proceed: bool, delete: bool) -> None:
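    """Remove links whose URL matches any of the given regexes from the
    index, optionally deleting their data folders under ARCHIVE_DIR."""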
    if not exists(join(OUTPUT_DIR, 'index.json')):
        exit('index.json is missing; nothing to do')

    compiled = [re.compile(r) for r in regexes]

    links = parse_json_links_index(OUTPUT_DIR)
    filtered = []
    remaining = []

    for link in links:
        url = link.url

        for r in compiled:
            if r.search(url):
                filtered.append((link, r))
                break
        else:
            # The for/else branch runs only when no regex matched this URL.
            remaining.append(link)

    if not filtered:
        exit('Search did not match any entries.')

    print('Filtered out {}/{} urls:'.format(len(filtered), len(links)))
    for link, regex in filtered:
        url = link.url
        print(' {url} via {regex}'.format(url=url, regex=regex.pattern))

    if not proceed:
        answer = input('Remove {} entries from index? [y/n] '.format(
            len(filtered)))
        proceed = answer.strip().lower() in ('y', 'yes')

    if not proceed:
        exit('Aborted')

    write_json_links_index(OUTPUT_DIR, remaining)
    write_html_links_index(OUTPUT_DIR, remaining)

    if delete:
        for link, _ in filtered:
            # Links are accessed as objects elsewhere (link.url), so use
            # attribute access here too rather than dict subscripting.
            data_dir = join(ARCHIVE_DIR, link.timestamp)
            if exists(data_dir):
                rmtree(data_dir)


if __name__ == '__main__':
    p = ArgumentParser(description='Index purging tool')
    p.add_argument(
        '--regex',
        '-r',
        action='append',
        help='Regular expression matching URLs to purge',
    )
    p.add_argument(
        '--delete',
        '-d',
        action='store_true',
        default=False,
        help='Delete webpage files from archive',
    )
    p.add_argument(
        '--yes',
        '-y',
        action='store_true',
        default=False,
        help='Do not prompt for confirmation',
    )
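
    # Note: --regex uses action='append', so the flag may be repeated;
    # a link is purged if ANY of the given patterns matches its URL.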
    args = p.parse_args()

    if args.regex:
        cleanup_index(args.regex, proceed=args.yes, delete=args.delete)
    else:
        p.print_help()