1
0
Fork 0
mirror of synced 2024-06-27 10:30:38 +12:00
ArchiveBox/archivebox/purge.py
2018-12-31 20:53:01 -05:00

55 lines
1.5 KiB
Python
Executable file

#!/usr/bin/env python3
import argparse
import re
from typing import List
from archive import parse_json_link_index
from config import OUTPUT_DIR
from index import write_json_links_index
def cleanup_index(patterns: List[str], yes=False):
    """Remove every link whose URL matches one of ``patterns`` from the index.

    The index is loaded with ``parse_json_link_index(OUTPUT_DIR)``. Each entry
    of its ``'links'`` list is tested by running ``re.search`` against the
    entry's ``'url'`` value: entries matched by at least one pattern are
    reported on stdout, the rest are kept. After confirmation the kept
    entries are written back with ``write_json_links_index(OUTPUT_DIR, ...)``.

    Args:
        patterns: Python regular expression source strings. ``None`` is
            treated like an empty list, so nothing is filtered out.
        yes: When true, skip the interactive confirmation prompt.

    Raises:
        re.error: If one of the patterns is not a valid regular expression.
        SystemExit: With the message ``'aborting'`` when the answer given at
            the prompt is anything other than ``y`` / ``yes``.
    """
    # Compile up front so an invalid pattern fails before the index is read.
    # ``patterns or ()`` guards against None (argparse's value for an
    # 'append' option that was never supplied).
    regexes = [re.compile(p) for p in (patterns or ())]

    index = parse_json_link_index(OUTPUT_DIR)
    links = index['links']

    filtered = []   # (link, first regex that matched its url)
    remaining = []  # links that no regex matched
    for link in links:
        url = link['url']
        matched = next((r for r in regexes if r.search(url)), None)
        if matched is None:
            remaining.append(link)
        else:
            filtered.append((link, matched))

    print("Filtered out {}/{} urls:".format(len(filtered), len(links)))
    for link, regex in filtered:
        print(" {url} via {regex}".format(url=link['url'], regex=regex.pattern))

    if not yes:
        res = input("Remove {} entries from index? [y/n] ".format(len(filtered)))
        if res.strip().lower() not in ('y', 'yes'):
            # Raise SystemExit directly: the bare ``exit`` helper is injected
            # by the ``site`` module and is not guaranteed to exist.
            raise SystemExit('aborting')

    write_json_links_index(OUTPUT_DIR, remaining)
if __name__ == '__main__':
    # Command-line entry point: collect the regexes and the confirmation flag,
    # then hand them to cleanup_index().
    #
    # The text is passed as ``description``; the first positional parameter of
    # ArgumentParser is ``prog``, which would replace the script name in the
    # usage line.
    p = argparse.ArgumentParser(description='Index purging tool')
    # ``required=True`` makes argparse report a usage error when no regex is
    # given; an 'append' option that is never supplied would otherwise be None.
    p.add_argument(
        '--regex', '-r',
        action='append',
        required=True,
        help='Python regex to filter out',
    )
    p.add_argument(
        '--yes',
        action='store_true',
        default=False,
        help='Do not prompt for confirmation',
    )
    args = p.parse_args()
    cleanup_index(args.regex, yes=args.yes)