From d06775923b3876c63895d0c16d092e07ddb075cb Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 22 Mar 2019 21:38:08 -0400 Subject: [PATCH] move latest to derived data using history --- archivebox/archive_methods.py | 4 ---- archivebox/index.py | 8 +++---- archivebox/links.py | 30 +------------------------ archivebox/util.py | 41 +++++++++++++++++++++++++---------- 4 files changed, 34 insertions(+), 49 deletions(-) diff --git a/archivebox/archive_methods.py b/archivebox/archive_methods.py index 39d6448a..8b0bf103 100644 --- a/archivebox/archive_methods.py +++ b/archivebox/archive_methods.py @@ -90,8 +90,6 @@ def archive_link(link_dir, link): for method_name, should_run, method_function in ARCHIVE_METHODS: if method_name not in link['history']: link['history'][method_name] = [] - if method_name not in link['latest']: - link['latest'][method_name] = None if not should_run(link_dir, link): continue @@ -105,8 +103,6 @@ def archive_link(link_dir, link): log_archive_method_finished(result) link['history'][method_name].append(result) - if result['status'] == 'succeeded': - link['latest'][method_name] = result['output'] write_link_index(link_dir, link) patch_links_index(link) diff --git a/archivebox/index.py b/archivebox/index.py index 5ef86013..3f4ada3f 100644 --- a/archivebox/index.py +++ b/archivebox/index.py @@ -22,6 +22,7 @@ from util import ( check_link_structure, check_links_structure, wget_output_path, + latest_output, ) from parse import parse_links from links import validate_links @@ -168,8 +169,8 @@ def write_html_links_index(out_dir, links, finished=False): def patch_links_index(link, out_dir=OUTPUT_DIR): """hack to in-place update one row's info in the generated index html""" - title = link['latest']['title'] - successful = len([entry for entry in link['latest'].values() if entry]) + title = link['title'] or latest_output(link)['title'] + successful = len(tuple(filter(None, latest_output(link).values()))) # Patch JSON index changed = False @@ -177,7 
+178,6 @@ def patch_links_index(link, out_dir=OUTPUT_DIR): for saved_link in json_file_links: if saved_link['url'] == link['url']: saved_link['title'] = title - saved_link['latest'] = link['latest'] saved_link['history'] = link['history'] changed = True break @@ -235,12 +235,10 @@ def load_json_link_index(out_dir, link): **link, } link.update({ - 'latest': link.get('latest') or {}, 'history': link.get('history') or {}, }) check_link_structure(link) - return link def write_html_link_index(out_dir, link): diff --git a/archivebox/links.py b/archivebox/links.py index 155b9372..b3e10356 100644 --- a/archivebox/links.py +++ b/archivebox/links.py @@ -9,12 +9,6 @@ Link { title: str, tags: str, sources: [str], - latest: { - ..., - pdf: 'output.pdf', - wget: 'example.com/1234/index.html', - screenshot: null, - }, history: { pdf: [ {start_ts, end_ts, duration, cmd, pwd, status, output}, @@ -30,7 +24,6 @@ from collections import OrderedDict from util import ( merge_links, - wget_output_path, check_link_structure, check_links_structure, ) @@ -47,30 +40,9 @@ def validate_links(links): raise SystemExit(1) for link in links: + link['title'] = unescape(link['title'].strip()) if link['title'].strip() else None check_link_structure(link) - link['title'] = unescape(link['title']) if link['title'] else None - link['latest'] = link.get('latest') or {} - - latest = link['latest'] - if not link['latest'].get('wget'): - link['latest']['wget'] = wget_output_path(link) - - if not link['latest'].get('pdf'): - link['latest']['pdf'] = None - - if not link['latest'].get('screenshot'): - link['latest']['screenshot'] = None - - if not link['latest'].get('dom'): - link['latest']['dom'] = None - - if not latest.get('favicon'): - latest['favicon'] = None - - if not link['latest'].get('title'): - link['latest']['title'] = link['title'] - return list(links) diff --git a/archivebox/util.py b/archivebox/util.py index 70b57336..9dc47540 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -118,12 
+118,6 @@ def check_link_structure(link): assert isinstance(key, str) assert isinstance(val, list), 'history must be a Dict[str, List], got: {}'.format(link['history']) - if 'latest' in link: - assert isinstance(link['latest'], dict), 'latest must be a Dict' - for key, val in link['latest'].items(): - assert isinstance(key, str) - assert (val is None) or isinstance(val, (str, Exception)), 'latest must be a Dict[str, Optional[str]], got: {}'.format(link['latest']) - def check_links_structure(links): """basic sanity check invariants to make sure the data is valid""" assert isinstance(links, list) @@ -304,10 +298,6 @@ def wget_output_path(link): See docs on wget --adjust-extension (-E) """ - # if we have it stored, always prefer the actual output path to computed one - if link.get('latest', {}).get('wget'): - return link['latest']['wget'] - if is_static_file(link['url']): return without_scheme(without_fragment(link['url'])) @@ -433,7 +423,7 @@ def derived_link_info(link): link['timestamp'], domain(url), )), - 'num_outputs': len([entry for entry in link['latest'].values() if entry]) if 'latest' in link else 0, + 'num_outputs': len([entry for entry in latest_output(link).values() if entry]), } # Archive Method Output URLs @@ -465,6 +455,35 @@ def derived_link_info(link): return extended_info +def latest_output(link, status=None): + """get the latest output that each archive method produced for link""" + + latest = { + 'title': None, + 'favicon': None, + 'wget': None, + 'warc': None, + 'pdf': None, + 'screenshot': None, + 'dom': None, + 'git': None, + 'media': None, + 'archive_org': None, + } + for archive_method in latest.keys(): + # get most recent successful result in history for each archive method + history = link.get('history', {}).get(archive_method) or [] + history = filter(lambda result: result['output'], reversed(history)) + if status is not None: + history = filter(lambda result: result['status'] == status, history) + + history = list(history) + if history: + 
latest[archive_method] = history[0]['output'] + + return latest + + ### Python / System Helpers def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs):