
move latest to derived data using history

Nick Sweeting 2019-03-22 21:38:08 -04:00
parent 69f837bbf6
commit d06775923b
4 changed files with 34 additions and 49 deletions
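
The net effect of the diff below: the persisted link record keeps only its per-method 'history', and the freshest output for each archive method is derived from that history on demand by the new latest_output() helper. A rough before/after sketch of the stored shape, using the field names from the Link docstring further down (the concrete values here are invented for illustration):

    # Hypothetical data, for illustration only.
    # Before: both 'latest' and 'history' were stored in the JSON index.
    link_before = {
        'url': 'https://example.com',
        'latest': {'wget': 'example.com/1234/index.html', 'pdf': None},
        'history': {'wget': [{'status': 'succeeded', 'output': 'example.com/1234/index.html'}]},
    }

    # After: only 'history' is stored; 'latest' is recomputed as latest_output(link).
    link_after = {
        'url': 'https://example.com',
        'history': {'wget': [{'status': 'succeeded', 'output': 'example.com/1234/index.html'}]},
    }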


@@ -90,8 +90,6 @@ def archive_link(link_dir, link):
     for method_name, should_run, method_function in ARCHIVE_METHODS:
         if method_name not in link['history']:
             link['history'][method_name] = []
-        if method_name not in link['latest']:
-            link['latest'][method_name] = None
 
         if not should_run(link_dir, link):
             continue
@@ -105,8 +103,6 @@ def archive_link(link_dir, link):
         log_archive_method_finished(result)
 
         link['history'][method_name].append(result)
-        if result['status'] == 'succeeded':
-            link['latest'][method_name] = result['output']
 
     write_link_index(link_dir, link)
     patch_links_index(link)
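
With the 'latest' bookkeeping gone, the loop above only appends one result dict per attempt to link['history'][method_name]. A minimal sketch of what such an entry looks like, using the result fields listed in the Link docstring below; the values are made up:

    # Hypothetical history entry (field names from the Link schema; values invented).
    result = {
        'start_ts': '1553305088.0',
        'end_ts': '1553305090.5',
        'duration': 2500,
        'cmd': ['wget', '--adjust-extension', 'https://example.com'],
        'pwd': '/data/archive/1553305088',
        'status': 'succeeded',
        'output': 'example.com/index.html',
    }
    link['history']['wget'].append(result)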


@@ -22,6 +22,7 @@ from util import (
     check_link_structure,
     check_links_structure,
     wget_output_path,
+    latest_output,
 )
 from parse import parse_links
 from links import validate_links
@@ -168,8 +169,8 @@ def write_html_links_index(out_dir, links, finished=False):
 def patch_links_index(link, out_dir=OUTPUT_DIR):
     """hack to in-place update one row's info in the generated index html"""
 
-    title = link['latest']['title']
-    successful = len([entry for entry in link['latest'].values() if entry])
+    title = link['title'] or latest_output(link)['title']
+    successful = len(tuple(filter(None, latest_output(link).values())))
 
     # Patch JSON index
     changed = False
@@ -177,7 +178,6 @@ def patch_links_index(link, out_dir=OUTPUT_DIR):
     for saved_link in json_file_links:
         if saved_link['url'] == link['url']:
             saved_link['title'] = title
-            saved_link['latest'] = link['latest']
             saved_link['history'] = link['history']
             changed = True
             break
@@ -235,12 +235,10 @@ def load_json_link_index(out_dir, link):
         **link,
     }
     link.update({
-        'latest': link.get('latest') or {},
         'history': link.get('history') or {},
     })
     check_link_structure(link)
 
     return link
 
 def write_html_link_index(out_dir, link):
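
For clarity, the two rewritten lines in patch_links_index above amount to something like the following (a sketch, assuming latest_output() returns a dict mapping each archive method name to its most recent output or None):

    # Sketch of the new derivation in patch_links_index, not the full function.
    outputs = latest_output(link)                  # e.g. {'title': ..., 'wget': 'example.com/index.html', 'pdf': None, ...}
    title = link['title'] or outputs['title']      # prefer the stored title, else the most recently fetched one
    successful = len(tuple(filter(None, outputs.values())))  # count methods that produced any output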


@@ -9,12 +9,6 @@ Link {
     title: str,
     tags: str,
     sources: [str],
-    latest: {
-        ...,
-        pdf: 'output.pdf',
-        wget: 'example.com/1234/index.html',
-        screenshot: null,
-    },
     history: {
         pdf: [
             {start_ts, end_ts, duration, cmd, pwd, status, output},
@@ -30,7 +24,6 @@ from collections import OrderedDict
 
 from util import (
     merge_links,
-    wget_output_path,
     check_link_structure,
     check_links_structure,
 )
@@ -47,30 +40,9 @@ def validate_links(links):
         raise SystemExit(1)
 
     for link in links:
+        link['title'] = unescape(link['title'].strip()) if link['title'].strip() else None
         check_link_structure(link)
-        link['title'] = unescape(link['title']) if link['title'] else None
-        link['latest'] = link.get('latest') or {}
-        latest = link['latest']
-        if not link['latest'].get('wget'):
-            link['latest']['wget'] = wget_output_path(link)
-        if not link['latest'].get('pdf'):
-            link['latest']['pdf'] = None
-        if not link['latest'].get('screenshot'):
-            link['latest']['screenshot'] = None
-        if not link['latest'].get('dom'):
-            link['latest']['dom'] = None
-        if not latest.get('favicon'):
-            latest['favicon'] = None
-        if not link['latest'].get('title'):
-            link['latest']['title'] = link['title']
 
     return list(links)
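
One behavioral nuance in the validate_links change above: stripping before the truthiness check means a whitespace-only title now collapses to None instead of surviving unescaped. A quick illustration (assuming unescape comes from the stdlib html module, as elsewhere in this codebase):

    from html import unescape

    title = '   '
    old_title = unescape(title) if title else None                   # '   '
    new_title = unescape(title.strip()) if title.strip() else None   # None
    assert new_title is None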


@@ -118,12 +118,6 @@ def check_link_structure(link):
             assert isinstance(key, str)
             assert isinstance(val, list), 'history must be a Dict[str, List], got: {}'.format(link['history'])
 
-    if 'latest' in link:
-        assert isinstance(link['latest'], dict), 'latest must be a Dict'
-        for key, val in link['latest'].items():
-            assert isinstance(key, str)
-            assert (val is None) or isinstance(val, (str, Exception)), 'latest must be a Dict[str, Optional[str]], got: {}'.format(link['latest'])
 
 def check_links_structure(links):
     """basic sanity check invariants to make sure the data is valid"""
     assert isinstance(links, list)
@@ -304,10 +298,6 @@ def wget_output_path(link):
     See docs on wget --adjust-extension (-E)
     """
 
-    # if we have it stored, always prefer the actual output path to computed one
-    if link.get('latest', {}).get('wget'):
-        return link['latest']['wget']
-
     if is_static_file(link['url']):
         return without_scheme(without_fragment(link['url']))
@@ -433,7 +423,7 @@ def derived_link_info(link):
             link['timestamp'],
             domain(url),
         )),
-        'num_outputs': len([entry for entry in link['latest'].values() if entry]) if 'latest' in link else 0,
+        'num_outputs': len([entry for entry in latest_output(link).values() if entry]),
     }
 
     # Archive Method Output URLs
@@ -465,6 +455,35 @@ def derived_link_info(link):
     return extended_info
 
 
+def latest_output(link, status=None):
+    """get the latest output that each archive method produced for link"""
+
+    latest = {
+        'title': None,
+        'favicon': None,
+        'wget': None,
+        'warc': None,
+        'pdf': None,
+        'screenshot': None,
+        'dom': None,
+        'git': None,
+        'media': None,
+        'archive_org': None,
+    }
+    for archive_method in latest.keys():
+        # get most recent successful result in history for each archive method
+        history = link.get('history', {}).get(archive_method) or []
+        history = filter(lambda result: result['output'], reversed(history))
+        if status is not None:
+            history = filter(lambda result: result['status'] == status, history)
+
+        history = list(history)
+        if history:
+            latest[archive_method] = history[0]['output']
+
+    return latest
+
 
 ### Python / System Helpers
 
 def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs):
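
As a usage sketch of the latest_output helper added above: it walks each method's history newest-first and keeps the first entry that produced any output, optionally restricted to a given status. A hand-built example (illustrative data only):

    # Illustration only: a minimal link dict with two wget attempts in history.
    link = {
        'url': 'https://example.com',
        'history': {
            'wget': [
                {'status': 'failed', 'output': None},
                {'status': 'succeeded', 'output': 'example.com/index.html'},
            ],
        },
    }

    latest_output(link)['wget']                    # -> 'example.com/index.html'
    latest_output(link, status='failed')['wget']   # -> None (the failed attempt produced no output)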