From bd9f3e313fb419eaa7025942cb522bb80b9b474d Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Fri, 22 Mar 2019 15:09:39 -0400
Subject: [PATCH] better logging during long output

---
 archivebox/archive.py         |  2 +-
 archivebox/archive_methods.py | 47 ++++++++++++++++-------------------
 archivebox/index.py           |  5 +++-
 archivebox/links.py           | 31 +++++++++--------------
 archivebox/logs.py            | 17 ++++++++++---
 archivebox/util.py            | 33 ++++++++----------------
 6 files changed, 63 insertions(+), 72 deletions(-)

diff --git a/archivebox/archive.py b/archivebox/archive.py
index 7c8fb939..e13d83c9 100755
--- a/archivebox/archive.py
+++ b/archivebox/archive.py
@@ -94,7 +94,7 @@ def main(*args):
 
 def update_archive_data(import_path=None, resume=None):
-    """The main ArchiveBox entrancepoint. Everything starts here."""
+    """The main ArchiveBox entrancepoint. Everything starts here."""
 
     check_dependencies()
 
     # Step 1: Load list of links from the existing index
diff --git a/archivebox/archive_methods.py b/archivebox/archive_methods.py
index c2b93e92..39d6448a 100644
--- a/archivebox/archive_methods.py
+++ b/archivebox/archive_methods.py
@@ -1,6 +1,5 @@
 import os
 
-from functools import wraps
 from collections import defaultdict
 from datetime import datetime
 
@@ -50,10 +49,9 @@ from util import (
     run, PIPE, DEVNULL
 )
 from logs import (
-    _LAST_RUN_STATS,
     log_link_archiving_started,
     log_link_archiving_finished,
-    log_archive_method_starting,
+    log_archive_method_started,
     log_archive_method_finished,
 )
 
@@ -94,6 +92,7 @@ def archive_link(link_dir, link):
                 link['history'][method_name] = []
             if method_name not in link['latest']:
                 link['latest'][method_name] = None
+
             if not should_run(link_dir, link):
                 continue
 
@@ -101,7 +100,7 @@ def archive_link(link_dir, link):
 
             skipped_entirely = False
             print()
-            log_archive_method_starting(method_name)
+            log_archive_method_started(method_name)
             result = method_function(link_dir, link)
             log_archive_method_finished(result)
 
@@ -109,11 +108,6 @@ def archive_link(link_dir, link):
             if result['status'] == 'succeeded':
                 link['latest'][method_name] = result['output']
 
-            if result['status'] != 'skipped':
-                made_changes = True
-
-            _LAST_RUN_STATS[result['status']] += 1
-
             write_link_index(link_dir, link)
             patch_links_index(link)
 
@@ -126,6 +120,7 @@ def archive_link(link_dir, link):
 
     return link
 
+### Archive Method Functions
 
 def should_fetch_title(link_dir, link):
     # if link already has valid title, skip it
@@ -428,8 +423,8 @@ def should_fetch_git(link_dir, link):
         return False
 
     is_clonable_url = (
-        domain(link['url']) in GIT_DOMAINS
-        or extension(link['url']) == 'git'
+        (domain(link['url']) in GIT_DOMAINS)
+        or (extension(link['url']) == 'git')
     )
     if not is_clonable_url:
         return False
@@ -477,6 +472,7 @@ def fetch_git(link_dir, link, timeout=TIMEOUT):
         **timer.stats,
     }
 
+
 def should_fetch_media(link_dir, link):
     if is_static_file(link['url']):
         return False
@@ -547,21 +543,6 @@ def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT):
         **timer.stats,
     }
 
-def parse_archive_dot_org_response(response):
-    # Parse archive.org response headers
-    headers = defaultdict(list)
-
-    # lowercase all the header names and store in dict
-    for header in response.splitlines():
-        if b':' not in header or not header.strip():
-            continue
-        name, val = header.decode().split(':', 1)
-        headers[name.lower().strip()].append(val.strip())
-
-    # Get successful archive url in "content-location" header or any errors
-    content_location = headers['content-location']
-    errors = headers['x-archive-wayback-runtime-error']
-    return content_location, errors
 
 def should_fetch_archive_dot_org(link_dir, link):
     if is_static_file(link['url']):
@@ -627,4 +608,18 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT):
         **timer.stats,
     }
 
 
+def parse_archive_dot_org_response(response):
+    # Parse archive.org response headers
+    headers = defaultdict(list)
+    # lowercase all the header names and store in dict
+    for header in response.splitlines():
+        if b':' not in header or not header.strip():
+            continue
+        name, val = header.decode().split(':', 1)
+        headers[name.lower().strip()].append(val.strip())
+
+    # Get successful archive url in "content-location" header or any errors
+    content_location = headers['content-location']
+    errors = headers['x-archive-wayback-runtime-error']
+    return content_location, errors
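
For reference, the parse_archive_dot_org_response() helper that this diff relocates below archive_dot_org() can be exercised on its own. A minimal sketch follows; the function body is copied from the hunk above, while the sample response header blob and the call site are made up for illustration (in the real code the headers presumably come back from the Wayback Machine submission request inside archive_dot_org()):

    from collections import defaultdict

    def parse_archive_dot_org_response(response):
        # collect header values into a case-insensitive multi-dict
        headers = defaultdict(list)
        for header in response.splitlines():
            if b':' not in header or not header.strip():
                continue
            name, val = header.decode().split(':', 1)
            headers[name.lower().strip()].append(val.strip())
        content_location = headers['content-location']
        errors = headers['x-archive-wayback-runtime-error']
        return content_location, errors

    # made-up example of a response header blob:
    sample = b'HTTP/1.1 200 OK\r\nContent-Location: /web/20190322190939/https://example.com/\r\n'
    locations, errors = parse_archive_dot_org_response(sample)
    print(locations)   # ['/web/20190322190939/https://example.com/']
    print(errors)      # []
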
diff --git a/archivebox/index.py b/archivebox/index.py
index 2b27a067..5ef86013 100644
--- a/archivebox/index.py
+++ b/archivebox/index.py
@@ -26,6 +26,7 @@ from util import (
 from parse import parse_links
 from links import validate_links
 from logs import (
+    log_indexing_process_started,
     log_indexing_started,
     log_indexing_finished,
     log_parsing_started,
@@ -40,12 +41,14 @@ TITLE_LOADING_MSG = 'Not yet archived...'
 
 def write_links_index(out_dir, links, finished=False):
     """create index.html file for a given list of links"""
 
-    log_indexing_started()
+    log_indexing_process_started()
     check_links_structure(links)
 
+    log_indexing_started(out_dir, 'index.json')
     write_json_links_index(out_dir, links)
     log_indexing_finished(out_dir, 'index.json')
 
+    log_indexing_started(out_dir, 'index.html')
     write_html_links_index(out_dir, links, finished=finished)
     log_indexing_finished(out_dir, 'index.html')
diff --git a/archivebox/links.py b/archivebox/links.py
index a83333f3..155b9372 100644
--- a/archivebox/links.py
+++ b/archivebox/links.py
@@ -3,33 +3,26 @@
 In ArchiveBox, a Link represents a single entry that we track in the
 json index.  All links pass through all archiver functions and the latest,
 most up-to-date canonical output for each is stored in "latest".
-
 Link {
-    timestamp: str,     (how we uniquely id links)       _   _  _ _  ___
-    url: str,                                           | \ / \ |\| ' |
-    base_url: str,                                      |_/ \_/ | | |
-    domain: str,                                         _   _ _ _ _  _
-    tags: str,                                          |_) /| |\| | / `
-    type: str,                                          |  /"| | | | \_,
-    title: str,                                             ,-'"`-.
-    sources: [str],                                     /// /  @ @  \ \\\\
-    latest: {                                           \ :=| ,._,. |=: /
-        ...,                                            || ,\ \_../ /. ||
-        pdf: 'output.pdf',                              ||','`-._))'`.`||
-        wget: 'example.com/1234/index.html'             `-'     (/    `-'
+    timestamp: str,     (how we uniquely id links)
+    url: str,
+    title: str,
+    tags: str,
+    sources: [str],
+    latest: {
+        ...,
+        pdf: 'output.pdf',
+        wget: 'example.com/1234/index.html',
+        screenshot: null,
     },
     history: {
-        ...
         pdf: [
-            {timestamp: 15444234325, status: 'skipped', result='output.pdf'},
+            {start_ts, end_ts, duration, cmd, pwd, status, output},
             ...
         ],
-        wget: [
-            {timestamp: 11534435345, status: 'succeded', result='donuts.com/eat/them.html'}
-        ]
+        ...
     },
 }
-
 """
 
 from html import unescape
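
To make the reshaped schema above concrete, here is a hypothetical Link entry written out as a Python dict. Every value is invented for illustration; only the keys follow the docstring and the new history-entry fields (start_ts, end_ts, duration, cmd, pwd, status, output):

    link = {
        'timestamp': '1553280579',
        'url': 'https://example.com/abc?v=zzVa_tX1OiI',
        'title': 'Example Page',
        'tags': 'example,docs',
        'sources': ['feeds/example.rss'],
        'latest': {
            'pdf': 'output.pdf',
            'wget': 'example.com/abc/index.html?v=zzVa_tX1OiI.html',
            'screenshot': None,
        },
        'history': {
            'pdf': [
                {
                    'start_ts': '2019-03-22 15:09:39',
                    'end_ts': '2019-03-22 15:09:41',
                    'duration': 2,
                    'cmd': ['chromium-browser', '--headless', '--print-to-pdf', 'https://example.com/abc?v=zzVa_tX1OiI'],
                    'pwd': 'output/archive/1553280579',
                    'status': 'succeeded',
                    'output': 'output.pdf',
                },
            ],
        },
    }
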
diff --git a/archivebox/logs.py b/archivebox/logs.py
index 8d87d032..36b92682 100644
--- a/archivebox/logs.py
+++ b/archivebox/logs.py
@@ -45,13 +45,21 @@ def log_link_archiving_started(link_dir, link, is_new):
     ))
 
 def log_link_archiving_finished(link_dir, link, is_new, skipped_entirely):
+    if all(output == 'succeeded' for output in link['latest']):
+        _LAST_RUN_STATS['succeeded'] += 1
+    elif skipped_entirely or all(output == 'skipped' for output in link['latest']):
+        _LAST_RUN_STATS['skipped'] += 1
+    else:
+        _LAST_RUN_STATS['failed'] += 1
+        # import ipdb; ipdb.set_trace()
+
     if skipped_entirely:
         print('\r    √ {}{}'.format(
             pretty_path(link_dir),
             ' (new)' if is_new else '',
         ))
 
-def log_archive_method_starting(method):
+def log_archive_method_started(method):
     print('      > {}'.format(method))
 
 def log_archive_method_finished(result):
@@ -117,7 +125,7 @@ def log_parsing_finished(num_new_links, parser_name):
         parser_name,
     ))
 
-def log_indexing_started():
+def log_indexing_process_started():
     start_ts = datetime.now()
     _LAST_RUN_STATS['index_start_ts'] = start_ts
     print('{green}[*] [{}] Saving main index files...{reset}'.format(
         **ANSI,
     ))
 
+def log_indexing_started(out_dir, out_file):
+    sys.stdout.write('    > {}/{}'.format(pretty_path(out_dir), out_file))
+
 def log_indexing_finished(out_dir, out_file):
     end_ts = datetime.now()
     _LAST_RUN_STATS['index_end_ts'] = end_ts
-    print('    √ {}/{}'.format(pretty_path(out_dir), out_file))
+    print('\r    √ {}/{}'.format(pretty_path(out_dir), out_file))
 
 def log_archiving_started(num_links, resume):
     start_ts = datetime.now()
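
The logging change above boils down to a two-step pattern: write the in-progress line with sys.stdout.write() and no trailing newline, then overwrite it in place with a '\r'-prefixed print() once the step finishes. A minimal standalone sketch of that pattern; the function names and exact spacing here are illustrative, not the real logs.py code:

    import sys

    def start_step(path):
        # leave the cursor on the line so it can be rewritten later
        sys.stdout.write('    > {}'.format(path))
        sys.stdout.flush()

    def finish_step(path):
        # '\r' jumps back to column 0, so the finished line prints over the pending one
        print('\r    √ {}'.format(path))

    start_step('output/index.json')
    # ... long-running work happens here ...
    finish_step('output/index.json')
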
diff --git a/archivebox/util.py b/archivebox/util.py
index 7c6378af..e29f546c 100644
--- a/archivebox/util.py
+++ b/archivebox/util.py
@@ -314,10 +314,20 @@ def wget_output_path(link):
     # Wget downloads can save in a number of different ways depending on the url:
     #    https://example.com
     #       > output/archive/<timestamp>/example.com/index.html
+    #    https://example.com?v=zzVa_tX1OiI
+    #       > output/archive/<timestamp>/example.com/index.html?v=zzVa_tX1OiI.html
+    #    https://www.example.com/?v=zzVa_tX1OiI
+    #       > output/archive/<timestamp>/example.com/index.html?v=zzVa_tX1OiI.html
+
     #    https://example.com/abc
     #       > output/archive/<timestamp>/example.com/abc.html
     #    https://example.com/abc/
     #       > output/archive/<timestamp>/example.com/abc/index.html
+    #    https://example.com/abc?v=zzVa_tX1OiI.html
+    #       > output/archive/<timestamp>/example.com/abc?v=zzVa_tX1OiI.html
+    #    https://example.com/abc/?v=zzVa_tX1OiI.html
+    #       > output/archive/<timestamp>/example.com/abc/index.html?v=zzVa_tX1OiI.html
+
     #    https://example.com/abc/test.html
     #       > output/archive/<timestamp>/example.com/abc/test.html
     #    https://example.com/abc/test?v=zzVa_tX1OiI
     #       > output/archive/<timestamp>/example.com/abc/test/index.html?v=zzVa_tX1OiI.html
@@ -326,7 +336,7 @@ def wget_output_path(link):
 
     # There's also lots of complexity around how the urlencoding and renaming
-    # is done for pages with query and hash fragments or extensions like shtml / htm
+    # is done for pages with query and hash fragments or extensions like shtml / htm / php / etc
 
     # Since the wget algorithm for -E (appending .html) is incredibly complex
     # and there's no way to get the computed output path from wget
 
@@ -359,27 +369,6 @@ def wget_output_path(link):
 
     return None
 
-    # If finding the actual output file didn't work, fall back to the buggy
-    # implementation of the wget .html appending algorithm
-    # split_url = link['url'].split('#', 1)
-    # query = ('%3F' + link['url'].split('?', 1)[-1]) if '?' in link['url'] else ''
-
-    # if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", split_url[0], re.I | re.M):
-    #     # already ends in .html
-    #     return urlencode(base_url(link['url']))
-    # else:
-    #     # .html needs to be appended
-    #     without_scheme = split_url[0].split('://', 1)[-1].split('?', 1)[0]
-    #     if without_scheme.endswith('/'):
-    #         if query:
-    #             return urlencode('#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]]))
-    #         return urlencode('#'.join([without_scheme + 'index.html', *split_url[1:]]))
-    #     else:
-    #         if query:
-    #             return urlencode('#'.join([without_scheme + '/index.html' + query + '.html', *split_url[1:]]))
-    #         elif '/' in without_scheme:
-    #             return urlencode('#'.join([without_scheme + '.html', *split_url[1:]]))
-    #         return urlencode(base_url(link['url']) + '/index.html')
 
 ### String Manipulation & Logging Helpers
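
Because wget's -E renaming rules are, as the comments above say, too complex to re-derive reliably, the surviving strategy is to look at what wget actually wrote to disk. Below is a rough, hypothetical sketch of that search-the-filesystem idea; find_wget_output() and its preference order are invented for illustration and are not the real wget_output_path() from util.py:

    import os

    def find_wget_output(link_dir, domain):
        # look for any .html/.htm file wget produced under link_dir/domain/
        search_dir = os.path.join(link_dir, domain)
        candidates = []
        for root, _dirs, files in os.walk(search_dir):
            for fname in files:
                if fname.endswith('.html') or fname.endswith('.htm'):
                    rel = os.path.relpath(os.path.join(root, fname), link_dir)
                    candidates.append(rel)
        # prefer shallower paths, and index.html over other names at the same depth
        candidates.sort(key=lambda p: (p.count(os.sep), not p.endswith('index.html')))
        return candidates[0] if candidates else None

    # e.g. find_wget_output('output/archive/1553280579', 'example.com')
    # might return 'example.com/abc/index.html?v=zzVa_tX1OiI.html'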