From d7981170810cfcaf12978226c00e3e95568a4659 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 21 Mar 2019 05:35:41 -0400 Subject: [PATCH] better separation of archive method checking and running logic --- archivebox/archive_methods.py | 473 ++++++++++++++++++---------------- archivebox/index.py | 16 +- archivebox/logs.py | 67 +++-- archivebox/parse.py | 8 +- archivebox/stdlib_patches.py | 167 ------------ archivebox/util.py | 172 +++++++++---- 6 files changed, 424 insertions(+), 479 deletions(-) delete mode 100644 archivebox/stdlib_patches.py diff --git a/archivebox/archive_methods.py b/archivebox/archive_methods.py index e3578e20..b403f637 100644 --- a/archivebox/archive_methods.py +++ b/archivebox/archive_methods.py @@ -3,7 +3,6 @@ import os from functools import wraps from collections import defaultdict from datetime import datetime -from stdlib_patches import run, PIPE, DEVNULL from index import ( write_link_index, @@ -43,16 +42,18 @@ from util import ( without_fragment, fetch_page_title, is_static_file, - progress, + TimedProgress, chmod_file, - check_link_structure, wget_output_path, chrome_args, + check_link_structure, + run, PIPE, DEVNULL ) from logs import ( _LAST_RUN_STATS, log_link_archiving_started, - log_link_archiving_failed, + log_archive_method_starting, + log_archive_method_finished, ) @@ -63,21 +64,20 @@ class ArchiveError(Exception): self.hints = hints -def archive_link(link_dir, link, overwrite=True): +def archive_link(link_dir, link): """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp""" ARCHIVE_METHODS = ( - (FETCH_TITLE, fetch_title), - (FETCH_FAVICON, fetch_favicon), - (FETCH_WGET, fetch_wget), - (FETCH_PDF, fetch_pdf), - (FETCH_SCREENSHOT, fetch_screenshot), - (FETCH_DOM, fetch_dom), - (FETCH_GIT, fetch_git), - (FETCH_MEDIA, fetch_media), - (SUBMIT_ARCHIVE_DOT_ORG, archive_dot_org), + ('title', should_fetch_title, fetch_title), + ('favicon', should_fetch_favicon, fetch_favicon), + ('wget', should_fetch_wget, fetch_wget), + ('pdf', should_fetch_pdf, fetch_pdf), + ('screenshot', should_fetch_screenshot, fetch_screenshot), + ('dom', should_fetch_dom, fetch_dom), + ('git', should_fetch_git, fetch_git), + ('media', should_fetch_media, fetch_media), + ('archive_org', should_fetch_archive_dot_org, archive_dot_org), ) - active_methods = [method for toggle, method in ARCHIVE_METHODS if toggle] try: is_new = not os.path.exists(link_dir) @@ -87,109 +87,88 @@ def archive_link(link_dir, link, overwrite=True): link = load_json_link_index(link_dir, link) log_link_archiving_started(link_dir, link, is_new) - for archive_method in active_methods: - archive_method(link_dir, link, overwrite=overwrite) + for method_name, should_run, method_function in ARCHIVE_METHODS: + if method_name not in link['history']: + link['history'][method_name] = [] + if method_name not in link['latest']: + link['latest'][method_name] = None + if not should_run(link_dir, link): + continue + + log_archive_method_starting(method_name) + result = method_function(link_dir, link) + log_archive_method_finished(result) + + link['history'][method_name].append(result) + if result['status'] == 'succeeded': + link['latest'][method_name] = result['output'] + + _LAST_RUN_STATS[result['status']] += 1 write_link_index(link_dir, link) patch_links_index(link) except Exception as err: print(' ! 
Failed to archive link: {}: {}'.format(err.__class__.__name__, err)) + raise return link -def attach_result_to_link(method): - """ - Instead of returning a result={output:'...', status:'success'} object, - attach that result to the links's history & latest fields, then return - the updated link object. - """ - def decorator(fetch_func): - @wraps(fetch_func) - def timed_fetch_func(link_dir, link, overwrite=False, **kwargs): - # initialize methods and history json field on link - link['latest'] = link.get('latest') or {} - link['latest'][method] = link['latest'].get(method) or None - link['history'] = link.get('history') or {} - link['history'][method] = link['history'].get(method) or [] - start_ts = datetime.now().timestamp() +def should_fetch_title(link_dir, link): + # if link already has valid title, skip it + if link['title'] and not link['title'].lower().startswith('http'): + return False - # if a valid method output is already present, dont run the fetch function - if link['latest'][method] and not overwrite: - print(' √ {}'.format(method)) - result = None - else: - print(' > {}'.format(method)) - result = fetch_func(link_dir, link, **kwargs) + if is_static_file(link['url']): + return False - end_ts = datetime.now().timestamp() - duration = str(end_ts * 1000 - start_ts * 1000).split('.')[0] + return FETCH_TITLE - # append a history item recording fail/success - history_entry = { - 'timestamp': str(start_ts).split('.')[0], - } - if result is None: - history_entry['status'] = 'skipped' - elif isinstance(result.get('output'), Exception): - history_entry['status'] = 'failed' - history_entry['duration'] = duration - history_entry.update(result or {}) - link['history'][method].append(history_entry) - else: - history_entry['status'] = 'succeded' - history_entry['duration'] = duration - history_entry.update(result or {}) - link['history'][method].append(history_entry) - link['latest'][method] = result['output'] - - _LAST_RUN_STATS[history_entry['status']] += 1 - - return link - return timed_fetch_func - return decorator - -@attach_result_to_link('title') def fetch_title(link_dir, link, timeout=TIMEOUT): """try to guess the page's title from its content""" - # if link already has valid title, skip it - if link['title'] and not link['title'].lower().startswith('http'): - return {'output': link['title'], 'status': 'skipped'} - - if is_static_file(link['url']): - return {'output': None, 'status': 'skipped'} - - end = progress(timeout, prefix=' ') + output = None + cmd = [ + CURL_BINARY, + link['url'], + '|', + 'grep', + '', + ] + status = 'succeeded' + timer = TimedProgress(timeout, prefix=' ') try: - title = fetch_page_title(link['url'], timeout=timeout, progress=False) - end() - output = title - except Exception as e: - end() - output = e - print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset'])) - - if title and title.strip(): - link['title'] = title - output = title + output = fetch_page_title(link['url'], timeout=timeout, progress=False) + if not output: + raise ArchiveError('Unable to detect page title') + except Exception as err: + status = 'failed' + output = err + finally: + timer.end() return { - 'cmd': 'fetch_page_title("{}")'.format(link['url']), + 'cmd': cmd, + 'pwd': link_dir, 'output': output, + 'status': status, + **timer.stats, } -@attach_result_to_link('favicon') + +def should_fetch_favicon(link_dir, link): + if os.path.exists(os.path.join(link_dir, 'favicon.ico')): + return False + + return FETCH_FAVICON + def fetch_favicon(link_dir, link, 
timeout=TIMEOUT): """download site favicon from google's favicon api""" output = 'favicon.ico' - if os.path.exists(os.path.join(link_dir, output)): - return {'output': output, 'status': 'skipped'} - - CMD = [ + cmd = [ CURL_BINARY, '--max-time', str(timeout), '--location', @@ -197,37 +176,44 @@ def fetch_favicon(link_dir, link, timeout=TIMEOUT): *(() if CHECK_SSL_VALIDITY else ('--insecure',)), 'https://www.google.com/s2/favicons?domain={}'.format(domain(link['url'])), ] - end = progress(timeout, prefix=' ') + status = 'succeeded' + timer = TimedProgress(timeout, prefix=' ') try: - run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout) - end() + run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout) chmod_file(output, cwd=link_dir) - except Exception as e: - end() - output = e - print_error_hints(cmd=CMD, pwd=link_dir, err=e) + except Exception as err: + status = 'failed' + output = err + finally: + timer.end() return { - 'cmd': CMD, + 'cmd': cmd, + 'pwd': link_dir, 'output': output, + 'status': status, + **timer.stats, } -@attach_result_to_link('wget') +def should_fetch_wget(link_dir, link): + output_path = wget_output_path(link) + if output_path and os.path.exists(os.path.join(link_dir, output_path)): + return False + + return FETCH_WGET + + def fetch_wget(link_dir, link, timeout=TIMEOUT): """download full site using wget""" - domain_dir = os.path.join(link_dir, domain(link['url'])) - existing_file = wget_output_path(link) - if os.path.exists(domain_dir) and existing_file: - return {'output': existing_file, 'status': 'skipped'} - if FETCH_WARC: warc_dir = os.path.join(link_dir, 'warc') os.makedirs(warc_dir, exist_ok=True) warc_path = os.path.join('warc', str(int(datetime.now().timestamp()))) # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html - CMD = [ + output = None + cmd = [ WGET_BINARY, # '--server-response', # print headers for better error parsing '--no-verbose', @@ -248,20 +234,19 @@ def fetch_wget(link_dir, link, timeout=TIMEOUT): *((() if CHECK_SSL_VALIDITY else ('--no-check-certificate', '--no-hsts'))), link['url'], ] - end = progress(timeout, prefix=' ') + status = 'succeeded' + timer = TimedProgress(timeout, prefix=' ') try: - result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout) - end() + result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout) output = wget_output_path(link) + # parse out number of files downloaded from last line of stderr: + # "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)" output_tail = [ line.strip() for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:] if line.strip() ] - - # parse out number of files downloaded from last line of stderr: - # "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)" files_downloaded = ( int(output_tail[-1].strip().split(' ', 2)[1] or 0) if 'Downloaded:' in output_tail[-1] @@ -271,7 +256,7 @@ def fetch_wget(link_dir, link, timeout=TIMEOUT): # Check for common failure cases if result.returncode > 0 and files_downloaded < 1: hints = ( - 'Got wget response code {}:\n'.format(result.returncode), + 'Got wget response code: {}.'.format(result.returncode), *output_tail, ) if b'403: Forbidden' in result.stderr: @@ -281,144 +266,173 @@ def fetch_wget(link_dir, link, timeout=TIMEOUT): if b'ERROR 500: Internal Server Error' in result.stderr: raise ArchiveError('500 Internal Server Error', hints) raise ArchiveError('Got an error from the server', hints) - except Exception as e: - end() - output = e - print_error_hints(cmd=CMD, pwd=link_dir, 
err=e) + except Exception as err: + status = 'failed' + output = err + finally: + timer.end() return { - 'cmd': CMD, + 'cmd': cmd, + 'pwd': link_dir, 'output': output, + 'status': status, + **timer.stats, } -@attach_result_to_link('pdf') +def should_fetch_pdf(link_dir, link): + if is_static_file(link['url']): + return False + + if os.path.exists(os.path.join(link_dir, 'output.pdf')): + return False + + return FETCH_PDF + + def fetch_pdf(link_dir, link, timeout=TIMEOUT): """print PDF of site to file using chrome --headless""" - if is_static_file(link['url']): - return {'output': None, 'status': 'skipped'} - output = 'output.pdf' - if os.path.exists(os.path.join(link_dir, output)): - return {'output': output, 'status': 'skipped'} - - CMD = [ + cmd = [ *chrome_args(timeout=timeout), '--print-to-pdf', - link['url'] + link['url'], ] - end = progress(timeout, prefix=' ') - hints = None + status = 'succeeded' + timer = TimedProgress(timeout, prefix=' ') try: - result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout) - end() + result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout) if result.returncode: hints = (result.stderr or result.stdout).decode() raise ArchiveError('Failed to print PDF', hints) chmod_file('output.pdf', cwd=link_dir) - except Exception as e: - end() - output = e - print_error_hints(cmd=CMD, pwd=link_dir, err=e, hints=hints) + except Exception as err: + status = 'failed' + output = err + finally: + timer.end() return { - 'cmd': CMD, + 'cmd': cmd, + 'pwd': link_dir, 'output': output, + 'status': status, + **timer.stats, } -@attach_result_to_link('screenshot') +def should_fetch_screenshot(link_dir, link): + if is_static_file(link['url']): + return False + + if os.path.exists(os.path.join(link_dir, 'screenshot.png')): + return False + + return FETCH_SCREENSHOT + def fetch_screenshot(link_dir, link, timeout=TIMEOUT): """take screenshot of site using chrome --headless""" - if is_static_file(link['url']): - return {'output': None, 'status': 'skipped'} - output = 'screenshot.png' - if os.path.exists(os.path.join(link_dir, output)): - return {'output': output, 'status': 'skipped'} - - CMD = [ + cmd = [ *chrome_args(timeout=timeout), '--screenshot', link['url'], ] - end = progress(timeout, prefix=' ') + status = 'succeeded' + timer = TimedProgress(timeout, prefix=' ') try: - result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout) - end() + result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout) + if result.returncode: hints = (result.stderr or result.stdout).decode() raise ArchiveError('Failed to take screenshot', hints) chmod_file(output, cwd=link_dir) - except Exception as e: - end() - output = e - print_error_hints(cmd=CMD, pwd=link_dir, err=e) + except Exception as err: + status = 'failed' + output = err + finally: + timer.end() return { - 'cmd': CMD, + 'cmd': cmd, + 'pwd': link_dir, 'output': output, + 'status': status, + **timer.stats, } + +def should_fetch_dom(link_dir, link): + if is_static_file(link['url']): + return False + + if os.path.exists(os.path.join(link_dir, 'output.html')): + return False + + return FETCH_DOM -@attach_result_to_link('dom') def fetch_dom(link_dir, link, timeout=TIMEOUT): """print HTML of site to file using chrome --dump-html""" - if is_static_file(link['url']): - return {'output': None, 'status': 'skipped'} - output = 'output.html' output_path = os.path.join(link_dir, output) - if os.path.exists(output_path): - return {'output': output, 'status': 'skipped'} - - CMD = [ + cmd = 
[ *chrome_args(timeout=timeout), '--dump-dom', link['url'] ] - end = progress(timeout, prefix=' ') + status = 'succeeded' + timer = TimedProgress(timeout, prefix=' ') try: with open(output_path, 'w+') as f: - result = run(CMD, stdout=f, stderr=PIPE, cwd=link_dir, timeout=timeout) - end() + result = run(cmd, stdout=f, stderr=PIPE, cwd=link_dir, timeout=timeout) + if result.returncode: hints = result.stderr.decode() raise ArchiveError('Failed to fetch DOM', hints) chmod_file(output, cwd=link_dir) - except Exception as e: - end() - output = e - print_error_hints(cmd=CMD, pwd=link_dir, err=e) + except Exception as err: + status = 'failed' + output = err + finally: + timer.end() return { - 'cmd': CMD, + 'cmd': cmd, + 'pwd': link_dir, 'output': output, + 'status': status, + **timer.stats, } -@attach_result_to_link('git') -def fetch_git(link_dir, link, timeout=TIMEOUT): - """download full site using git""" +def should_fetch_git(link_dir, link): + if is_static_file(link['url']): + return False + + if os.path.exists(os.path.join(link_dir, 'git')): + return False is_clonable_url = ( domain(link['url']) in GIT_DOMAINS or extension(link['url']) == 'git' ) - if is_static_file(link['url']) or not is_clonable_url: - return {'output': None, 'status': 'skipped'} + if not is_clonable_url: + return False + + return FETCH_GIT + + +def fetch_git(link_dir, link, timeout=TIMEOUT): + """download full site using git""" output = 'git' output_path = os.path.join(link_dir, 'git') - - if os.path.exists(output_path): - return {'output': output, 'status': 'skipped'} - os.makedirs(output_path, exist_ok=True) - CMD = [ + cmd = [ GIT_BINARY, 'clone', '--mirror', @@ -426,39 +440,48 @@ def fetch_git(link_dir, link, timeout=TIMEOUT): *(() if CHECK_SSL_VALIDITY else ('-c', 'http.sslVerify=false')), without_query(without_fragment(link['url'])), ] - end = progress(timeout, prefix=' ') + status = 'succeeded' + timer = TimedProgress(timeout, prefix=' ') try: - result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1) - end() + result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1) if result.returncode == 128: # ignore failed re-download when the folder already exists pass elif result.returncode > 0: - hints = 'got git response code {}:'.format(result.returncode) + hints = 'Got git response code: {}.'.format(result.returncode) raise ArchiveError('Failed git download', hints) - except Exception as e: - end() - output = e - print_error_hints(cmd=CMD, pwd=link_dir, err=e) + + except Exception as err: + status = 'failed' + output = err + finally: + timer.end() return { - 'cmd': CMD, + 'cmd': cmd, + 'pwd': link_dir, 'output': output, + 'status': status, + **timer.stats, } -@attach_result_to_link('media') -def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT, overwrite=False): +def should_fetch_media(link_dir, link): + if is_static_file(link['url']): + return False + + if os.path.exists(os.path.join(link_dir, 'media')): + return False + + return FETCH_MEDIA + +def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT): """Download playlists or individual video, audio, and subtitles using youtube-dl""" output = 'media' output_path = os.path.join(link_dir, 'media') - - if os.path.exists(output_path) and not overwrite: - return {'output': output, 'status': 'skipped'} - os.makedirs(output_path, exist_ok=True) - CMD = [ + cmd = [ YOUTUBEDL_BINARY, '--write-description', '--write-info-json', @@ -480,12 +503,11 @@ def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT, overwrite=False): *(() if 
CHECK_SSL_VALIDITY else ('--no-check-certificate',)), link['url'], ] - - end = progress(timeout, prefix=' ') + status = 'succeeded' + timer = TimedProgress(timeout, prefix=' ') try: - result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1) + result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1) chmod_file(output, cwd=link_dir) - end() if result.returncode: if (b'ERROR: Unsupported URL' in result.stderr or b'HTTP Error 404' in result.stderr @@ -496,18 +518,22 @@ def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT, overwrite=False): pass else: hints = ( - 'got youtubedl response code {}:'.format(result.returncode), + 'Got youtube-dl response code: {}.'.format(result.returncode), *result.stderr.decode().split('\n'), ) raise ArchiveError('Failed to download media', hints) - except Exception as e: - end() - output = e - print_error_hints(cmd=CMD, pwd=link_dir, err=e) + except Exception as err: + status = 'failed' + output = err + finally: + timer.end() return { - 'cmd': CMD, + 'cmd': cmd, + 'pwd': link_dir, 'output': output, + 'status': status, + **timer.stats, } def parse_archive_dot_org_response(response): @@ -526,20 +552,23 @@ def parse_archive_dot_org_response(response): errors = headers['x-archive-wayback-runtime-error'] return content_location, errors -@attach_result_to_link('archive_org') +def should_fetch_archive_dot_org(link_dir, link): + if is_static_file(link['url']): + return False + + if os.path.exists(os.path.join(link_dir, 'archive.org.txt')): + # if open(path, 'r').read().strip() != 'None': + return False + + return SUBMIT_ARCHIVE_DOT_ORG + def archive_dot_org(link_dir, link, timeout=TIMEOUT): """submit site to archive.org for archiving via their service, save returned archive url""" output = 'archive.org.txt' archive_org_url = None - - path = os.path.join(link_dir, output) - if os.path.exists(path): - archive_org_url = open(path, 'r').read().strip() - return {'output': archive_org_url, 'status': 'skipped'} - submit_url = 'https://web.archive.org/save/{}'.format(link['url']) - CMD = [ + cmd = [ CURL_BINARY, '--location', '--head', @@ -548,10 +577,10 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT): *(() if CHECK_SSL_VALIDITY else ('--insecure',)), submit_url, ] - end = progress(timeout, prefix=' ') + status = 'succeeded' + timer = TimedProgress(timeout, prefix=' ') try: - result = run(CMD, stdout=PIPE, stderr=DEVNULL, cwd=link_dir, timeout=timeout) - end() + result = run(cmd, stdout=PIPE, stderr=DEVNULL, cwd=link_dir, timeout=timeout) content_location, errors = parse_archive_dot_org_response(result.stdout) if content_location: archive_org_url = 'https://web.archive.org{}'.format(content_location[0]) @@ -562,10 +591,11 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT): raise ArchiveError(', '.join(errors)) else: raise ArchiveError('Failed to find "content-location" URL header in Archive.org response.') - except Exception as e: - end() - output = e - print_error_hints(cmd=CMD, pwd=link_dir, err=e) + except Exception as err: + status = 'failed' + output = err + finally: + timer.end() if not isinstance(output, Exception): # instead of writing None when archive.org rejects the url write the @@ -579,8 +609,11 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT): output = archive_org_url return { - 'cmd': CMD, + 'cmd': cmd, + 'pwd': link_dir, 'output': output, + 'status': status, + **timer.stats, } diff --git a/archivebox/index.py b/archivebox/index.py index f21146c2..83659644 100644 --- a/archivebox/index.py +++ 
b/archivebox/index.py @@ -17,6 +17,7 @@ from config import ( ) from util import ( chmod_file, + urlencode, derived_link_info, check_link_structure, check_links_structure, @@ -137,7 +138,7 @@ def write_html_links_index(out_dir, links, finished=False): os.path.join('archive', link['timestamp'], 'favicon.ico') # if link['is_archived'] else 'data:image/gif;base64,R0lGODlhAQABAAD/ACwAAAAAAQABAAACADs=' ), - 'archive_url': ( + 'archive_url': urlencode( wget_output_path(link) or 'index.html' ), }) @@ -174,6 +175,7 @@ def patch_links_index(link, out_dir=OUTPUT_DIR): if saved_link['url'] == link['url']: saved_link['title'] = title saved_link['latest'] = link['latest'] + saved_link['history'] = link['history'] changed = True break if changed: @@ -199,6 +201,7 @@ def write_link_index(out_dir, link): link['updated'] = str(datetime.now().timestamp()) write_json_link_index(out_dir, link) write_html_link_index(out_dir, link) + # print(' √ index.html, index.json') def write_json_link_index(out_dir, link): """write a json file with some info about the link""" @@ -206,8 +209,6 @@ def write_json_link_index(out_dir, link): check_link_structure(link) path = os.path.join(out_dir, 'index.json') - print(' √ index.json') - with open(path, 'w', encoding='utf-8') as f: json.dump(link, f, indent=4, default=str) @@ -231,8 +232,13 @@ def load_json_link_index(out_dir, link): **parse_json_link_index(out_dir), **link, } + link.update({ + 'latest': link.get('latest') or {}, + 'history': link.get('history') or {}, + }) check_link_structure(link) + return link def write_html_link_index(out_dir, link): @@ -242,8 +248,6 @@ def write_html_link_index(out_dir, link): path = os.path.join(out_dir, 'index.html') - print(' √ index.html') - link = derived_link_info(link) with open(path, 'w', encoding='utf-8') as f: @@ -253,7 +257,7 @@ def write_html_link_index(out_dir, link): link['title'] or (link['base_url'] if link['is_archived'] else TITLE_LOADING_MSG) ), - 'archive_url': ( + 'archive_url': urlencode( wget_output_path(link) or (link['domain'] if link['is_archived'] else 'about:blank') ), diff --git a/archivebox/logs.py b/archivebox/logs.py index a0eba2b0..b636ab1e 100644 --- a/archivebox/logs.py +++ b/archivebox/logs.py @@ -6,7 +6,7 @@ from config import ANSI, REPO_DIR, OUTPUT_DIR # globals are bad, mmkay _LAST_RUN_STATS = { 'skipped': 0, - 'succeded': 0, + 'succeeded': 0, 'failed': 0, 'parsing_start_ts': 0, @@ -38,41 +38,54 @@ def log_link_archiving_started(link_dir, link, is_new): print(' > {}{}'.format(pretty_path(link_dir), ' (new)' if is_new else '')) -def log_link_archiving_failed(cmd, pwd, err=None, hints=None, prefix=' '): +def log_archive_method_starting(method): + print(' > {}'.format(method)) + +def log_archive_method_finished(result): """quote the argument with whitespace in a command so the user can copy-paste the outputted string directly to run the cmd """ + required_keys = ('cmd', 'pwd', 'output', 'status', 'start_ts', 'end_ts') + assert ( + isinstance(result, dict) + and all(key in result for key in required_keys) + and ('output' in result) + ), 'Archive method did not return a valid result.' 
# Prettify CMD string and make it save to copy-paste by quoting arguments quoted_cmd = ' '.join( '"{}"'.format(arg) if ' ' in arg else arg - for arg in cmd + for arg in result['cmd'] ) - # Prettify error output hints string and limit to five lines - hints = hints or getattr(err, 'hints', None) - if hints: - hints = hints if isinstance(hints, (list, tuple)) else hints.split('\n') - hints = ( - ' {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset']) - for line in hints[:5] if line.strip() - ) - else: - hints = () + if result['status'] == 'failed': + # Prettify error output hints string and limit to five lines + hints = getattr(result['output'], 'hints', None) or () + if hints: + hints = hints if isinstance(hints, (list, tuple)) else hints.split('\n') + hints = ( + ' {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset']) + for line in hints[:5] if line.strip() + ) - output_lines = [ - '{}Failed: {} {}{}'.format(ANSI['red'], err.__class__.__name__, err, ANSI['reset']), - *hints, - 'Run to see full output:' - ' cd {};'.format(pwd), - ' {}'.format(quoted_cmd), - ] - - return '\n'.join( - '{}{}'.format(prefix, line) - for line in output_lines - if line - ) + # Collect and prefix output lines with indentation + output_lines = [ + '{}Failed:{} {}{}'.format( + ANSI['red'], + result['output'].__class__.__name__.replace('ArchiveError', ''), + result['output'], + ANSI['reset'] + ), + *hints, + '{}Run to see full output:{}'.format(ANSI['lightred'], ANSI['reset']), + ' cd {};'.format(result['pwd']), + ' {}'.format(quoted_cmd), + ] + print('\n'.join( + ' {}'.format(line) + for line in output_lines + if line + )) ### Logging Helpers @@ -102,7 +115,7 @@ def log_indexing_started(): def log_indexing_finished(out_dir, out_file): end_ts = datetime.now() _LAST_RUN_STATS['index_end_ts'] = end_ts - print(' > {}/{}'.format(pretty_path(out_dir), out_file)) + print(' √ {}/{}'.format(pretty_path(out_dir), out_file)) def log_archiving_started(num_links, resume): start_ts = datetime.now() diff --git a/archivebox/parse.py b/archivebox/parse.py index 1e9a23fb..fa7eaffe 100644 --- a/archivebox/parse.py +++ b/archivebox/parse.py @@ -28,7 +28,7 @@ from util import ( str_between, URL_REGEX, check_url_parsing_invariants, - progress, + TimedProgress, ) @@ -53,13 +53,13 @@ def parse_links(source_file): # Fallback parser ('Plain Text', parse_plain_text_export), ) - end = progress(TIMEOUT * 4, prefix=' ') + timer = TimedProgress(TIMEOUT * 4) with open(source_file, 'r', encoding='utf-8') as file: for parser_name, parser_func in PARSERS: try: links = list(parser_func(file)) if links: - end() + timer.end() return links, parser_name except Exception as err: # Parsers are tried one by one down the list, and the first one @@ -68,7 +68,7 @@ def parse_links(source_file): # print('[!] Parser {} failed: {} {}'.format(parser_name, err.__class__.__name__, err)) pass - end() + timer.end() return [], 'Plain Text' diff --git a/archivebox/stdlib_patches.py b/archivebox/stdlib_patches.py deleted file mode 100644 index 5938f977..00000000 --- a/archivebox/stdlib_patches.py +++ /dev/null @@ -1,167 +0,0 @@ -""" -Patches, additions, and shortcuts for Python standard library functions. 
-""" - -### subprocess - -from subprocess import ( - Popen, - PIPE, - DEVNULL, - CompletedProcess, - TimeoutExpired, - CalledProcessError, -) - -def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs): - """Patched of subprocess.run to fix blocking io making timeout=innefective""" - - if input is not None: - if 'stdin' in kwargs: - raise ValueError('stdin and input arguments may not both be used.') - kwargs['stdin'] = PIPE - - if capture_output: - if ('stdout' in kwargs) or ('stderr' in kwargs): - raise ValueError('stdout and stderr arguments may not be used ' - 'with capture_output.') - kwargs['stdout'] = PIPE - kwargs['stderr'] = PIPE - - with Popen(*popenargs, **kwargs) as process: - try: - stdout, stderr = process.communicate(input, timeout=timeout) - except TimeoutExpired: - process.kill() - try: - stdout, stderr = process.communicate(input, timeout=2) - except: - pass - raise TimeoutExpired(popenargs[0][0], timeout) - except BaseException as err: - process.kill() - # We don't call process.wait() as .__exit__ does that for us. - raise - retcode = process.poll() - if check and retcode: - raise CalledProcessError(retcode, process.args, - output=stdout, stderr=stderr) - return CompletedProcess(process.args, retcode, stdout, stderr) - - - -### collections - -from sys import maxsize -from itertools import islice -from collections import deque - -_marker = object() - -class PeekableGenerator: - """Peekable version of a normal python generator. - Useful when you don't want to evaluate the entire iterable to look at - a specific item at a given idx. - """ - def __init__(self, iterable): - self._it = iter(iterable) - self._cache = deque() - - def __iter__(self): - return self - - def __bool__(self): - try: - self.peek() - except StopIteration: - return False - return True - - def __nonzero__(self): - # For Python 2 compatibility - return self.__bool__() - - def peek(self, default=_marker): - """Return the item that will be next returned from ``next()``. - Return ``default`` if there are no items left. If ``default`` is not - provided, raise ``StopIteration``. - """ - if not self._cache: - try: - self._cache.append(next(self._it)) - except StopIteration: - if default is _marker: - raise - return default - return self._cache[0] - - def prepend(self, *items): - """Stack up items to be the next ones returned from ``next()`` or - ``self.peek()``. The items will be returned in - first in, first out order:: - >>> p = peekable([1, 2, 3]) - >>> p.prepend(10, 11, 12) - >>> next(p) - 10 - >>> list(p) - [11, 12, 1, 2, 3] - It is possible, by prepending items, to "resurrect" a peekable that - previously raised ``StopIteration``. - >>> p = peekable([]) - >>> next(p) - Traceback (most recent call last): - ... - StopIteration - >>> p.prepend(1) - >>> next(p) - 1 - >>> next(p) - Traceback (most recent call last): - ... 
- StopIteration - """ - self._cache.extendleft(reversed(items)) - - def __next__(self): - if self._cache: - return self._cache.popleft() - - return next(self._it) - - def _get_slice(self, index): - # Normalize the slice's arguments - step = 1 if (index.step is None) else index.step - if step > 0: - start = 0 if (index.start is None) else index.start - stop = maxsize if (index.stop is None) else index.stop - elif step < 0: - start = -1 if (index.start is None) else index.start - stop = (-maxsize - 1) if (index.stop is None) else index.stop - else: - raise ValueError('slice step cannot be zero') - - # If either the start or stop index is negative, we'll need to cache - # the rest of the iterable in order to slice from the right side. - if (start < 0) or (stop < 0): - self._cache.extend(self._it) - # Otherwise we'll need to find the rightmost index and cache to that - # point. - else: - n = min(max(start, stop) + 1, maxsize) - cache_len = len(self._cache) - if n >= cache_len: - self._cache.extend(islice(self._it, n - cache_len)) - - return list(self._cache)[index] - - def __getitem__(self, index): - if isinstance(index, slice): - return self._get_slice(index) - - cache_len = len(self._cache) - if index < 0: - self._cache.extend(self._it) - elif index >= cache_len: - self._cache.extend(islice(self._it, index + 1 - cache_len)) - - return self._cache[index] diff --git a/archivebox/util.py b/archivebox/util.py index df934a80..7c6378af 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -8,12 +8,18 @@ from urllib.parse import urlparse, quote from decimal import Decimal from datetime import datetime from multiprocessing import Process +from subprocess import ( + Popen, + PIPE, + DEVNULL, + CompletedProcess, + TimeoutExpired, + CalledProcessError, +) -from stdlib_patches import run, PIPE, DEVNULL from config import ( ANSI, TERM_WIDTH, - REPO_DIR, SOURCES_DIR, ARCHIVE_DIR, OUTPUT_PERMISSIONS, @@ -43,6 +49,7 @@ from config import ( CHROME_HEADLESS, CHROME_SANDBOX, ) +from logs import pretty_path ### Parsing Helpers @@ -105,6 +112,17 @@ def check_link_structure(link): assert isinstance(link.get('url'), str) assert len(link['url']) > 2 assert len(re.findall(URL_REGEX, link['url'])) == 1 + if 'history' in link: + assert isinstance(link['history'], dict), 'history must be a Dict' + for key, val in link['history'].items(): + assert isinstance(key, str) + assert isinstance(val, list), 'history must be a Dict[str, List], got: {}'.format(link['history']) + + if 'latest' in link: + assert isinstance(link['latest'], dict), 'latest must be a Dict' + for key, val in link['latest'].items(): + assert isinstance(key, str) + assert (val is None) or isinstance(val, (str, Exception)), 'latest must be a Dict[str, Optional[str]], got: {}'.format(link['latest']) def check_links_structure(links): """basic sanity check invariants to make sure the data is valid""" @@ -236,12 +254,12 @@ def save_remote_source(url, timeout=TIMEOUT): url, ANSI['reset'], )) - end = progress(TIMEOUT, prefix=' ') + timer = TimedProgress(timeout, prefix=' ') try: downloaded_xml = download_url(url, timeout=timeout) - end() + timer.end() except Exception as e: - end() + timer.end() print('{}[!] 
Failed to download {}{}\n'.format( ANSI['red'], url, @@ -291,9 +309,9 @@ def wget_output_path(link): return link['latest']['wget'] if is_static_file(link['url']): - return urlencode(without_scheme(without_fragment(link['url']))) + return without_scheme(without_fragment(link['url'])) - # Wget downloads can save in a number of different ways depending on the url + # Wget downloads can save in a number of different ways depending on the url: # https://example.com # > output/archive/<timestamp>/example.com/index.html # https://example.com/abc @@ -302,6 +320,10 @@ def wget_output_path(link): # > output/archive/<timestamp>/example.com/abc/index.html # https://example.com/abc/test.html # > output/archive/<timestamp>/example.com/abc/test.html + # https://example.com/abc/test?v=zzVa_tX1OiI + # > output/archive/<timestamp>/example.com/abc/test?v=zzVa_tX1OiI.html + # https://example.com/abc/test/?v=zzVa_tX1OiI + # > output/archive/<timestamp>/example.com/abc/test/index.html?v=zzVa_tX1OiI.html # There's also lots of complexity around how the urlencoding and renaming # is done for pages with query and hash fragments or extensions like shtml / htm @@ -327,7 +349,7 @@ def wget_output_path(link): ] if html_files: path_from_link_dir = search_dir.split(link_dir)[-1].strip('/') - return urlencode(os.path.join(path_from_link_dir, html_files[0])) + return os.path.join(path_from_link_dir, html_files[0]) # Move up one directory level search_dir = search_dir.rsplit('/', 1)[0] @@ -456,69 +478,109 @@ def derived_link_info(link): ### Python / System Helpers -def progress(seconds=TIMEOUT, prefix=''): - """Show a (subprocess-controlled) progress bar with a <seconds> timeout, - returns end() function to instantly finish the progress - """ +def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs): + """Patched of subprocess.run to fix blocking io making timeout=innefective""" - if not SHOW_PROGRESS: - return lambda: None + if input is not None: + if 'stdin' in kwargs: + raise ValueError('stdin and input arguments may not both be used.') + kwargs['stdin'] = PIPE - def progress_bar(seconds, prefix): - """show timer in the form of progress bar, with percentage and seconds remaining""" - chunk = '█' if sys.stdout.encoding == 'UTF-8' else '#' - chunks = TERM_WIDTH - len(prefix) - 20 # number of progress chunks to show (aka max bar width) + if capture_output: + if ('stdout' in kwargs) or ('stderr' in kwargs): + raise ValueError('stdout and stderr arguments may not be used ' + 'with capture_output.') + kwargs['stdout'] = PIPE + kwargs['stderr'] = PIPE + + with Popen(*popenargs, **kwargs) as process: try: - for s in range(seconds * chunks): - progress = s / chunks / seconds * 100 - bar_width = round(progress/(100/chunks)) + stdout, stderr = process.communicate(input, timeout=timeout) + except TimeoutExpired: + process.kill() + try: + stdout, stderr = process.communicate(input, timeout=2) + except: + pass + raise TimeoutExpired(popenargs[0][0], timeout) + except BaseException: + process.kill() + # We don't call process.wait() as .__exit__ does that for us. 
+ raise + retcode = process.poll() + if check and retcode: + raise CalledProcessError(retcode, process.args, + output=stdout, stderr=stderr) + return CompletedProcess(process.args, retcode, stdout, stderr) - # ████████████████████ 0.9% (1/60sec) - sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format( - prefix, - ANSI['green'], - (chunk * bar_width).ljust(chunks), - ANSI['reset'], - round(progress, 1), - round(s/chunks), - seconds, - )) - sys.stdout.flush() - time.sleep(1 / chunks) - # ██████████████████████████████████ 100.0% (60/60sec) - sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)\n'.format( +def progress_bar(seconds, prefix): + """show timer in the form of progress bar, with percentage and seconds remaining""" + chunk = '█' if sys.stdout.encoding == 'UTF-8' else '#' + chunks = TERM_WIDTH - len(prefix) - 20 # number of progress chunks to show (aka max bar width) + try: + for s in range(seconds * chunks): + progress = s / chunks / seconds * 100 + bar_width = round(progress/(100/chunks)) + + # ████████████████████ 0.9% (1/60sec) + sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format( prefix, - ANSI['red'], - chunk * chunks, + ANSI['green'], + (chunk * bar_width).ljust(chunks), ANSI['reset'], - 100.0, - seconds, + round(progress, 1), + round(s/chunks), seconds, )) sys.stdout.flush() - except KeyboardInterrupt: - print() - pass + time.sleep(1 / chunks) - p = Process(target=progress_bar, args=(seconds, prefix)) - p.start() + # ██████████████████████████████████ 100.0% (60/60sec) + sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)\n'.format( + prefix, + ANSI['red'], + chunk * chunks, + ANSI['reset'], + 100.0, + seconds, + seconds, + )) + sys.stdout.flush() + except KeyboardInterrupt: + print() + pass - def end(): +class TimedProgress: + def __init__(self, seconds, prefix=''): + if SHOW_PROGRESS: + self.p = Process(target=progress_bar, args=(seconds, prefix)) + self.p.start() + self.stats = { + 'start_ts': datetime.now(), + 'end_ts': None, + 'duration': None, + } + + def end(self): """immediately finish progress and clear the progressbar line""" - # protect from double termination - #if p is None or not hasattr(p, 'kill'): - # return - nonlocal p - if p is not None: - p.terminate() - p = None + end_ts = datetime.now() + self.stats.update({ + 'end_ts': end_ts, + 'duration': (end_ts - self.stats['start_ts']).seconds, + }) - sys.stdout.write('\r{}{}\r'.format((' ' * TERM_WIDTH), ANSI['reset'])) # clear whole terminal line - sys.stdout.flush() + if SHOW_PROGRESS: + # protect from double termination + #if p is None or not hasattr(p, 'kill'): + # return + if self.p is not None: + self.p.terminate() + self.p = None - return end + sys.stdout.write('\r{}{}\r'.format((' ' * TERM_WIDTH), ANSI['reset'])) # clear whole terminal line + sys.stdout.flush() def download_url(url, timeout=TIMEOUT): req = Request(url, headers={'User-Agent': WGET_USER_AGENT})
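# ---------------------------------------------------------------------------
# A minimal standalone sketch of the checking/running split introduced above,
# assuming simplified stand-in names: each archive method becomes a cheap
# should_fetch_*() check that only decides whether the method needs to run,
# paired with a fetch_*() runner that always returns a uniform result dict
# ('cmd', 'pwd', 'output', 'status', plus timing stats), which archive_link()
# records into link['history'] and link['latest']. The favicon fetcher here is
# stubbed out, and the config toggles, TimedProgress, logging, and index
# writes from the real archivebox/archive_methods.py are omitted.

import os
from datetime import datetime


def should_fetch_favicon(link_dir, link):
    # "should" checks are pure and cheap: they only look at existing output,
    # never at the network
    return not os.path.exists(os.path.join(link_dir, 'favicon.ico'))


def fetch_favicon(link_dir, link, timeout=60):
    # every fetcher reports what it ran, where, what it produced, and whether
    # it worked, so archive_link() can record history uniformly
    start_ts = datetime.now()
    status, output = 'succeeded', 'favicon.ico'
    try:
        pass  # the real method shells out to curl via the patched run() helper
    except Exception as err:
        status, output = 'failed', err
    end_ts = datetime.now()
    return {
        'cmd': ['curl', link['url']],   # placeholder command for illustration
        'pwd': link_dir,
        'output': output,
        'status': status,
        'start_ts': start_ts,
        'end_ts': end_ts,
        'duration': (end_ts - start_ts).seconds,
    }


ARCHIVE_METHODS = (
    ('favicon', should_fetch_favicon, fetch_favicon),
)


def archive_link(link_dir, link):
    link.setdefault('history', {})
    link.setdefault('latest', {})
    for method_name, should_run, run_method in ARCHIVE_METHODS:
        link['history'].setdefault(method_name, [])
        link['latest'].setdefault(method_name, None)
        if not should_run(link_dir, link):
            continue
        result = run_method(link_dir, link)
        link['history'][method_name].append(result)
        if result['status'] == 'succeeded':
            link['latest'][method_name] = result['output']
    return link


# example (hypothetical paths/data):
#   archive_link('output/archive/1553160941', {'url': 'https://example.com'})
# ---------------------------------------------------------------------------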