From e6bd1f8ca8eb18283e8979efcf9084c409ce1114 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 21 Mar 2019 01:28:12 -0400 Subject: [PATCH] major codebase-wide code cleanups --- archivebox/archive.py | 273 ++++------ archivebox/archive_methods.py | 502 ++++++++---------- archivebox/index.py | 90 +++- archivebox/logs.py | 161 ++++++ archivebox/parse.py | 188 +++---- archivebox/{peekable.py => stdlib_patches.py} | 58 +- archivebox/templates/link_index.html | 11 +- archivebox/util.py | 285 +++++----- 8 files changed, 825 insertions(+), 743 deletions(-) create mode 100644 archivebox/logs.py rename archivebox/{peekable.py => stdlib_patches.py} (68%) diff --git a/archivebox/archive.py b/archivebox/archive.py index 22c28b6b..7c8fb939 100755 --- a/archivebox/archive.py +++ b/archivebox/archive.py @@ -1,225 +1,132 @@ #!/usr/bin/env python3 -# ArchiveBox -# Nick Sweeting 2017 | MIT License -# https://github.com/pirate/ArchiveBox +""" +ArchiveBox command line application. + +./archive and ./bin/archivebox both point to this file, +but you can also run it directly using `python3 archive.py` + +Usage & Documentation: + https://github.com/pirate/ArchiveBox/Wiki +""" import os import sys -from datetime import datetime -from peekable import Peekable - - -from parse import parse_links -from links import validate_links, links_after_timestamp -from archive_methods import archive_link, _RESULTS_TOTALS -from index import ( - write_links_index, - parse_json_links_index, -) +from links import links_after_timestamp +from index import write_links_index, load_links_index +from archive_methods import archive_link from config import ( ARCHIVE_DIR, ONLY_NEW, OUTPUT_DIR, - REPO_DIR, - ANSI, GIT_SHA, ) from util import ( check_dependencies, save_remote_source, save_stdin_source, - pretty_path, - check_links_structure, +) +from logs import ( + log_archiving_started, + log_archiving_paused, + log_archiving_finished, ) __AUTHOR__ = 'Nick Sweeting ' __VERSION__ = GIT_SHA -__DESCRIPTION__ = 'ArchiveBox Usage: Create a browsable html archive of a list of links.' +__DESCRIPTION__ = 'ArchiveBox: The self-hosted internet archive.' 
__DOCUMENTATION__ = 'https://github.com/pirate/ArchiveBox/wiki' def print_help(): - print(__DESCRIPTION__) - print("Documentation: {}\n".format(__DOCUMENTATION__)) + print('ArchiveBox: The self-hosted internet archive.\n') + print("Documentation:") + print(" https://github.com/pirate/ArchiveBox/wiki\n") print("Usage:") - print(" ./bin/archivebox ~/Downloads/bookmarks_export.html\n") - print("") - print(" ./bin/archivebox https://example.com/feed.rss\n") - print("") print(" echo 'https://examplecom' | ./bin/archivebox\n") + print(" ./bin/archivebox ~/Downloads/bookmarks_export.html\n") + print(" ./bin/archivebox https://example.com/feed.rss\n") + print(" ./bin/archivebox 15109948213.123\n") -def load_links(archive_path=OUTPUT_DIR, import_path=None): - """get new links from file and optionally append them to links in existing archive""" - - existing_links = [] - if archive_path: - existing_links = parse_json_links_index(archive_path) - check_links_structure(existing_links) - - new_links = [] - if import_path: - # parse and validate the import file - raw_links, parser_name = parse_links(import_path) - new_links = validate_links(raw_links) - check_links_structure(new_links) - - # merge existing links in archive_path and new links - all_links = validate_links(existing_links + new_links) - check_links_structure(all_links) - num_new_links = len(all_links) - len(existing_links) - - if import_path and parser_name: - print(' > Adding {} new links to index (parsed import as {})'.format( - num_new_links, - parser_name, - )) - - return all_links, new_links - - -def update_archive(archive_path, links, source=None, resume=None, append=True): - """update or create index.html+json given a path to an export file containing new links""" - - start_ts = datetime.now().timestamp() - - if resume: - print('{green}[▶] [{}] Resuming archive downloading from {}...{reset}'.format( - datetime.now().strftime('%Y-%m-%d %H:%M:%S'), - resume, - **ANSI, - )) - else: - print('{green}[▶] [{}] Updating content for {} pages in archive...{reset}'.format( - datetime.now().strftime('%Y-%m-%d %H:%M:%S'), - len(links), - **ANSI, - )) - - check_links_structure(links) - - # prefetch the first link off the generator so that if we pause or fail - # immediately we can show that we paused on the first link and not just None - to_archive = Peekable(links_after_timestamp(links, resume)) - idx, link = 0, to_archive.peek(0) - - # loop over links and archive them - try: - check_dependencies() - for idx, link in enumerate(to_archive): - link_dir = os.path.join(ARCHIVE_DIR, link['timestamp']) - archive_link(link_dir, link) - - except (KeyboardInterrupt, SystemExit, Exception) as e: - # if isinstance(e, KeyboardInterrupt): - # # Step 4: Re-write links index with updated titles, icons, and resources - # all_links, _ = load_links(archive_path=out_dir) - # write_links_index(out_dir=out_dir, links=all_links, finished=True) - print() - print('\n{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format( - **ANSI, - now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'), - idx=idx+1, - timestamp=link['timestamp'], - total=len(links), - )) - print(' To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', ''))) - print(' Continue where you left off by running:') - print(' {} {}'.format( - pretty_path(sys.argv[0]), - link['timestamp'], - )) - if not isinstance(e, KeyboardInterrupt): - print() - raise e - raise SystemExit(1) - - # print timing information & summary - end_ts = 
datetime.now().timestamp() - seconds = end_ts - start_ts - if seconds > 60: - duration = '{0:.2f} min'.format(seconds / 60, 2) - else: - duration = '{0:.2f} sec'.format(seconds, 2) - - print('{}[√] [{}] Update of {} pages complete ({}){}'.format( - ANSI['green'], - datetime.now().strftime('%Y-%m-%d %H:%M:%S'), - len(links), - duration, - ANSI['reset'], - )) - print(' - {} entries skipped'.format(_RESULTS_TOTALS['skipped'])) - print(' - {} entries updated'.format(_RESULTS_TOTALS['succeded'])) - print(' - {} errors'.format(_RESULTS_TOTALS['failed'])) - print(' To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', ''))) - - -if __name__ == '__main__': - argc = len(sys.argv) - - if set(sys.argv).intersection(('-h', '--help', 'help')): +def main(*args): + if set(args).intersection(('-h', '--help', 'help')) or len(args) > 2: print_help() raise SystemExit(0) - source = sys.argv[1] if argc > 1 else None # path of links file to import - resume = sys.argv[2] if argc > 2 else None # timestamp to resume dowloading from - - stdin_raw_text = '' + ### Handle CLI arguments + # ./archive bookmarks.html + # ./archive 1523422111.234 + import_path, resume = None, None + if len(args) == 2: + # if the argument is a string, it's a import_path file to import + # if it's a number, it's a timestamp to resume archiving from + if args[1].replace('.', '').isdigit(): + import_path, resume = None, args[1] + else: + import_path, resume = args[1], None + ### Set up output folder + if not os.path.exists(OUTPUT_DIR): + os.makedirs(OUTPUT_DIR) + + ### Handle ingesting urls piped in through stdin + # (.e.g if user does cat example_urls.txt | ./archive) if not sys.stdin.isatty(): stdin_raw_text = sys.stdin.read() + if stdin_raw_text and import_path: + print( + '[X] You should pass either a path as an argument, ' + 'or pass a list of links via stdin, but not both.\n' + ) + print_help() + raise SystemExit(1) - if source and stdin_raw_text: - print( - '[X] You should pass either a path as an argument, ' - 'or pass a list of links via stdin, but not both.\n' - ) - print_help() - raise SystemExit(1) + import_path = save_stdin_source(stdin_raw_text) + + ### Handle ingesting urls from a remote file/feed + # (e.g. if an RSS feed URL is used as the import path) + if import_path and any(import_path.startswith(s) for s in ('http://', 'https://', 'ftp://')): + import_path = save_remote_source(import_path) + + ### Run the main archive update process + update_archive_data(import_path=import_path, resume=resume) - if argc == 1: - source, resume = None, None - elif argc == 2: - if all(d.isdigit() for d in sys.argv[1].split('.')): - # argv[1] is a resume timestamp - source, resume = None, sys.argv[1] - else: - # argv[1] is a path to a file to import - source, resume = sys.argv[1].strip(), None - elif argc == 3: - source, resume = sys.argv[1].strip(), sys.argv[2] - else: - print_help() - raise SystemExit(1) +def update_archive_data(import_path=None, resume=None): + """The main ArchiveBox entrancepoint. 
Everything starts here.""" + check_dependencies() - # See if archive folder already exists - for out_dir in (OUTPUT_DIR, 'bookmarks', 'pocket', 'pinboard', 'html'): - if os.path.exists(out_dir): - break - else: - out_dir = OUTPUT_DIR + # Step 1: Load list of links from the existing index + # merge in and dedupe new links from import_path + all_links, new_links = load_links_index(out_dir=OUTPUT_DIR, import_path=import_path) - # Step 0: Download url to local file (only happens if a URL is specified instead of local path) - if source and any(source.startswith(s) for s in ('http://', 'https://', 'ftp://')): - source = save_remote_source(source) - elif stdin_raw_text: - source = save_stdin_source(stdin_raw_text) - - # Step 1: Parse the links and dedupe them with existing archive - all_links, new_links = load_links(archive_path=out_dir, import_path=source) - - # Step 2: Write new index - write_links_index(out_dir=out_dir, links=all_links) + # Step 2: Write updated index with deduped old and new links back to disk + write_links_index(out_dir=OUTPUT_DIR, links=all_links) # Step 3: Run the archive methods for each link - if ONLY_NEW: - update_archive(out_dir, new_links, source=source, resume=resume, append=True) - else: - update_archive(out_dir, all_links, source=source, resume=resume, append=True) + links = new_links if ONLY_NEW else all_links + log_archiving_started(len(links), resume) + idx, link = 0, 0 + try: + for idx, link in enumerate(links_after_timestamp(links, resume)): + link_dir = os.path.join(ARCHIVE_DIR, link['timestamp']) + archive_link(link_dir, link) + + except KeyboardInterrupt: + log_archiving_paused(len(links), idx, link and link['timestamp']) + raise SystemExit(0) + + except: + print() + raise + + log_archiving_finished(len(links)) # Step 4: Re-write links index with updated titles, icons, and resources - all_links, _ = load_links(archive_path=out_dir) - write_links_index(out_dir=out_dir, links=all_links, finished=True) + all_links, _ = load_links_index(out_dir=OUTPUT_DIR) + write_links_index(out_dir=OUTPUT_DIR, links=all_links, finished=True) + + +if __name__ == '__main__': + main(*sys.argv) diff --git a/archivebox/archive_methods.py b/archivebox/archive_methods.py index 2efc0700..e3578e20 100644 --- a/archivebox/archive_methods.py +++ b/archivebox/archive_methods.py @@ -3,18 +3,18 @@ import os from functools import wraps from collections import defaultdict from datetime import datetime +from stdlib_patches import run, PIPE, DEVNULL from index import ( - parse_json_link_index, write_link_index, - update_main_index, + patch_links_index, + load_json_link_index, ) from config import ( CURL_BINARY, GIT_BINARY, WGET_BINARY, YOUTUBEDL_BINARY, - CHROME_BINARY, FETCH_FAVICON, FETCH_TITLE, FETCH_WGET, @@ -25,62 +25,37 @@ from config import ( FETCH_WARC, FETCH_GIT, FETCH_MEDIA, - RESOLUTION, - CHECK_SSL_VALIDITY, SUBMIT_ARCHIVE_DOT_ORG, - COOKIES_FILE, - WGET_USER_AGENT, - CHROME_USER_AGENT, - CHROME_USER_DATA_DIR, - CHROME_HEADLESS, - CHROME_SANDBOX, TIMEOUT, MEDIA_TIMEOUT, ANSI, - ARCHIVE_DIR, + OUTPUT_DIR, GIT_DOMAINS, GIT_SHA, + WGET_USER_AGENT, + CHECK_SSL_VALIDITY, + COOKIES_FILE, ) from util import ( domain, + extension, without_query, without_fragment, fetch_page_title, is_static_file, progress, chmod_file, - pretty_path, - print_error_hints, check_link_structure, wget_output_path, - run, PIPE, DEVNULL, + chrome_args, +) +from logs import ( + _LAST_RUN_STATS, + log_link_archiving_started, + log_link_archiving_failed, ) -_RESULTS_TOTALS = { # globals are bad, mmkay - 'skipped': 
0, - 'succeded': 0, - 'failed': 0, -} - -def load_link_index(link_dir, link): - """check for an existing link archive in the given directory, - and load+merge it into the given link dict - """ - is_new = not os.path.exists(link_dir) - if is_new: - os.makedirs(link_dir) - else: - link = { - **parse_json_link_index(link_dir), - **link, - } - - check_link_structure(link) - print_link_status_line(link_dir, link, is_new) - - return link - class ArchiveError(Exception): def __init__(self, message, hints=None): @@ -105,32 +80,24 @@ def archive_link(link_dir, link, overwrite=True): active_methods = [method for toggle, method in ARCHIVE_METHODS if toggle] try: - link = load_link_index(link_dir, link) + is_new = not os.path.exists(link_dir) + if is_new: + os.makedirs(link_dir) + + link = load_json_link_index(link_dir, link) + log_link_archiving_started(link_dir, link, is_new) for archive_method in active_methods: archive_method(link_dir, link, overwrite=overwrite) - write_link_index(link_dir, link) - update_main_index(link) + patch_links_index(link) except Exception as err: print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err)) return link -def print_link_status_line(link_dir, link, is_new): - print('[{symbol_color}{symbol}{reset}] [{now}] "{title}"\n {blue}{url}{reset}'.format( - symbol='+' if is_new else '*', - symbol_color=ANSI['green' if is_new else 'black'], - now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'), - **{**link, 'title': link['title'] or link['url']}, - **ANSI, - )) - - print(' > {}{}'.format(pretty_path(link_dir), ' (new)' if is_new else '')) - - def attach_result_to_link(method): """ @@ -178,15 +145,75 @@ def attach_result_to_link(method): link['history'][method].append(history_entry) link['latest'][method] = result['output'] - _RESULTS_TOTALS[history_entry['status']] += 1 + _LAST_RUN_STATS[history_entry['status']] += 1 return link return timed_fetch_func return decorator +@attach_result_to_link('title') +def fetch_title(link_dir, link, timeout=TIMEOUT): + """try to guess the page's title from its content""" + + # if link already has valid title, skip it + if link['title'] and not link['title'].lower().startswith('http'): + return {'output': link['title'], 'status': 'skipped'} + + if is_static_file(link['url']): + return {'output': None, 'status': 'skipped'} + + end = progress(timeout, prefix=' ') + try: + title = fetch_page_title(link['url'], timeout=timeout, progress=False) + end() + output = title + except Exception as e: + end() + output = e + print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset'])) + + if title and title.strip(): + link['title'] = title + output = title + + return { + 'cmd': 'fetch_page_title("{}")'.format(link['url']), + 'output': output, + } + +@attach_result_to_link('favicon') +def fetch_favicon(link_dir, link, timeout=TIMEOUT): + """download site favicon from google's favicon api""" + + output = 'favicon.ico' + if os.path.exists(os.path.join(link_dir, output)): + return {'output': output, 'status': 'skipped'} + + CMD = [ + CURL_BINARY, + '--max-time', str(timeout), + '--location', + '--output', output, + *(() if CHECK_SSL_VALIDITY else ('--insecure',)), + 'https://www.google.com/s2/favicons?domain={}'.format(domain(link['url'])), + ] + end = progress(timeout, prefix=' ') + try: + run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout) + end() + chmod_file(output, cwd=link_dir) + except Exception as e: + end() + output = e + print_error_hints(cmd=CMD, pwd=link_dir, err=e) + + return { + 'cmd': 
CMD, + 'output': output, + } @attach_result_to_link('wget') -def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC, timeout=TIMEOUT): +def fetch_wget(link_dir, link, timeout=TIMEOUT): """download full site using wget""" domain_dir = os.path.join(link_dir, domain(link['url'])) @@ -194,7 +221,7 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC if os.path.exists(domain_dir) and existing_file: return {'output': existing_file, 'status': 'skipped'} - if warc: + if FETCH_WARC: warc_dir = os.path.join(link_dir, 'warc') os.makedirs(warc_dir, exist_ok=True) warc_path = os.path.join('warc', str(int(datetime.now().timestamp()))) @@ -213,8 +240,8 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC '-e', 'robots=off', '--restrict-file-names=unix', '--timeout={}'.format(timeout), - *(() if warc else ('--timestamping',)), - *(('--warc-file={}'.format(warc_path),) if warc else ()), + *(() if FETCH_WARC else ('--timestamping',)), + *(('--warc-file={}'.format(warc_path),) if FETCH_WARC else ()), *(('--page-requisites',) if FETCH_WGET_REQUISITES else ()), *(('--user-agent={}'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()), *(('--load-cookies', COOKIES_FILE) if COOKIES_FILE else ()), @@ -233,7 +260,8 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC if line.strip() ] - # parse out number of files downloaded from "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)" + # parse out number of files downloaded from last line of stderr: + # "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)" files_downloaded = ( int(output_tail[-1].strip().split(' ', 2)[1] or 0) if 'Downloaded:' in output_tail[-1] @@ -263,20 +291,19 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC 'output': output, } - @attach_result_to_link('pdf') -def fetch_pdf(link_dir, link, timeout=TIMEOUT, **chrome_kwargs): +def fetch_pdf(link_dir, link, timeout=TIMEOUT): """print PDF of site to file using chrome --headless""" if is_static_file(link['url']): - return {'output': wget_output_path(link), 'status': 'skipped'} + return {'output': None, 'status': 'skipped'} output = 'output.pdf' if os.path.exists(os.path.join(link_dir, output)): return {'output': output, 'status': 'skipped'} CMD = [ - *chrome_headless(timeout=timeout, **chrome_kwargs), + *chrome_args(timeout=timeout), '--print-to-pdf', link['url'] ] @@ -302,18 +329,18 @@ def fetch_pdf(link_dir, link, timeout=TIMEOUT, **chrome_kwargs): } @attach_result_to_link('screenshot') -def fetch_screenshot(link_dir, link, timeout=TIMEOUT, **chrome_kwargs): +def fetch_screenshot(link_dir, link, timeout=TIMEOUT): """take screenshot of site using chrome --headless""" if is_static_file(link['url']): - return {'output': wget_output_path(link), 'status': 'skipped'} + return {'output': None, 'status': 'skipped'} output = 'screenshot.png' if os.path.exists(os.path.join(link_dir, output)): return {'output': output, 'status': 'skipped'} CMD = [ - *chrome_headless(timeout=timeout, **chrome_kwargs), + *chrome_args(timeout=timeout), '--screenshot', link['url'], ] @@ -337,18 +364,19 @@ def fetch_screenshot(link_dir, link, timeout=TIMEOUT, **chrome_kwargs): } @attach_result_to_link('dom') -def fetch_dom(link_dir, link, timeout=TIMEOUT, **chrome_kwargs): +def fetch_dom(link_dir, link, timeout=TIMEOUT): """print HTML of site to file using chrome --dump-html""" if is_static_file(link['url']): - return {'output': wget_output_path(link), 'status': 'skipped'} + return 
{'output': None, 'status': 'skipped'} output = 'output.html' - if os.path.exists(os.path.join(link_dir, output)): + output_path = os.path.join(link_dir, output) + if os.path.exists(output_path): return {'output': output, 'status': 'skipped'} CMD = [ - *chrome_headless(timeout=timeout, **chrome_kwargs), + *chrome_args(timeout=timeout), '--dump-dom', link['url'] ] @@ -372,6 +400,116 @@ def fetch_dom(link_dir, link, timeout=TIMEOUT, **chrome_kwargs): 'output': output, } +@attach_result_to_link('git') +def fetch_git(link_dir, link, timeout=TIMEOUT): + """download full site using git""" + + is_clonable_url = ( + domain(link['url']) in GIT_DOMAINS + or extension(link['url']) == 'git' + ) + if is_static_file(link['url']) or not is_clonable_url: + return {'output': None, 'status': 'skipped'} + + output = 'git' + output_path = os.path.join(link_dir, 'git') + + if os.path.exists(output_path): + return {'output': output, 'status': 'skipped'} + + os.makedirs(output_path, exist_ok=True) + CMD = [ + GIT_BINARY, + 'clone', + '--mirror', + '--recursive', + *(() if CHECK_SSL_VALIDITY else ('-c', 'http.sslVerify=false')), + without_query(without_fragment(link['url'])), + ] + end = progress(timeout, prefix=' ') + try: + result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1) + end() + + if result.returncode == 128: + # ignore failed re-download when the folder already exists + pass + elif result.returncode > 0: + hints = 'got git response code {}:'.format(result.returncode) + raise ArchiveError('Failed git download', hints) + except Exception as e: + end() + output = e + print_error_hints(cmd=CMD, pwd=link_dir, err=e) + + return { + 'cmd': CMD, + 'output': output, + } + +@attach_result_to_link('media') +def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT, overwrite=False): + """Download playlists or individual video, audio, and subtitles using youtube-dl""" + + output = 'media' + output_path = os.path.join(link_dir, 'media') + + if os.path.exists(output_path) and not overwrite: + return {'output': output, 'status': 'skipped'} + + os.makedirs(output_path, exist_ok=True) + CMD = [ + YOUTUBEDL_BINARY, + '--write-description', + '--write-info-json', + '--write-annotations', + '--yes-playlist', + '--write-thumbnail', + '--no-call-home', + '--no-check-certificate', + '--user-agent', + '--all-subs', + '--extract-audio', + '--keep-video', + '--ignore-errors', + '--geo-bypass', + '--audio-format', 'mp3', + '--audio-quality', '320K', + '--embed-thumbnail', + '--add-metadata', + *(() if CHECK_SSL_VALIDITY else ('--no-check-certificate',)), + link['url'], + ] + + end = progress(timeout, prefix=' ') + try: + result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1) + chmod_file(output, cwd=link_dir) + end() + if result.returncode: + if (b'ERROR: Unsupported URL' in result.stderr + or b'HTTP Error 404' in result.stderr + or b'HTTP Error 403' in result.stderr + or b'URL could be a direct video link' in result.stderr + or b'Unable to extract container ID' in result.stderr): + # These happen too frequently on non-media pages to warrant printing to console + pass + else: + hints = ( + 'got youtubedl response code {}:'.format(result.returncode), + *result.stderr.decode().split('\n'), + ) + raise ArchiveError('Failed to download media', hints) + except Exception as e: + end() + output = e + print_error_hints(cmd=CMD, pwd=link_dir, err=e) + + return { + 'cmd': CMD, + 'output': output, + } + def parse_archive_dot_org_response(response): # Parse archive.org response headers 
headers = defaultdict(list) @@ -445,226 +583,4 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT): 'output': output, } -@attach_result_to_link('favicon') -def fetch_favicon(link_dir, link, timeout=TIMEOUT): - """download site favicon from google's favicon api""" - output = 'favicon.ico' - if os.path.exists(os.path.join(link_dir, output)): - return {'output': output, 'status': 'skipped'} - - CMD = [ - CURL_BINARY, - '--max-time', str(timeout), - '--location', - '--output', output, - *(() if CHECK_SSL_VALIDITY else ('--insecure',)), - 'https://www.google.com/s2/favicons?domain={}'.format(domain(link['url'])), - ] - end = progress(timeout, prefix=' ') - try: - run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout) - end() - chmod_file(output, cwd=link_dir) - except Exception as e: - end() - output = e - print_error_hints(cmd=CMD, pwd=link_dir, err=e) - - return { - 'cmd': CMD, - 'output': output, - } - -@attach_result_to_link('title') -def fetch_title(link_dir, link, timeout=TIMEOUT): - """try to guess the page's title from its content""" - - # if link already has valid title, skip it - if link['title'] and not link['title'].lower().startswith('http'): - return {'output': link['title'], 'status': 'skipped'} - - if is_static_file(link['url']): - return {'output': None, 'status': 'skipped'} - - end = progress(timeout, prefix=' ') - try: - title = fetch_page_title(link['url'], timeout=timeout, progress=False) - end() - output = title - except Exception as e: - end() - output = e - print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset'])) - - if title and title.strip(): - link['title'] = title - output = title - - return { - 'cmd': 'fetch_page_title("{}")'.format(link['url']), - 'output': output, - } - -@attach_result_to_link('media') -def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT, overwrite=False): - """Download playlists or individual video, audio, and subtitles using youtube-dl""" - - output = 'media' - output_path = os.path.join(link_dir, 'media') - - if os.path.exists(output_path) and not overwrite: - return {'output': output, 'status': 'skipped'} - - os.makedirs(output_path, exist_ok=True) - CMD = [ - YOUTUBEDL_BINARY, - '--write-description', - '--write-info-json', - '--write-annotations', - '--yes-playlist', - '--write-thumbnail', - '--no-call-home', - '--no-check-certificate', - '--user-agent', - '--all-subs', - '--extract-audio', - '--keep-video', - '--ignore-errors', - '--geo-bypass', - '--audio-format', 'mp3', - '--audio-quality', '320K', - '--embed-thumbnail', - '--add-metadata', - *(() if CHECK_SSL_VALIDITY else ('--no-check-certificate',)), - link['url'], - ] - - end = progress(timeout, prefix=' ') - try: - result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1) - chmod_file(output, cwd=link_dir) - end() - if result.returncode: - if (b'ERROR: Unsupported URL' in result.stderr - or b'HTTP Error 404' in result.stderr - or b'HTTP Error 403' in result.stderr - or b'URL could be a direct video link' in result.stderr - or b'Unable to extract container ID' in result.stderr): - # These happen too frequently on non-media pages to warrant printing to console - pass - else: - hints = ( - 'got youtubedl response code {}:'.format(result.returncode), - *result.stderr.decode().split('\n'), - ) - raise ArchiveError('Failed to download media', hints) - except Exception as e: - end() - output = e - print_error_hints(cmd=CMD, pwd=link_dir, err=e) - - return { - 'cmd': CMD, - 'output': output, - } - - 
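
# Illustrative sketch (assumed, simplified): the @attach_result_to_link(method) decorator used by
# the fetch_* functions above. Each fetch_* returns a {'cmd': ..., 'output': ...} dict; the real
# decorator in archive_methods.py also tracks link['latest'] and _LAST_RUN_STATS, so this is only
# a minimal approximation of the pattern, not the patch's exact implementation.
from datetime import datetime
from functools import wraps

def attach_result_to_link_sketch(method):
    def decorator(fetch_func):
        @wraps(fetch_func)
        def timed_fetch_func(link_dir, link, **kwargs):
            start_ts = datetime.now().timestamp()
            result = fetch_func(link_dir, link, **kwargs)
            end_ts = datetime.now().timestamp()
            history_entry = {
                'timestamp': str(start_ts).split('.', 1)[0],
                # 'succeded' mirrors the status-key spelling used by _LAST_RUN_STATS in this patch
                'status': result.get('status') or ('succeded' if result.get('output') else 'failed'),
                'duration': int((end_ts - start_ts) * 1000),
                **result,
            }
            # append the timed result to this link's per-method history
            link.setdefault('history', {}).setdefault(method, []).append(history_entry)
            return link
        return timed_fetch_func
    return decorator
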
-@attach_result_to_link('git') -def fetch_git(link_dir, link, timeout=TIMEOUT): - """download full site using git""" - - url_is_clonable = ( - domain(link['url']) in GIT_DOMAINS - or link['url'].endswith('.git') - ) - if not url_is_clonable or is_static_file(link['url']): - return {'output': None, 'status': 'skipped'} - - output = 'git' - output_path = os.path.join(link_dir, 'git') - - if os.path.exists(output_path): - return {'output': output, 'status': 'skipped'} - - os.makedirs(output_path, exist_ok=True) - CMD = [ - GIT_BINARY, - 'clone', - '--mirror', - '--recursive', - *(() if CHECK_SSL_VALIDITY else ('-c', 'http.sslVerify=false')), - without_query(without_fragment(link['url'])), - ] - end = progress(timeout, prefix=' ') - try: - result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1) - end() - - if result.returncode == 128: - # ignore failed re-download when the folder already exists - pass - elif result.returncode > 0: - hints = 'got git response code {}:'.format(result.returncode) - raise ArchiveError('Failed git download', hints) - except Exception as e: - end() - output = e - print_error_hints(cmd=CMD, pwd=link_dir, err=e) - - return { - 'cmd': CMD, - 'output': output, - } - -def chrome_headless(binary=CHROME_BINARY, user_data_dir=CHROME_USER_DATA_DIR, headless=CHROME_HEADLESS, sandbox=CHROME_SANDBOX, check_ssl_validity=CHECK_SSL_VALIDITY, user_agent=CHROME_USER_AGENT, resolution=RESOLUTION, timeout=TIMEOUT): - global CACHED_USER_DATA_DIR - user_data_dir = user_data_dir or CACHED_USER_DATA_DIR - cmd_args = [binary] - - if headless: - cmd_args += ('--headless',) - - if not sandbox: - # dont use GPU or sandbox when running inside docker container - cmd_args += ('--no-sandbox', '--disable-gpu') - - if not check_ssl_validity: - cmd_args += ('--disable-web-security', '--ignore-certificate-errors') - - if user_agent: - cmd_args += ('--user-agent={}'.format(user_agent),) - - if resolution: - cmd_args += ('--window-size={}'.format(RESOLUTION),) - - if timeout: - cmd_args += ('--timeout={}'.format((timeout) * 1000),) - - # Find chrome user data directory - default_profile_paths = ( - '~/.config/chromium', - '~/.config/google-chrome', - '~/.config/google-chrome-beta', - '~/.config/google-chrome-unstable', - '~/Library/Application Support/Chromium', - '~/Library/Application Support/Google/Chrome', - '~/Library/Application Support/Google/Chrome Canary', - '~/AppData/Local/Chromium/User Data', - '~/AppData/Local/Google/Chrome/User Data', - '~/AppData/Local/Google/Chrome SxS/User Data', - ) - if user_data_dir: - cmd_args.append('--user-data-dir={}'.format(user_data_dir)) - else: - for path in default_profile_paths: - full_path = os.path.expanduser(path) - if os.path.exists(full_path): - CACHED_USER_DATA_DIR = full_path - cmd_args.append('--user-data-dir={}'.format(full_path)) - break - - return cmd_args - - -CACHED_USER_DATA_DIR = CHROME_USER_DATA_DIR diff --git a/archivebox/index.py b/archivebox/index.py index 694ea1dc..f21146c2 100644 --- a/archivebox/index.py +++ b/archivebox/index.py @@ -12,18 +12,24 @@ except ImportError: from config import ( OUTPUT_DIR, TEMPLATES_DIR, - ANSI, GIT_SHA, FOOTER_INFO, ) from util import ( chmod_file, derived_link_info, - pretty_path, check_link_structure, check_links_structure, wget_output_path, ) +from parse import parse_links +from links import validate_links +from logs import ( + log_indexing_started, + log_indexing_finished, + log_parsing_started, + log_parsing_finished, +) TITLE_LOADING_MSG = 'Not yet archived...' 
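
# Illustrative sketch: how the refactored index API below (load_links_index / write_links_index,
# with patch_links_index called from archive_link) is driven end-to-end by update_archive_data()
# in archive.py. Assumes the archivebox modules are importable; the bookmarks filename is a
# placeholder, not taken from the patch.
import os
from index import load_links_index, write_links_index
from archive_methods import archive_link
from config import ARCHIVE_DIR, OUTPUT_DIR

all_links, new_links = load_links_index(out_dir=OUTPUT_DIR, import_path='bookmarks_export.html')
write_links_index(out_dir=OUTPUT_DIR, links=all_links)          # write merged index up front
for link in new_links:
    link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
    archive_link(link_dir, link)                                # also patches index.json/.html in place
all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
write_links_index(out_dir=OUTPUT_DIR, links=all_links, finished=True)
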
@@ -33,21 +39,40 @@ TITLE_LOADING_MSG = 'Not yet archived...' def write_links_index(out_dir, links, finished=False): """create index.html file for a given list of links""" + log_indexing_started() check_links_structure(links) - if not os.path.exists(out_dir): - os.makedirs(out_dir) - - print('{green}[*] [{}] Saving main index files...{reset}'.format( - datetime.now().strftime('%Y-%m-%d %H:%M:%S'), - **ANSI, - )) write_json_links_index(out_dir, links) - print(' > {}/index.json'.format(pretty_path(out_dir))) + log_indexing_finished(out_dir, 'index.json') write_html_links_index(out_dir, links, finished=finished) - print(' > {}/index.html'.format(pretty_path(out_dir))) + log_indexing_finished(out_dir, 'index.html') +def load_links_index(out_dir=OUTPUT_DIR, import_path=None): + """parse and load existing index with any new links from import_path merged in""" + + existing_links = [] + if out_dir: + existing_links = parse_json_links_index(out_dir) + check_links_structure(existing_links) + + new_links = [] + if import_path: + # parse and validate the import file + log_parsing_started(import_path) + raw_links, parser_name = parse_links(import_path) + new_links = validate_links(raw_links) + check_links_structure(new_links) + + # merge existing links in out_dir and new links + all_links = validate_links(existing_links + new_links) + check_links_structure(all_links) + num_new_links = len(all_links) - len(existing_links) + + if import_path and parser_name: + log_parsing_finished(num_new_links, parser_name) + + return all_links, new_links def write_json_links_index(out_dir, links): """write the json link index to a given path""" @@ -70,8 +95,8 @@ def write_json_links_index(out_dir, links): chmod_file(path) -def parse_json_links_index(out_dir): - """load the index in a given directory and merge it with the given link""" +def parse_json_links_index(out_dir=OUTPUT_DIR): + """parse a archive index json file and return the list of links""" index_path = os.path.join(out_dir, 'index.json') if os.path.exists(index_path): with open(index_path, 'r', encoding='utf-8') as f: @@ -136,31 +161,26 @@ def write_html_links_index(out_dir, links, finished=False): chmod_file(path) -def update_main_index(link): +def patch_links_index(link, out_dir=OUTPUT_DIR): """hack to in-place update one row's info in the generated index html""" title = link['latest']['title'] successful = len([entry for entry in link['latest'].values() if entry]) # Patch JSON index - json_path = os.path.join(OUTPUT_DIR, 'index.json') - - links = parse_json_links_index(OUTPUT_DIR) - changed = False - for json_link in links: - if json_link['url'] == link['url']: - json_link['title'] = title - json_link['latest'] = link['latest'] + json_file_links = parse_json_links_index(out_dir) + for saved_link in json_file_links: + if saved_link['url'] == link['url']: + saved_link['title'] = title + saved_link['latest'] = link['latest'] changed = True break - if changed: - write_json_links_index(OUTPUT_DIR, links) + write_json_links_index(out_dir, json_file_links) # Patch HTML index - html_path = os.path.join(OUTPUT_DIR, 'index.html') - + html_path = os.path.join(out_dir, 'index.html') html = open(html_path, 'r').read().split('\n') for idx, line in enumerate(html): if title and (' {}{}'.format(pretty_path(link_dir), ' (new)' if is_new else '')) + + +def log_link_archiving_failed(cmd, pwd, err=None, hints=None, prefix=' '): + """quote the argument with whitespace in a command so the user can + copy-paste the outputted string directly to run the cmd + """ + + # Prettify 
CMD string and make it save to copy-paste by quoting arguments + quoted_cmd = ' '.join( + '"{}"'.format(arg) if ' ' in arg else arg + for arg in cmd + ) + + # Prettify error output hints string and limit to five lines + hints = hints or getattr(err, 'hints', None) + if hints: + hints = hints if isinstance(hints, (list, tuple)) else hints.split('\n') + hints = ( + ' {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset']) + for line in hints[:5] if line.strip() + ) + else: + hints = () + + output_lines = [ + '{}Failed: {} {}{}'.format(ANSI['red'], err.__class__.__name__, err, ANSI['reset']), + *hints, + 'Run to see full output:' + ' cd {};'.format(pwd), + ' {}'.format(quoted_cmd), + ] + + return '\n'.join( + '{}{}'.format(prefix, line) + for line in output_lines + if line + ) + +### Logging Helpers + +def log_parsing_started(source_file): + start_ts = datetime.now() + _LAST_RUN_STATS['parse_start_ts'] = start_ts + print('{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'.format( + start_ts.strftime('%Y-%m-%d %H:%M:%S'), + source_file.rsplit('/', 1)[-1], + **ANSI, + )) + +def log_parsing_finished(num_new_links, parser_name): + print(' > Adding {} new links to index (parsed import as {})'.format( + num_new_links, + parser_name, + )) + +def log_indexing_started(): + start_ts = datetime.now() + _LAST_RUN_STATS['index_start_ts'] = start_ts + print('{green}[*] [{}] Saving main index files...{reset}'.format( + start_ts.strftime('%Y-%m-%d %H:%M:%S'), + **ANSI, + )) + +def log_indexing_finished(out_dir, out_file): + end_ts = datetime.now() + _LAST_RUN_STATS['index_end_ts'] = end_ts + print(' > {}/{}'.format(pretty_path(out_dir), out_file)) + +def log_archiving_started(num_links, resume): + start_ts = datetime.now() + _LAST_RUN_STATS['start_ts'] = start_ts + if resume: + print('{green}[▶] [{}] Resuming archive updating for {} pages starting from {}...{reset}'.format( + start_ts.strftime('%Y-%m-%d %H:%M:%S'), + num_links, + resume, + **ANSI, + )) + else: + print('{green}[▶] [{}] Updating content for {} pages in archive...{reset}'.format( + start_ts.strftime('%Y-%m-%d %H:%M:%S'), + num_links, + **ANSI, + )) + +def log_archiving_paused(num_links, idx, timestamp): + end_ts = datetime.now() + _LAST_RUN_STATS['end_ts'] = end_ts + print() + print('\n{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format( + **ANSI, + now=end_ts.strftime('%Y-%m-%d %H:%M:%S'), + idx=idx+1, + timestamp=timestamp, + total=num_links, + )) + print(' To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', ''))) + print(' Continue where you left off by running:') + print(' {} {}'.format( + pretty_path(sys.argv[0]), + timestamp, + )) + +def log_archiving_finished(num_links): + end_ts = datetime.now() + _LAST_RUN_STATS['end_ts'] = end_ts + seconds = end_ts - _LAST_RUN_STATS['start_ts'].timestamp() + if seconds > 60: + duration = '{0:.2f} min'.format(seconds / 60, 2) + else: + duration = '{0:.2f} sec'.format(seconds, 2) + + print('{}[√] [{}] Update of {} pages complete ({}){}'.format( + ANSI['green'], + end_ts.strftime('%Y-%m-%d %H:%M:%S'), + num_links, + duration, + ANSI['reset'], + )) + print(' - {} entries skipped'.format(_LAST_RUN_STATS['skipped'])) + print(' - {} entries updated'.format(_LAST_RUN_STATS['succeded'])) + print(' - {} errors'.format(_LAST_RUN_STATS['failed'])) + print(' To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', ''))) diff --git a/archivebox/parse.py b/archivebox/parse.py index 
5549bea1..1e9a23fb 100644 --- a/archivebox/parse.py +++ b/archivebox/parse.py @@ -1,17 +1,19 @@ -# coding: utf-8 - """ -Everything related to parsing links from bookmark services. +Everything related to parsing links from input sources. For a list of supported services, see the README.md. -For examples of supported files see examples/. +For examples of supported import formats see tests/. -Parsed link schema: { +Link: { 'url': 'https://example.com/example/?abc=123&xyc=345#lmnop', - 'timestamp': '15442123124234', + 'timestamp': '1544212312.4234', 'title': 'Example.com Page Title', - 'sources': ['ril_export.html', 'downloads/getpocket.com.txt'], 'tags': 'abc,def', + 'sources': [ + 'output/sources/ril_export.html', + 'output/sources/getpocket.com-1523422111.txt', + 'output/sources/stdin-234234112312.txt' + ] } """ @@ -19,45 +21,59 @@ import re import json from datetime import datetime -from collections import OrderedDict import xml.etree.ElementTree as etree -from config import ANSI +from config import TIMEOUT from util import ( str_between, URL_REGEX, - check_url_parsing, + check_url_parsing_invariants, + progress, ) -def parse_links(path): - """parse a list of links dictionaries from a bookmark export file""" - - check_url_parsing() +def parse_links(source_file): + """parse a list of URLs with their metadata from an + RSS feed, bookmarks export, or text file + """ - links = [] - with open(path, 'r', encoding='utf-8') as file: - print('{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'.format( - datetime.now().strftime('%Y-%m-%d %H:%M:%S'), - path.rsplit('/', 1)[-1], - **ANSI, - )) + check_url_parsing_invariants() + PARSERS = ( + # Specialized parsers + ('Pocket HTML', parse_pocket_html_export), + ('Pinboard RSS', parse_pinboard_rss_export), + ('Shaarli RSS', parse_shaarli_rss_export), + ('Medium RSS', parse_medium_rss_export), + + # General parsers + ('Netscape HTML', parse_netscape_html_export), + ('Generic RSS', parse_rss_export), + ('Generic JSON', parse_json_export), - for parser_name, parser_func in PARSERS.items(): + # Fallback parser + ('Plain Text', parse_plain_text_export), + ) + end = progress(TIMEOUT * 4, prefix=' ') + with open(source_file, 'r', encoding='utf-8') as file: + for parser_name, parser_func in PARSERS: try: - links += list(parser_func(file)) + links = list(parser_func(file)) if links: - break + end() + return links, parser_name except Exception as err: - # we try each parser one by one, wong parsers will throw exeptions - # if unsupported and we accept the first one that passes - # uncomment the following line to see why the parser was unsupported for each attempted format + # Parsers are tried one by one down the list, and the first one + # that succeeds is used. To see why a certain parser was not used + # due to error or format incompatibility, uncomment this line: # print('[!] 
Parser {} failed: {} {}'.format(parser_name, err.__class__.__name__, err)) pass - return links, parser_name + end() + return [], 'Plain Text' +### Import Parser Functions + def parse_pocket_html_export(html_file): """Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)""" @@ -81,40 +97,57 @@ def parse_pocket_html_export(html_file): 'sources': [html_file.name], } -def parse_pinboard_json_export(json_file): + +def parse_json_export(json_file): """Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)""" + json_file.seek(0) - json_content = json.load(json_file) - for line in json_content: + links = json.load(json_file) + json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z') + + for link in links: # example line # {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}] - if line: - erg = line - if erg.get('timestamp'): - timestamp = str(erg['timestamp']/10000000) # chrome/ff histories use a very precise timestamp - elif erg.get('time'): - timestamp = str(datetime.strptime(erg['time'].split(',', 1)[0], '%Y-%m-%dT%H:%M:%SZ').timestamp()) - elif erg.get('created_at'): - timestamp = str(datetime.strptime(erg['created_at'], '%Y-%m-%dT%H:%M:%S%z').timestamp()) - else: - timestamp = str(datetime.now().timestamp()) - if erg.get('href'): - url = erg['href'] - else: - url = erg['url'] - if erg.get('description'): - title = (erg.get('description') or '').replace(' — Readability', '') - else: - title = erg['title'].strip() + if link: + # Parse URL + url = link.get('href') or link.get('url') or link.get('URL') + if not url: + raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]') - info = { + # Parse the timestamp + ts_str = str(datetime.now().timestamp()) + if link.get('timestamp'): + # chrome/ff histories use a very precise timestamp + ts_str = str(link['timestamp'] / 10000000) + elif link.get('time'): + ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp()) + elif link.get('created_at'): + ts_str = str(json_date(link['created_at']).timestamp()) + elif link.get('created'): + ts_str = str(json_date(link['created']).timestamp()) + elif link.get('date'): + ts_str = str(json_date(link['date']).timestamp()) + elif link.get('bookmarked'): + ts_str = str(json_date(link['bookmarked']).timestamp()) + elif link.get('saved'): + ts_str = str(json_date(link['saved']).timestamp()) + + # Parse the title + title = None + if link.get('title'): + title = link['title'].strip() or None + elif link.get('description'): + title = link['description'].replace(' — Readability', '').strip() or None + elif link.get('name'): + title = link['name'].strip() or None + + yield { 'url': url, - 'timestamp': timestamp, - 'title': title or None, - 'tags': erg.get('tags') or '', + 'timestamp': ts_str, + 'title': title, + 'tags': link.get('tags') or '', 'sources': [json_file.name], } - yield info def parse_rss_export(rss_file): @@ -139,15 +172,15 @@ def parse_rss_export(rss_file): def get_row(key): return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0] - title = str_between(get_row('title'), '', '') ts_str = str_between(get_row('pubDate'), '', '') time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z") + title = str_between(get_row('title'), '$bookmarked_date   |   Last updated: $updated_date +   |   + Total 
files: 🗃 $num_outputs
Type: $extension   |   Tags: - $tags + $tags +   |   + Status: + $status
- Download: + Archive Methods: JSON | WARC | Media | Git Repos | Favicon | - More files... + See all files...
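
# Illustrative sketch: the $-style placeholders in the template fragment above ($num_outputs,
# $tags, $status, ...) match Python's string.Template syntax. Assuming that is the substitution
# mechanism used when rendering link_index.html (the values below are made-up placeholders),
# a fragment would be filled in roughly like this:
from string import Template

row_template = Template('Total files: $num_outputs | Tags: $tags | Status: $status')
print(row_template.safe_substitute(num_outputs=7, tags='python,archiving', status='archived'))
# -> Total files: 7 | Tags: python,archiving | Status: archived
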

diff --git a/archivebox/util.py b/archivebox/util.py index 189de476..df934a80 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -8,8 +8,8 @@ from urllib.parse import urlparse, quote from decimal import Decimal from datetime import datetime from multiprocessing import Process -from subprocess import TimeoutExpired, Popen, PIPE, DEVNULL, CompletedProcess, CalledProcessError +from stdlib_patches import run, PIPE, DEVNULL from config import ( ANSI, TERM_WIDTH, @@ -19,8 +19,6 @@ from config import ( OUTPUT_PERMISSIONS, TIMEOUT, SHOW_PROGRESS, - CHECK_SSL_VALIDITY, - WGET_USER_AGENT, CURL_BINARY, WGET_BINARY, CHROME_BINARY, @@ -37,6 +35,13 @@ from config import ( FETCH_MEDIA, SUBMIT_ARCHIVE_DOT_ORG, ARCHIVE_DIR_NAME, + RESOLUTION, + CHECK_SSL_VALIDITY, + WGET_USER_AGENT, + CHROME_USER_AGENT, + CHROME_USER_DATA_DIR, + CHROME_HEADLESS, + CHROME_SANDBOX, ) ### Parsing Helpers @@ -56,6 +61,7 @@ extension = lambda url: basename(url).rsplit('.', 1)[-1].lower() if '.' in basen base_url = lambda url: without_scheme(url) # uniq base url used to dedupe links short_ts = lambda ts: ts.split('.')[0] +urlencode = lambda s: quote(s, encoding='utf-8', errors='replace') URL_REGEX = re.compile( r'http[s]?://' # start matching from allowed schemes @@ -109,66 +115,74 @@ def check_links_structure(links): def check_dependencies(): """Check that all necessary dependencies are installed, and have valid versions""" - python_vers = float('{}.{}'.format(sys.version_info.major, sys.version_info.minor)) - if python_vers < 3.5: - print('{}[X] Python version is not new enough: {} (>3.5 is required){}'.format(ANSI['red'], python_vers, ANSI['reset'])) - print(' See https://github.com/pirate/ArchiveBox#troubleshooting for help upgrading your Python installation.') + try: + python_vers = float('{}.{}'.format(sys.version_info.major, sys.version_info.minor)) + if python_vers < 3.5: + print('{}[X] Python version is not new enough: {} (>3.5 is required){}'.format(ANSI['red'], python_vers, ANSI['reset'])) + print(' See https://github.com/pirate/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.') + raise SystemExit(1) + + if FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG: + if run(['which', CURL_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([CURL_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode: + print('{red}[X] Missing dependency: curl{reset}'.format(**ANSI)) + print(' Install it, then confirm it works with: {} --version'.format(CURL_BINARY)) + print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.') + raise SystemExit(1) + + if FETCH_WGET or FETCH_WARC: + if run(['which', WGET_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([WGET_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode: + print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI)) + print(' Install it, then confirm it works with: {} --version'.format(WGET_BINARY)) + print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.') + raise SystemExit(1) + + if FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM: + if run(['which', CHROME_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode: + print('{}[X] Missing dependency: {}{}'.format(ANSI['red'], CHROME_BINARY, ANSI['reset'])) + print(' Install it, then confirm it works with: {} --version'.format(CHROME_BINARY)) + print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.') + raise SystemExit(1) + + # parse chrome --version e.g. 
Google Chrome 61.0.3114.0 canary / Chromium 59.0.3029.110 built on Ubuntu, running on Ubuntu 16.04 + try: + result = run([CHROME_BINARY, '--version'], stdout=PIPE) + version_str = result.stdout.decode('utf-8') + version_lines = re.sub("(Google Chrome|Chromium) (\\d+?)\\.(\\d+?)\\.(\\d+?).*?$", "\\2", version_str).split('\n') + version = [l for l in version_lines if l.isdigit()][-1] + if int(version) < 59: + print(version_lines) + print('{red}[X] Chrome version must be 59 or greater for headless PDF, screenshot, and DOM saving{reset}'.format(**ANSI)) + print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.') + raise SystemExit(1) + except (IndexError, TypeError, OSError): + print('{red}[X] Failed to parse Chrome version, is it installed properly?{reset}'.format(**ANSI)) + print(' Install it, then confirm it works with: {} --version'.format(CHROME_BINARY)) + print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.') + raise SystemExit(1) + + if FETCH_GIT: + if run(['which', GIT_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([GIT_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode: + print('{red}[X] Missing dependency: git{reset}'.format(**ANSI)) + print(' Install it, then confirm it works with: {} --version'.format(GIT_BINARY)) + print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.') + raise SystemExit(1) + + if FETCH_MEDIA: + if run(['which', YOUTUBEDL_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([YOUTUBEDL_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode: + print('{red}[X] Missing dependency: youtube-dl{reset}'.format(**ANSI)) + print(' Install it, then confirm it was installed with: {} --version'.format(YOUTUBEDL_BINARY)) + print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.') + raise SystemExit(1) + except (KeyboardInterrupt, Exception): raise SystemExit(1) - if FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG: - if run(['which', CURL_BINARY], stdout=DEVNULL).returncode or run([CURL_BINARY, '--version'], stdout=DEVNULL).returncode: - print('{red}[X] Missing dependency: curl{reset}'.format(**ANSI)) - print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(CURL_BINARY)) - print(' See https://github.com/pirate/ArchiveBox for help.') - raise SystemExit(1) - - if FETCH_WGET or FETCH_WARC: - if run(['which', WGET_BINARY], stdout=DEVNULL).returncode or run([WGET_BINARY, '--version'], stdout=DEVNULL).returncode: - print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI)) - print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(WGET_BINARY)) - print(' See https://github.com/pirate/ArchiveBox for help.') - raise SystemExit(1) - - if FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM: - if run(['which', CHROME_BINARY], stdout=DEVNULL).returncode: - print('{}[X] Missing dependency: {}{}'.format(ANSI['red'], CHROME_BINARY, ANSI['reset'])) - print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(CHROME_BINARY)) - print(' See https://github.com/pirate/ArchiveBox for help.') - raise SystemExit(1) - - # parse chrome --version e.g. 
Google Chrome 61.0.3114.0 canary / Chromium 59.0.3029.110 built on Ubuntu, running on Ubuntu 16.04 - try: - result = run([CHROME_BINARY, '--version'], stdout=PIPE) - version_str = result.stdout.decode('utf-8') - version_lines = re.sub("(Google Chrome|Chromium) (\\d+?)\\.(\\d+?)\\.(\\d+?).*?$", "\\2", version_str).split('\n') - version = [l for l in version_lines if l.isdigit()][-1] - if int(version) < 59: - print(version_lines) - print('{red}[X] Chrome version must be 59 or greater for headless PDF, screenshot, and DOM saving{reset}'.format(**ANSI)) - print(' See https://github.com/pirate/ArchiveBox for help.') - raise SystemExit(1) - except (IndexError, TypeError, OSError): - print('{red}[X] Failed to parse Chrome version, is it installed properly?{reset}'.format(**ANSI)) - print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(CHROME_BINARY)) - print(' See https://github.com/pirate/ArchiveBox for help.') - raise SystemExit(1) - - if FETCH_GIT: - if run(['which', GIT_BINARY], stdout=DEVNULL).returncode or run([GIT_BINARY, '--version'], stdout=DEVNULL).returncode: - print('{red}[X] Missing dependency: git{reset}'.format(**ANSI)) - print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(GIT_BINARY)) - print(' See https://github.com/pirate/ArchiveBox for help.') - raise SystemExit(1) - - if FETCH_MEDIA: - if run(['which', YOUTUBEDL_BINARY], stdout=DEVNULL).returncode or run([YOUTUBEDL_BINARY, '--version'], stdout=DEVNULL).returncode: - print('{red}[X] Missing dependency: youtube-dl{reset}'.format(**ANSI)) - print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(YOUTUBEDL_BINARY)) - print(' See https://github.com/pirate/ArchiveBox for help.') - raise SystemExit(1) - -def check_url_parsing(): +def check_url_parsing_invariants(): """Check that plain text regex URL parsing works as expected""" + + # this is last-line-of-defense to make sure the URL_REGEX isn't + # misbehaving, as the consequences could be disastrous and lead to many + # incorrect/badly parsed links being added to the archive + test_urls = ''' https://example1.com/what/is/happening.html?what=1#how-about-this=1 https://example2.com/what/is/happening/?what=1#how-about-this=1 @@ -276,22 +290,9 @@ def wget_output_path(link): if link.get('latest', {}).get('wget'): return link['latest']['wget'] - urlencode = lambda s: quote(s, encoding='utf-8', errors='replace') - if is_static_file(link['url']): return urlencode(without_scheme(without_fragment(link['url']))) - # Since the wget algorithm to for -E (appending .html) is incredibly complex - # instead of trying to emulate it here, we just look in the output folder - # to see what html file wget actually created as the output - link_dir = os.path.join(ARCHIVE_DIR, link['timestamp']) - full_path = without_fragment(without_query(path(link['url']))).strip('/') - search_dir = os.path.join( - link_dir, - domain(link['url']), - full_path, - ) - # Wget downloads can save in a number of different ways depending on the url # https://example.com # > output/archive//example.com/index.html @@ -304,6 +305,19 @@ def wget_output_path(link): # There's also lots of complexity around how the urlencoding and renaming # is done for pages with query and hash fragments or extensions like shtml / htm + + # Since the wget algorithm for -E (appending .html) is incredibly complex + # and there's no way to get the computed output path from wget + # in order to avoid having to reverse-engineer how they calculate it, + # we just look in the 
output folder read the filename wget used from the filesystem + link_dir = os.path.join(ARCHIVE_DIR, link['timestamp']) + full_path = without_fragment(without_query(path(link['url']))).strip('/') + search_dir = os.path.join( + link_dir, + domain(link['url']), + full_path, + ) + for _ in range(4): if os.path.exists(search_dir): if os.path.isdir(search_dir): @@ -356,47 +370,6 @@ def str_between(string, start, end=None): return content -def pretty_path(path): - """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc""" - return path.replace(REPO_DIR + '/', '') - - -def print_error_hints(cmd, pwd, err=None, hints=None, prefix=' '): - """quote the argument with whitespace in a command so the user can - copy-paste the outputted string directly to run the cmd - """ - - # Prettify CMD string and make it save to copy-paste by quoting arguments - quoted_cmd = ' '.join( - '"{}"'.format(arg) if ' ' in arg else arg - for arg in cmd - ) - - # Prettify error output hints string and limit to five lines - hints = hints or getattr(err, 'hints', None) - if hints: - hints = hints if isinstance(hints, (list, tuple)) else hints.split('\n') - hints = ( - ' {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset']) - for line in hints[:5] if line.strip() - ) - else: - hints = () - - output_lines = [ - '{}Failed: {} {}{}'.format(ANSI['red'], err.__class__.__name__, err, ANSI['reset']), - *hints, - 'Run to see full output:' - ' cd {};'.format(pwd), - ' {}'.format(quoted_cmd), - ] - - return '\n'.join( - '{}{}'.format(prefix, line) - for line in output_lines - if line - ) - ### Link Helpers @@ -571,37 +544,59 @@ def chmod_file(path, cwd='.', permissions=OUTPUT_PERMISSIONS, timeout=30): print(' ', chmod_result.stderr.decode()) raise Exception('Failed to chmod {}/{}'.format(cwd, path)) -def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs): - """Patched of subprocess.run to fix blocking io making timeout=innefective""" - if input is not None: - if 'stdin' in kwargs: - raise ValueError('stdin and input arguments may not both be used.') - kwargs['stdin'] = PIPE +CACHED_USER_DATA_DIR = CHROME_USER_DATA_DIR - if capture_output: - if ('stdout' in kwargs) or ('stderr' in kwargs): - raise ValueError('stdout and stderr arguments may not be used ' - 'with capture_output.') - kwargs['stdout'] = PIPE - kwargs['stderr'] = PIPE +def chrome_args(binary=CHROME_BINARY, user_data_dir=CHROME_USER_DATA_DIR, + headless=CHROME_HEADLESS, sandbox=CHROME_SANDBOX, + check_ssl_validity=CHECK_SSL_VALIDITY, user_agent=CHROME_USER_AGENT, + resolution=RESOLUTION, timeout=TIMEOUT): + """helper to build up a chrome shell command with arguments""" - with Popen(*popenargs, **kwargs) as process: - try: - stdout, stderr = process.communicate(input, timeout=timeout) - except TimeoutExpired: - process.kill() - try: - stdout, stderr = process.communicate(input, timeout=2) - except: - pass - raise TimeoutExpired(popenargs[0][0], timeout) - except BaseException as err: - process.kill() - # We don't call process.wait() as .__exit__ does that for us. 
- raise - retcode = process.poll() - if check and retcode: - raise CalledProcessError(retcode, process.args, - output=stdout, stderr=stderr) - return CompletedProcess(process.args, retcode, stdout, stderr) + global CACHED_USER_DATA_DIR + user_data_dir = user_data_dir or CACHED_USER_DATA_DIR + cmd_args = [binary] + + if headless: + cmd_args += ('--headless',) + + if not sandbox: + # dont use GPU or sandbox when running inside docker container + cmd_args += ('--no-sandbox', '--disable-gpu') + + if not check_ssl_validity: + cmd_args += ('--disable-web-security', '--ignore-certificate-errors') + + if user_agent: + cmd_args += ('--user-agent={}'.format(user_agent),) + + if resolution: + cmd_args += ('--window-size={}'.format(RESOLUTION),) + + if timeout: + cmd_args += ('--timeout={}'.format((timeout) * 1000),) + + # Find chrome user data directory + default_profile_paths = ( + '~/.config/chromium', + '~/.config/google-chrome', + '~/.config/google-chrome-beta', + '~/.config/google-chrome-unstable', + '~/Library/Application Support/Chromium', + '~/Library/Application Support/Google/Chrome', + '~/Library/Application Support/Google/Chrome Canary', + '~/AppData/Local/Chromium/User Data', + '~/AppData/Local/Google/Chrome/User Data', + '~/AppData/Local/Google/Chrome SxS/User Data', + ) + if user_data_dir: + cmd_args.append('--user-data-dir={}'.format(user_data_dir)) + else: + for path in default_profile_paths: + full_path = os.path.expanduser(path) + if os.path.exists(full_path): + CACHED_USER_DATA_DIR = full_path + cmd_args.append('--user-data-dir={}'.format(full_path)) + break + + return cmd_args
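
# Illustrative sketch: how the chrome_args() helper above is consumed by the fetch_* methods in
# archive_methods.py (this mirrors fetch_pdf). The URL, cwd, and timeout are placeholder values;
# stdlib subprocess.run is used here for self-containment, whereas the patch itself uses the
# patched run() from stdlib_patches.
from subprocess import run, PIPE
from util import chrome_args   # as defined directly above

timeout = 60
cmd = [
    *chrome_args(timeout=timeout),  # e.g. ['chromium-browser', '--headless', '--user-data-dir=...', ...]
    '--print-to-pdf',
    'https://example.com',
]
# headless Chrome writes output.pdf into the working directory
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd='/tmp', timeout=timeout + 1)
if result.returncode:
    raise Exception('Failed to print PDF:\n' + result.stderr.decode())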