diff --git a/archive_methods.py b/archive_methods.py index c3a44b62..356014f9 100644 --- a/archive_methods.py +++ b/archive_methods.py @@ -7,7 +7,6 @@ from subprocess import run, PIPE, DEVNULL from index import html_appended_url, parse_json_link_index, write_link_index from links import links_after_timestamp from config import ( - ARCHIVE_PERMISSIONS, ARCHIVE_DIR, CHROME_BINARY, FETCH_WGET, @@ -29,26 +28,90 @@ from util import ( chmod_file, ) -_RESULTS_TOTALS = { + +_RESULTS_TOTALS = { # globals are bad, mmkay 'skipped': 0, 'succeded': 0, 'failed': 0, } + +def archive_links(out_dir, links, export_path, resume=None): + check_dependencies() + + to_archive = links_after_timestamp(links, resume) + try: + for idx, link in enumerate(to_archive): + out_dir = os.path.join(out_dir, link['timestamp']) + archive_link(out_dir, link) + + except (KeyboardInterrupt, SystemExit, Exception) as e: + print('{red}[X] Archive update stopped on #{idx} out of {total} links{reset}'.format( + **ANSI, + idx=idx, + total=len(list(to_archive)), + )) + print(' Continue where you left off by running:') + print(' ./archive.py {} {}'.format( + export_path, + link['timestamp'], + )) + if not isinstance(e, KeyboardInterrupt): + raise e + raise SystemExit(1) + + +def archive_link(out_dir, link, overwrite=False): + """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp""" + + link = {**parse_json_link_index(out_dir), **link} + log_link_archive(out_dir, link) + + if FETCH_WGET: + link = fetch_wget(out_dir, link, overwrite=overwrite) + + if FETCH_PDF: + link = fetch_pdf(out_dir, link, overwrite=overwrite) + + if FETCH_SCREENSHOT: + link = fetch_screenshot(out_dir, link, overwrite=overwrite) + + if SUBMIT_ARCHIVE_DOT_ORG: + link = archive_dot_org(out_dir, link, overwrite=overwrite) + + # if FETCH_AUDIO: + # link = fetch_audio(out_dir, link, overwrite=overwrite) + + # if FETCH_VIDEO: + # link = fetch_video(out_dir, link, overwrite=overwrite) + + if FETCH_FAVICON: + link = fetch_favicon(out_dir, link, overwrite=overwrite) + + write_link_index(out_dir, link) + + return link + + def attach_result_to_link(method): + """ + Instead of returning a result={output:'...', status:'success'} object, + attach that result to the links's history & latest fields, then return + the updated link object. + """ def decorator(fetch_func): @wraps(fetch_func) def timed_fetch_func(out_dir, link, overwrite=False, **kwargs): # initialize methods and history json field on link - link['methods'] = link.get('methods') or {} - link['methods'][method] = link['methods'].get(method) or None + link['latest'] = link.get('latest') or {} + link['latest'][method] = link['latest'].get(method) or None link['history'] = link.get('history') or {} link['history'][method] = link['history'].get(method) or [] start_ts = datetime.now().timestamp() # if a valid method output is already present, dont run the fetch function - if link['methods'][method] and not overwrite: + if link['latest'][method] and not overwrite: print(' √ Skipping: {}'.format(method)) result = None else: @@ -74,7 +137,7 @@ def attach_result_to_link(method): history_entry['duration'] = duration history_entry.update(result or {}) link['history'][method].append(history_entry) - link['methods'][method] = result['output'] + link['latest'][method] = result['output'] _RESULTS_TOTALS[history_entry['status']] += 1 @@ -105,7 +168,6 @@ def fetch_wget(out_dir, link, requisites=FETCH_WGET_REQUISITES, timeout=TIMEOUT) print(' got wget response code {}:'.format(result.returncode)) print('\n'.join(' ' + line for line in (result.stderr or result.stdout).decode().rsplit('\n', 10)[-10:] if line.strip())) # raise Exception('Failed to wget download') - chmod_file(link['domain'], cwd=out_dir) except Exception as e: end() print(' Run to see full output:', 'cd {}; {}'.format(out_dir, ' '.join(CMD))) @@ -140,7 +202,6 @@ def fetch_pdf(out_dir, link, timeout=TIMEOUT): if result.returncode: print(' ', (result.stderr or result.stdout).decode()) raise Exception('Failed to print PDF') - chmod_file('output.pdf', cwd=out_dir) output = 'output.pdf' except Exception as e: end() @@ -338,67 +399,11 @@ def fetch_favicon(out_dir, link, timeout=TIMEOUT): # print(' √ Skipping video download') -def archive_links(out_dir, links, export_path, resume=None): - check_dependencies() - - to_archive = links_after_timestamp(links, resume) - try: - for idx, link in enumerate(to_archive): - out_dir = os.path.join(out_dir, link['timestamp']) - archive_link(out_dir, link) - - except (KeyboardInterrupt, SystemExit, Exception) as e: - print('{red}[X] Archive update stopped on #{idx} out of {total} links{reset}'.format( - **ANSI, - idx=idx, - total=len(list(to_archive)), - )) - print(' Continue where you left off by running:') - print(' ./archive.py {} {}'.format( - export_path, - link['timestamp'], - )) - if not isinstance(e, KeyboardInterrupt): - raise e - raise SystemExit(1) - - -def archive_link(out_dir, link, overwrite=False, permissions=ARCHIVE_PERMISSIONS): - """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp""" - - link = {**parse_json_link_index(out_dir), **link} - log_link_archive(out_dir, link) - - if FETCH_WGET: - link = fetch_wget(out_dir, link, overwrite=overwrite) - - if FETCH_PDF: - link = fetch_pdf(out_dir, link, overwrite=overwrite) - - if FETCH_SCREENSHOT: - link = fetch_screenshot(out_dir, link, overwrite=overwrite) - - if SUBMIT_ARCHIVE_DOT_ORG: - link = archive_dot_org(out_dir, link, overwrite=overwrite) - - # if FETCH_AUDIO: - # link = fetch_audio(out_dir, link, overwrite=overwrite) - - # if FETCH_VIDEO: - # link = fetch_video(out_dir, link, overwrite=overwrite) - - if FETCH_FAVICON: - link = fetch_favicon(out_dir, link, overwrite=overwrite) - - write_link_index(out_dir, link) - - return link def log_link_archive(out_dir, link): update_existing = os.path.exists(out_dir) if not update_existing: os.makedirs(out_dir) - run(['chmod', ARCHIVE_PERMISSIONS, out_dir], timeout=5) print('[{symbol_color}{symbol}{reset}] [{timestamp}] "{title}": {blue}{base_url}{reset}'.format( symbol='*' if update_existing else '+', diff --git a/config.py b/config.py index bca95d4b..bdd07c0b 100644 --- a/config.py +++ b/config.py @@ -4,14 +4,11 @@ import shutil from subprocess import run, PIPE -# os.getenv('VARIABLE', 'DEFAULT') gets the value of environment -# variable "VARIABLE" and if it is not set, sets it to 'DEFAULT' - -# for boolean values, check to see if the string is 'true', and -# if so, the python variable will be True - -# ******************************************************************************* -# *** TO SET YOUR PREFERENCES, EDIT THE VALUES HERE, or use the 'env' command *** +# ****************************************************************************** +# * TO SET YOUR CONFIGURATION, EDIT THE VALUES BELOW, or use the 'env' command * +# * e.g. * +# * env USE_COLOR=True CHROME_BINARY=google-chrome ./archive.py export.html * +# ****************************************************************************** IS_TTY = sys.stdout.isatty() USE_COLOR = os.getenv('USE_COLOR', str(IS_TTY) ).lower() == 'true' @@ -35,8 +32,16 @@ LINK_INDEX_TEMPLATE = os.getenv('LINK_INDEX_TEMPLATE', 'templates/link_ind INDEX_TEMPLATE = os.getenv('INDEX_TEMPLATE', 'templates/index.html') INDEX_ROW_TEMPLATE = os.getenv('INDEX_ROW_TEMPLATE', 'templates/index_row.html') -# ******************************************************************************* +### Output Paths +ROOT_FOLDER = os.path.dirname(os.path.abspath(__file__)) +HTML_FOLDER = os.path.join(ARCHIVE_DIR, 'html') +ARCHIVE_FOLDER = os.path.join(HTML_FOLDER, 'archive') +# ****************************************************************************** +# ********************** Do not edit below this point ************************** +# ****************************************************************************** + +### Terminal Configuration TERM_WIDTH = shutil.get_terminal_size((100, 10)).columns ANSI = { 'reset': '\033[00;00m', @@ -53,17 +58,13 @@ if not USE_COLOR: # dont show colors if USE_COLOR is False ANSI = {k: '' for k in ANSI.keys()} - -ROOT_FOLDER = os.path.dirname(os.path.abspath(__file__)) -HTML_FOLDER = os.path.join(ARCHIVE_DIR, 'html') -ARCHIVE_FOLDER = os.path.join(HTML_FOLDER, 'archive') +### Confirm Environment Setup try: GIT_SHA = run(["git", "rev-list", "-1", "HEAD", "./"], stdout=PIPE, cwd=ROOT_FOLDER).stdout.strip().decode() except Exception: GIT_SHA = None print('[!] Warning, you need git installed for some archiving features to save correct version numbers!') - if sys.stdout.encoding.upper() != 'UTF-8': print('[X] Your system is running python3 scripts with a bad locale setting: {} (it should be UTF-8).'.format(sys.stdout.encoding)) print(' To fix it, add the line "export PYTHONIOENCODING=utf8" to your ~/.bashrc file (without quotes)')