From 7ea36c4adb1508e134caedcc5c86bb03a129f87c Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sun, 20 Jan 2019 12:34:15 -0500 Subject: [PATCH] bump timeouts and improve curl archive method --- archivebox/archive_methods.py | 24 +++++++++++++++++------- archivebox/util.py | 2 +- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/archivebox/archive_methods.py b/archivebox/archive_methods.py index 901d3aaa..d6d32542 100644 --- a/archivebox/archive_methods.py +++ b/archivebox/archive_methods.py @@ -214,6 +214,7 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC '--span-hosts', '--no-parent', '--restrict-file-names=unix', + f'--timeout={timeout}', *(('--warc-file={}'.format(warc_path),) if warc else ()), *(('--page-requisites',) if FETCH_WGET_REQUISITES else ()), *(('--user-agent="{}"'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()), @@ -222,7 +223,7 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC ] end = progress(timeout, prefix=' ') try: - result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 1) # index.html + result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 5) # index.html end() output = wget_output_path(link, look_in=domain_dir) @@ -265,13 +266,13 @@ def fetch_pdf(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DI *chrome_headless(user_data_dir=user_data_dir), '--print-to-pdf', '--hide-scrollbars', - '--timeout=58000', + f'--timeout={timeout * 1000}', *(() if CHECK_SSL_VALIDITY else ('--disable-web-security', '--ignore-certificate-errors')), link['url'] ] end = progress(timeout, prefix=' ') try: - result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 1) # output.pdf + result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 5) # output.pdf end() if result.returncode: print(' ', (result.stderr or result.stdout).decode()) @@ -304,14 +305,14 @@ def fetch_screenshot(link_dir, 
link, timeout=TIMEOUT, user_data_dir=CHROME_USER_ '--screenshot', '--window-size={}'.format(resolution), '--hide-scrollbars', - '--timeout=58000', + f'--timeout={timeout * 1000}', *(() if CHECK_SSL_VALIDITY else ('--disable-web-security', '--ignore-certificate-errors')), # '--full-page', # TODO: make this actually work using ./bin/screenshot fullPage: true link['url'], ] end = progress(timeout, prefix=' ') try: - result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 1) # sreenshot.png + result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 5) # sreenshot.png end() if result.returncode: print(' ', (result.stderr or result.stdout).decode()) @@ -344,12 +345,13 @@ def fetch_dom(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DI CMD = [ *chrome_headless(user_data_dir=user_data_dir), '--dump-dom', + f'--timeout={timeout * 1000}', link['url'] ] end = progress(timeout, prefix=' ') try: with open(output_path, 'w+') as f: - result = run(CMD, stdout=f, stderr=PIPE, cwd=link_dir, timeout=timeout + 1) # output.html + result = run(CMD, stdout=f, stderr=PIPE, cwd=link_dir, timeout=timeout + 5) # output.html end() if result.returncode: print(' ', (result.stderr).decode()) @@ -379,7 +381,15 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT): submit_url = 'https://web.archive.org/save/{}'.format(link['url']) success = False - CMD = ['curl', '-L', '-I', '-X', 'GET', submit_url] + CMD = [ + 'curl', + '--location', + '--head', + '--max-time', str(timeout), + '--get', + *(() if CHECK_SSL_VALIDITY else ('--insecure',)), + submit_url, + ] end = progress(timeout, prefix=' ') try: result = run(CMD, stdout=PIPE, stderr=DEVNULL, cwd=link_dir, timeout=timeout + 1) # archive.org.txt diff --git a/archivebox/util.py b/archivebox/util.py index e0df3fdd..c6384a24 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -123,7 +123,7 @@ def progress(seconds=TIMEOUT, prefix=''): chunk = '█' if sys.stdout.encoding == 'UTF-8' else '#' 
chunks = TERM_WIDTH - len(prefix) - 20 # number of progress chunks to show (aka max bar width) - def progress_bar(seconds=seconds, prefix=prefix): + def progress_bar(seconds, prefix): """show timer in the form of progress bar, with percentage and seconds remaining""" try: for s in range(seconds * chunks):