From f83750c545ecdaa908de3c243c140bac36232a1d Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Jan 2019 22:38:50 -0500 Subject: [PATCH] cleanup options and make cli flags better for chrome headless timeouts --- archivebox/archive_methods.py | 15 ++++++++++----- archivebox/config.py | 23 ++++++++++++----------- 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/archivebox/archive_methods.py b/archivebox/archive_methods.py index 83d45520..65cac4da 100644 --- a/archivebox/archive_methods.py +++ b/archivebox/archive_methods.py @@ -208,10 +208,10 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, timeout=TIMEOUT if os.path.exists(domain_dir) and existing_file: return {'output': existing_file, 'status': 'skipped'} + # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html CMD = [ - # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html 'wget', - # '--server-response', + # '--server-response', # print headers for better error parsing '--no-verbose', '--timestamping', '--adjust-extension', @@ -270,6 +270,9 @@ def fetch_pdf(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DI CMD = [ *chrome_headless(user_data_dir=user_data_dir), '--print-to-pdf', + '--hide-scrollbars', + '--timeout=58000', + *(() if CHECK_SSL_VALIDITY else ('--disable-web-security', '--ignore-certificate-errors')), link['url'] ] end = progress(timeout, prefix=' ') @@ -307,6 +310,8 @@ def fetch_screenshot(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_ '--screenshot', '--window-size={}'.format(resolution), '--hide-scrollbars', + '--timeout=58000', + *(() if CHECK_SSL_VALIDITY else ('--disable-web-security', '--ignore-certificate-errors')), # '--full-page', # TODO: make this actually work using ./bin/screenshot fullPage: true link['url'], ] @@ -556,7 +561,7 @@ def fetch_warc(link_dir, link, timeout=TIMEOUT): os.makedirs(output, exist_ok=True) CMD = [ 'wget', - '--warc-file={}'.format(int(datetime.now().timestamp())), + '--warc-file="{}"'.format(int(datetime.now().timestamp())), *(('--user-agent="{}"'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()), *((() if CHECK_SSL_VALIDITY else ('--no-check-certificate',))), link['url'], @@ -598,7 +603,7 @@ def chrome_headless(binary=CHROME_BINARY, user_data_dir=CHROME_USER_DATA_DIR): args.append('--no-sandbox') default_profile = os.path.expanduser('~/Library/Application Support/Google/Chrome/Default') if user_data_dir: - args.append('--user-data-dir={}'.format(user_data_dir)) + args.append('--user-data-dir="{}"'.format(user_data_dir)) elif os.path.exists(default_profile): - args.append('--user-data-dir={}'.format(default_profile)) + args.append('--user-data-dir="{}"'.format(default_profile)) return args diff --git a/archivebox/config.py b/archivebox/config.py index 38b8b32d..61d31361 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -11,13 +11,15 @@ from subprocess import run, PIPE # ****************************************************************************** IS_TTY = sys.stdout.isatty() +ONLY_NEW = os.getenv('ONLY_NEW', 'False' ).lower() == 'true' USE_COLOR = os.getenv('USE_COLOR', str(IS_TTY) ).lower() == 'true' SHOW_PROGRESS = os.getenv('SHOW_PROGRESS', str(IS_TTY) ).lower() == 'true' -ONLY_NEW = os.getenv('ONLY_NEW', 'False' ).lower() == 'true' +OUTPUT_PERMISSIONS = os.getenv('OUTPUT_PERMISSIONS', '755' ) +MEDIA_TIMEOUT = int(os.getenv('MEDIA_TIMEOUT', '3600')) +TIMEOUT = int(os.getenv('TIMEOUT', '60')) + FETCH_WGET = os.getenv('FETCH_WGET', 'True' ).lower() == 'true' FETCH_WGET_REQUISITES = os.getenv('FETCH_WGET_REQUISITES', 'True' ).lower() == 'true' -FETCH_AUDIO = os.getenv('FETCH_AUDIO', 'False' ).lower() == 'true' -FETCH_VIDEO = os.getenv('FETCH_VIDEO', 'False' ).lower() == 'true' FETCH_PDF = os.getenv('FETCH_PDF', 'True' ).lower() == 'true' FETCH_SCREENSHOT = os.getenv('FETCH_SCREENSHOT', 'True' ).lower() == 'true' FETCH_DOM = os.getenv('FETCH_DOM', 'True' ).lower() == 'true' @@ -26,17 +28,16 @@ FETCH_GIT = os.getenv('FETCH_GIT', 'True' FETCH_MEDIA = os.getenv('FETCH_MEDIA', 'False' ).lower() == 'true' FETCH_FAVICON = os.getenv('FETCH_FAVICON', 'True' ).lower() == 'true' SUBMIT_ARCHIVE_DOT_ORG = os.getenv('SUBMIT_ARCHIVE_DOT_ORG', 'True' ).lower() == 'true' -RESOLUTION = os.getenv('RESOLUTION', '1440,1200' ) -CHECK_SSL_VALIDITY = os.getenv('CHECK_SSL_VALIDITY', 'True' ).lower() == 'true' -OUTPUT_PERMISSIONS = os.getenv('OUTPUT_PERMISSIONS', '755' ) -CHROME_BINARY = os.getenv('CHROME_BINARY', None) # change to google-chrome browser if using google-chrome -WGET_BINARY = os.getenv('WGET_BINARY', 'wget' ) + +CHECK_SSL_VALIDITY = os.getenv('CHECK_SSL_VALIDITY', 'False' ).lower() == 'true' +RESOLUTION = os.getenv('RESOLUTION', '1440,2000' ) +GIT_DOMAINS = os.getenv('GIT_DOMAINS', 'github.com,bitbucket.org,gitlab.com').split(',') WGET_USER_AGENT = os.getenv('WGET_USER_AGENT', 'ArchiveBox') CHROME_USER_DATA_DIR = os.getenv('CHROME_USER_DATA_DIR', None) -TIMEOUT = int(os.getenv('TIMEOUT', '60')) -MEDIA_TIMEOUT = int(os.getenv('MEDIA_TIMEOUT', '3600')) + +CHROME_BINARY = os.getenv('CHROME_BINARY', None) # change to google-chrome browser if using google-chrome +WGET_BINARY = os.getenv('WGET_BINARY', 'wget' ) FOOTER_INFO = os.getenv('FOOTER_INFO', 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.',) -GIT_DOMAINS = os.getenv('GIT_DOMAINS', 'github.com,bitbucket.org,gitlab.com').split(',') ### Paths REPO_DIR = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))