From d6892643654cc02160d48b9f12f07a688c68402b Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 21 Feb 2019 15:47:15 -0500 Subject: [PATCH] add new config and dependency options --- archivebox/archive_methods.py | 30 ++++++++++---- archivebox/config.py | 19 +++++---- archivebox/util.py | 75 +++++++++++++++++++++++------------ 3 files changed, 83 insertions(+), 41 deletions(-) diff --git a/archivebox/archive_methods.py b/archivebox/archive_methods.py index c7924318..8f466efa 100644 --- a/archivebox/archive_methods.py +++ b/archivebox/archive_methods.py @@ -11,6 +11,10 @@ from peekable import Peekable from index import wget_output_path, parse_json_link_index, write_link_index from links import links_after_timestamp from config import ( + CURL_BINARY, + GIT_BINARY, + WGET_BINARY, + YOUTUBEDL_BINARY, CHROME_BINARY, FETCH_FAVICON, FETCH_TITLE, @@ -37,6 +41,7 @@ from config import ( GIT_SHA, ) from util import ( + without_hash, check_dependencies, fetch_page_title, progress, @@ -214,7 +219,7 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html CMD = [ - 'wget', + WGET_BINARY, # '--server-response', # print headers for better error parsing '--no-verbose', '--adjust-extension', @@ -417,7 +422,7 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT): success = False CMD = [ - 'curl', + CURL_BINARY, '--location', '--head', '--user-agent', 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(GIT_SHA), @@ -481,8 +486,9 @@ def fetch_favicon(link_dir, link, timeout=TIMEOUT): return {'output': 'favicon.ico', 'status': 'skipped'} CMD = [ - 'curl', + CURL_BINARY, '--max-time', str(timeout), + *(() if CHECK_SSL_VALIDITY else ('--insecure',)), 'https://www.google.com/s2/favicons?domain={domain}'.format(**link), ] fout = open('{}/favicon.ico'.format(link_dir), 'w') @@ -542,7 +548,7 @@ def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT, overwrite=False): os.makedirs(output, exist_ok=True) CMD = [ - 'youtube-dl', + YOUTUBEDL_BINARY, '--write-description', '--write-info-json', '--write-annotations', @@ -552,12 +558,15 @@ def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT, overwrite=False): '--no-check-certificate', '--user-agent', '--all-subs', - '-x', - '-k', + '--extract-audio', + '--keep-video', + '--ignore-errors', + '--geo-bypass', '--audio-format', 'mp3', '--audio-quality', '320K', '--embed-thumbnail', '--add-metadata', + *(() if CHECK_SSL_VALIDITY else ('--no-check-certificate',)), link['url'], ] @@ -605,7 +614,14 @@ def fetch_git(link_dir, link, timeout=TIMEOUT): if os.path.exists(os.path.join(link_dir, 'git')): return {'output': 'git', 'status': 'skipped'} - CMD = ['git', 'clone', '--mirror', '--recursive', link['url'].split('#')[0], 'git'] + CMD = [ + GIT_BINARY, + *(() if CHECK_SSL_VALIDITY else ('-c', 'http.sslVerify=false')), + 'clone', + '--mirror', + '--recursive', + without_hash(link['url']), + ] output = 'git' end = progress(timeout, prefix=' ') diff --git a/archivebox/config.py b/archivebox/config.py index fec3411e..d1e8488b 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -11,12 +11,13 @@ from subprocess import run, PIPE # ****************************************************************************** IS_TTY = sys.stdout.isatty() -ONLY_NEW = os.getenv('ONLY_NEW', 'False' ).lower() == 'true' USE_COLOR = os.getenv('USE_COLOR', str(IS_TTY) ).lower() == 'true' SHOW_PROGRESS = os.getenv('SHOW_PROGRESS', str(IS_TTY) ).lower() == 'true' -OUTPUT_PERMISSIONS = os.getenv('OUTPUT_PERMISSIONS', '755' ) +ONLY_NEW = os.getenv('ONLY_NEW', 'False' ).lower() == 'true' MEDIA_TIMEOUT = int(os.getenv('MEDIA_TIMEOUT', '3600')) TIMEOUT = int(os.getenv('TIMEOUT', '60')) +OUTPUT_PERMISSIONS = os.getenv('OUTPUT_PERMISSIONS', '755' ) +FOOTER_INFO = os.getenv('FOOTER_INFO', 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.',) FETCH_WGET = os.getenv('FETCH_WGET', 'True' ).lower() == 'true' FETCH_WGET_REQUISITES = os.getenv('FETCH_WGET_REQUISITES', 'True' ).lower() == 'true' @@ -33,13 +34,15 @@ SUBMIT_ARCHIVE_DOT_ORG = os.getenv('SUBMIT_ARCHIVE_DOT_ORG', 'True' CHECK_SSL_VALIDITY = os.getenv('CHECK_SSL_VALIDITY', 'True' ).lower() == 'true' RESOLUTION = os.getenv('RESOLUTION', '1440,2000' ) GIT_DOMAINS = os.getenv('GIT_DOMAINS', 'github.com,bitbucket.org,gitlab.com').split(',') -COOKIES_FILE = os.getenv('COOKIES_FILE', None) WGET_USER_AGENT = os.getenv('WGET_USER_AGENT', 'ArchiveBox/{GIT_SHA} (+https://github.com/pirate/ArchiveBox/) wget/{WGET_VERSION}') +COOKIES_FILE = os.getenv('COOKIES_FILE', None) CHROME_USER_DATA_DIR = os.getenv('CHROME_USER_DATA_DIR', None) -CHROME_BINARY = os.getenv('CHROME_BINARY', None) # change to google-chrome browser if using google-chrome -WGET_BINARY = os.getenv('WGET_BINARY', 'wget' ) -FOOTER_INFO = os.getenv('FOOTER_INFO', 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.',) +CURL_BINARY = os.getenv('CURL_BINARY', 'curl') +GIT_BINARY = os.getenv('GIT_BINARY', 'git') +WGET_BINARY = os.getenv('WGET_BINARY', 'wget') +YOUTUBEDL_BINARY = os.getenv('YOUTUBEDL_BINARY', 'youtube-dl') +CHROME_BINARY = os.getenv('CHROME_BINARY', None) ### Paths REPO_DIR = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..')) @@ -101,7 +104,7 @@ if not USE_COLOR: ### Confirm Environment Setup GIT_SHA = 'unknown' try: - GIT_SHA = run(["git", "rev-list", "-1", "HEAD", "./"], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode() + GIT_SHA = run([GIT_BINARY, 'rev-list', '-1', 'HEAD', './'], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode() except Exception: print('[!] Warning: unable to determine git version, is git installed and in your $PATH?') @@ -115,7 +118,7 @@ except Exception: WGET_VERSION = 'unknown' try: - wget_vers_str = run(["wget", "--version"], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode() + wget_vers_str = run([WGET_BINARY, "--version"], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode() WGET_VERSION = wget_vers_str.split('\n')[0].split(' ')[2] except Exception: if USE_WGET: diff --git a/archivebox/util.py b/archivebox/util.py index 576816dd..6b40b572 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -14,23 +14,30 @@ from subprocess import TimeoutExpired, Popen, PIPE, DEVNULL, CompletedProcess, C from multiprocessing import Process from config import ( - IS_TTY, - OUTPUT_PERMISSIONS, - REPO_DIR, - SOURCES_DIR, - OUTPUT_DIR, - ARCHIVE_DIR, - TIMEOUT, - TERM_WIDTH, - SHOW_PROGRESS, ANSI, + IS_TTY, + TERM_WIDTH, + REPO_DIR, + OUTPUT_DIR, + SOURCES_DIR, + ARCHIVE_DIR, + OUTPUT_PERMISSIONS, + TIMEOUT, + SHOW_PROGRESS, + CHECK_SSL_VALIDITY, + CURL_BINARY, + WGET_BINARY, CHROME_BINARY, + GIT_BINARY, + YOUTUBEDL_BINARY, + FETCH_TITLE, + FETCH_FAVICON, FETCH_WGET, + FETCH_WARC, FETCH_PDF, FETCH_SCREENSHOT, FETCH_DOM, - FETCH_FAVICON, - FETCH_TITLE, + FETCH_GIT, FETCH_MEDIA, SUBMIT_ARCHIVE_DOT_ORG, ) @@ -64,6 +71,20 @@ def check_dependencies(): print(' See https://github.com/pirate/ArchiveBox#troubleshooting for help upgrading your Python installation.') raise SystemExit(1) + if FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG: + if run(['which', CURL_BINARY], stdout=DEVNULL).returncode or run([CURL_BINARY, '--version'], stdout=DEVNULL).returncode: + print('{red}[X] Missing dependency: curl{reset}'.format(**ANSI)) + print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(CURL_BINARY)) + print(' See https://github.com/pirate/ArchiveBox for help.') + raise SystemExit(1) + + if FETCH_WGET or FETCH_WARC: + if run(['which', WGET_BINARY], stdout=DEVNULL).returncode or run([WGET_BINARY, '--version'], stdout=DEVNULL).returncode: + print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI)) + print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(WGET_BINARY)) + print(' See https://github.com/pirate/ArchiveBox for help.') + raise SystemExit(1) + if FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM: if run(['which', CHROME_BINARY], stdout=DEVNULL).returncode: print('{}[X] Missing dependency: {}{}'.format(ANSI['red'], CHROME_BINARY, ANSI['reset'])) @@ -88,24 +109,17 @@ def check_dependencies(): print(' See https://github.com/pirate/ArchiveBox for help.') raise SystemExit(1) - if FETCH_WGET: - if run(['which', 'wget'], stdout=DEVNULL).returncode or run(['wget', '--version'], stdout=DEVNULL).returncode: - print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI)) - print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format('wget')) - print(' See https://github.com/pirate/ArchiveBox for help.') - raise SystemExit(1) - - if FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG: - if run(['which', 'curl'], stdout=DEVNULL).returncode or run(['curl', '--version'], stdout=DEVNULL).returncode: - print('{red}[X] Missing dependency: curl{reset}'.format(**ANSI)) - print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format('curl')) + if FETCH_GIT: + if run(['which', GIT_BINARY], stdout=DEVNULL).returncode or run([GIT_BINARY, '--version'], stdout=DEVNULL).returncode: + print('{red}[X] Missing dependency: git{reset}'.format(**ANSI)) + print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(GIT_BINARY)) print(' See https://github.com/pirate/ArchiveBox for help.') raise SystemExit(1) if FETCH_MEDIA: - if run(['which', 'youtube-dl'], stdout=DEVNULL).returncode or run(['youtube-dl', '--version'], stdout=DEVNULL).returncode: + if run(['which', YOUTUBEDL_BINARY], stdout=DEVNULL).returncode or run([YOUTUBEDL_BINARY, '--version'], stdout=DEVNULL).returncode: print('{red}[X] Missing dependency: youtube-dl{reset}'.format(**ANSI)) - print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format('youtube-dl')) + print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(YOUTUBEDL_BINARY)) print(' See https://github.com/pirate/ArchiveBox for help.') raise SystemExit(1) @@ -246,8 +260,17 @@ def fetch_page_title(url, timeout=10, progress=SHOW_PROGRESS): sys.stdout.write('.') sys.stdout.flush() - html_content = urlopen(url, timeout=timeout).read().decode('utf-8') - match = re.search(HTML_TITLE_REGEX, html_content) + if CHECK_SSL_VALIDITY: + html_content = urlopen(url, timeout=timeout) + else: + try: + import ssl + insecure = ssl._create_unverified_context() + html_content = urlopen(url, timeout=timeout, context=insecure) + except ImportError: + html_content = urlopen(url, timeout=timeout) + + match = re.search(HTML_TITLE_REGEX, html_content.read().decode('utf-8')) return match.group(1).strip() if match else None except Exception: return None