diff --git a/archivebox/archive_methods.py b/archivebox/archive_methods.py index b2f04f33..6fd08d0e 100644 --- a/archivebox/archive_methods.py +++ b/archivebox/archive_methods.py @@ -30,6 +30,8 @@ from config import ( OUTPUT_DIR, GIT_DOMAINS, GIT_SHA, + RESTRICT_FILE_NAMES, + CURL_USER_AGENT, WGET_USER_AGENT, CHECK_SSL_VALIDITY, COOKIES_FILE, @@ -226,7 +228,7 @@ def fetch_wget(link_dir, link, timeout=TIMEOUT): '--span-hosts', '--no-parent', '-e', 'robots=off', - '--restrict-file-names=windows', + *(('--restrict-file-names={}'.format(RESTRICT_FILE_NAMES),) if RESTRICT_FILE_NAMES else ()), '--timeout={}'.format(timeout), *(('--compression=auto',) if WGET_AUTO_COMPRESSION else ()), *(() if FETCH_WARC else ('--timestamping',)), @@ -561,7 +563,7 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT): CURL_BINARY, '--location', '--head', - '--user-agent', 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(GIT_SHA), # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from + *(('--user-agent', '{}'.format(CURL_USER_AGENT),) if CURL_USER_AGENT else ()), # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from '--max-time', str(timeout), *(() if CHECK_SSL_VALIDITY else ('--insecure',)), submit_url, diff --git a/archivebox/config.py b/archivebox/config.py index 47f1776f..f4907a30 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -34,7 +34,9 @@ SUBMIT_ARCHIVE_DOT_ORG = os.getenv('SUBMIT_ARCHIVE_DOT_ORG', 'True' CHECK_SSL_VALIDITY = os.getenv('CHECK_SSL_VALIDITY', 'True' ).lower() == 'true' RESOLUTION = os.getenv('RESOLUTION', '1440,2000' ) +RESTRICT_FILE_NAMES = os.getenv('RESTRICT_FILE_NAMES', 'windows' ) GIT_DOMAINS = os.getenv('GIT_DOMAINS', 'github.com,bitbucket.org,gitlab.com').split(',') +CURL_USER_AGENT = os.getenv('CURL_USER_AGENT', 'ArchiveBox/{GIT_SHA} (+https://github.com/pirate/ArchiveBox/)') WGET_USER_AGENT = os.getenv('WGET_USER_AGENT', 'ArchiveBox/{GIT_SHA} (+https://github.com/pirate/ArchiveBox/) wget/{WGET_VERSION}') COOKIES_FILE = os.getenv('COOKIES_FILE', None) CHROME_USER_DATA_DIR = os.getenv('CHROME_USER_DATA_DIR', None) @@ -192,13 +194,15 @@ try: raise ### Make sure curl is installed - if FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG: + if FETCH_FAVICON or FETCH_TITLE or SUBMIT_ARCHIVE_DOT_ORG: if run(['which', CURL_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([CURL_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode: print('{red}[X] Missing dependency: curl{reset}'.format(**ANSI)) print(' Install it, then confirm it works with: {} --version'.format(CURL_BINARY)) print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.') raise SystemExit(1) + CURL_USER_AGENT = CURL_USER_AGENT.format(GIT_SHA=GIT_SHA[:9]) + ### Make sure wget is installed and calculate version if FETCH_WGET or FETCH_WARC: if run(['which', WGET_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([WGET_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode: diff --git a/etc/ArchiveBox.conf.default b/etc/ArchiveBox.conf.default index dcb8aeac..a48ee8e2 100644 --- a/etc/ArchiveBox.conf.default +++ b/etc/ArchiveBox.conf.default @@ -39,7 +39,9 @@ #CHECK_SSL_VALIDITY=True #FETCH_WGET_REQUISITES=True +#RESTRICT_FILE_NAMES="windows" #RESOLUTION="1440,900" +#CURL_USER_AGENT="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36" #WGET_USER_AGENT="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36" #CHROME_USER_AGENT="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36" #GIT_DOMAINS="github.com,bitbucket.org,gitlab.com"