diff --git a/archivebox/archive_methods.py b/archivebox/archive_methods.py index 1ff03027..6fd08d0e 100644 --- a/archivebox/archive_methods.py +++ b/archivebox/archive_methods.py @@ -30,6 +30,7 @@ from config import ( OUTPUT_DIR, GIT_DOMAINS, GIT_SHA, + RESTRICT_FILE_NAMES, CURL_USER_AGENT, WGET_USER_AGENT, CHECK_SSL_VALIDITY, @@ -227,7 +228,7 @@ def fetch_wget(link_dir, link, timeout=TIMEOUT): '--span-hosts', '--no-parent', '-e', 'robots=off', - '--restrict-file-names=nocontrol', + *(('--restrict-file-names={}'.format(RESTRICT_FILE_NAMES),) if RESTRICT_FILE_NAMES else ()), '--timeout={}'.format(timeout), *(('--compression=auto',) if WGET_AUTO_COMPRESSION else ()), *(() if FETCH_WARC else ('--timestamping',)), diff --git a/archivebox/config.py b/archivebox/config.py index 18fe204c..f4907a30 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -34,6 +34,7 @@ SUBMIT_ARCHIVE_DOT_ORG = os.getenv('SUBMIT_ARCHIVE_DOT_ORG', 'True' CHECK_SSL_VALIDITY = os.getenv('CHECK_SSL_VALIDITY', 'True' ).lower() == 'true' RESOLUTION = os.getenv('RESOLUTION', '1440,2000' ) +RESTRICT_FILE_NAMES = os.getenv('RESTRICT_FILE_NAMES', 'windows' ) GIT_DOMAINS = os.getenv('GIT_DOMAINS', 'github.com,bitbucket.org,gitlab.com').split(',') CURL_USER_AGENT = os.getenv('CURL_USER_AGENT', 'ArchiveBox/{GIT_SHA} (+https://github.com/pirate/ArchiveBox/)') WGET_USER_AGENT = os.getenv('WGET_USER_AGENT', 'ArchiveBox/{GIT_SHA} (+https://github.com/pirate/ArchiveBox/) wget/{WGET_VERSION}') diff --git a/etc/ArchiveBox.conf.default b/etc/ArchiveBox.conf.default index 9ceeff17..a48ee8e2 100644 --- a/etc/ArchiveBox.conf.default +++ b/etc/ArchiveBox.conf.default @@ -39,6 +39,7 @@ #CHECK_SSL_VALIDITY=True #FETCH_WGET_REQUISITES=True +#RESTRICT_FILE_NAMES="windows" #RESOLUTION="1440,900" #CURL_USER_AGENT="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36" #WGET_USER_AGENT="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36"