diff --git a/archivebox/archive_methods.py b/archivebox/archive_methods.py index b3915e2f..56009cd1 100644 --- a/archivebox/archive_methods.py +++ b/archivebox/archive_methods.py @@ -33,6 +33,7 @@ from config import ( WGET_USER_AGENT, CHECK_SSL_VALIDITY, COOKIES_FILE, + WGET_AUTO_COMPRESSION ) from util import ( domain, @@ -224,10 +225,10 @@ def fetch_wget(link_dir, link, timeout=TIMEOUT): '--backup-converted', '--span-hosts', '--no-parent', - '--compression=auto', '-e', 'robots=off', '--restrict-file-names=unix', '--timeout={}'.format(timeout), + *(('--compression=auto',) if WGET_AUTO_COMPRESSION else ()), *(() if FETCH_WARC else ('--timestamping',)), *(('--warc-file={}'.format(warc_path),) if FETCH_WARC else ()), *(('--page-requisites',) if FETCH_WGET_REQUISITES else ()), diff --git a/archivebox/config.py b/archivebox/config.py index 7235e7ca..ec970a22 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -74,6 +74,7 @@ TEMPLATES_DIR = os.path.join(PYTHON_PATH, 'templates') CHROME_SANDBOX = os.getenv('CHROME_SANDBOX', 'True').lower() == 'true' USE_CHROME = FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM USE_WGET = FETCH_WGET or FETCH_WGET_REQUISITES or FETCH_WARC +WGET_AUTO_COMPRESSION = USE_WGET and WGET_BINARY and (not run([WGET_BINARY, "--compression=auto", "--help"], stdout=DEVNULL).returncode) ########################### Environment & Dependencies ######################### diff --git a/archivebox/parse.py b/archivebox/parse.py index ce6b0358..baaa447e 100644 --- a/archivebox/parse.py +++ b/archivebox/parse.py @@ -154,7 +154,8 @@ def parse_rss_export(rss_file): """Parse RSS XML-format files into links""" rss_file.seek(0) - items = rss_file.read().split('\n') + items = rss_file.read().split('') + items = items[1:] if items else [] for item in items: # example item: # @@ -166,7 +167,7 @@ def parse_rss_export(rss_file): # trailing_removed = item.split('', 1)[0] - leading_removed = trailing_removed.split('', 1)[-1] + leading_removed = trailing_removed.split('', 1)[-1].strip() rows = leading_removed.split('\n') def get_row(key):