From 58c9b47d433b5ef68d5fd8fa510e2bd37aff60ba Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 25 Mar 2019 16:27:50 -0400 Subject: [PATCH 1/5] fix rss parsing when items have newlines between them --- archivebox/parse.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/archivebox/parse.py b/archivebox/parse.py index ce6b0358..baaa447e 100644 --- a/archivebox/parse.py +++ b/archivebox/parse.py @@ -154,7 +154,8 @@ def parse_rss_export(rss_file): """Parse RSS XML-format files into links""" rss_file.seek(0) - items = rss_file.read().split('</item>\n<item>') + items = rss_file.read().split('<item>') + items = items[1:] if items else [] for item in items: # example item: # @@ -166,7 +167,7 @@ def parse_rss_export(rss_file): # trailing_removed = item.split('</item>', 1)[0] - leading_removed = trailing_removed.split('<item>', 1)[-1] + leading_removed = trailing_removed.split('<item>', 1)[-1].strip() rows = leading_removed.split('\n') def get_row(key): From ed7ad04fd9406f8b7df27b5fb356d09309c3917a Mon Sep 17 00:00:00 2001 From: Starbeamrainbowlabs Date: Tue, 26 Mar 2019 15:46:57 +0000 Subject: [PATCH 2/5] wget: Remove unsupported compression option --compression=auto is not supported on Raspberry Pis. This PR removes it from ArchiveBox. 
--- archivebox/archive_methods.py | 1 - 1 file changed, 1 deletion(-) diff --git a/archivebox/archive_methods.py b/archivebox/archive_methods.py index b3915e2f..bf2eec4e 100644 --- a/archivebox/archive_methods.py +++ b/archivebox/archive_methods.py @@ -224,7 +224,6 @@ def fetch_wget(link_dir, link, timeout=TIMEOUT): '--backup-converted', '--span-hosts', '--no-parent', - '--compression=auto', '-e', 'robots=off', '--restrict-file-names=unix', '--timeout={}'.format(timeout), From 51859731d382642f4520cbdeaa04a47308537f5c Mon Sep 17 00:00:00 2001 From: Starbeamrainbowlabs Date: Tue, 26 Mar 2019 23:34:45 +0000 Subject: [PATCH 3/5] config.py: add WGET_AUTO_COMPRESSION --- archivebox/config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/archivebox/config.py b/archivebox/config.py index d8e01b24..26f67b1a 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -72,6 +72,7 @@ TEMPLATES_DIR = os.path.join(PYTHON_PATH, 'templates') CHROME_SANDBOX = os.getenv('CHROME_SANDBOX', 'True').lower() == 'true' USE_CHROME = FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM USE_WGET = FETCH_WGET or FETCH_WGET_REQUISITES or FETCH_WARC +WGET_AUTO_COMPRESSION = not run([WGET_BINARY, "--compression=auto", "--help"], stdout=DEVNULL).returncode ########################### Environment & Dependencies ######################### From ab72a2dad10d2379023940d3e7bd4a19108206d7 Mon Sep 17 00:00:00 2001 From: Starbeamrainbowlabs Date: Tue, 26 Mar 2019 23:38:50 +0000 Subject: [PATCH 4/5] Try to add --compression back again --- archivebox/archive_methods.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/archivebox/archive_methods.py b/archivebox/archive_methods.py index bf2eec4e..56009cd1 100644 --- a/archivebox/archive_methods.py +++ b/archivebox/archive_methods.py @@ -33,6 +33,7 @@ from config import ( WGET_USER_AGENT, CHECK_SSL_VALIDITY, COOKIES_FILE, + WGET_AUTO_COMPRESSION ) from util import ( domain, @@ -227,6 +228,7 @@ def fetch_wget(link_dir, link, timeout=TIMEOUT): '-e', 'robots=off', 
'--restrict-file-names=unix', '--timeout={}'.format(timeout), + *(('--compression=auto',) if WGET_AUTO_COMPRESSION else ()), *(() if FETCH_WARC else ('--timestamping',)), *(('--warc-file={}'.format(warc_path),) if FETCH_WARC else ()), *(('--page-requisites',) if FETCH_WGET_REQUISITES else ()), From 6bec1709f6e5f936695a81bc7db688981ef90496 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Wed, 27 Mar 2019 04:48:38 -0400 Subject: [PATCH 5/5] Update config.py --- archivebox/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivebox/config.py b/archivebox/config.py index 26f67b1a..29ed2df2 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -72,7 +72,7 @@ TEMPLATES_DIR = os.path.join(PYTHON_PATH, 'templates') CHROME_SANDBOX = os.getenv('CHROME_SANDBOX', 'True').lower() == 'true' USE_CHROME = FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM USE_WGET = FETCH_WGET or FETCH_WGET_REQUISITES or FETCH_WARC -WGET_AUTO_COMPRESSION = not run([WGET_BINARY, "--compression=auto", "--help"], stdout=DEVNULL).returncode +WGET_AUTO_COMPRESSION = USE_WGET and WGET_BINARY and (not run([WGET_BINARY, "--compression=auto", "--help"], stdout=DEVNULL).returncode) ########################### Environment & Dependencies #########################