From 4e69d2c9e14bbbc4597731fdc349f5461a726b54 Mon Sep 17 00:00:00 2001 From: Ben Muthalaly Date: Wed, 21 Feb 2024 15:13:06 -0600 Subject: [PATCH 1/6] Add `EXTRA_*_ARGS` for wget, curl, and singlefile --- archivebox/config.py | 8 +++++++- archivebox/extractors/archive_org.py | 13 ++++++++++--- archivebox/extractors/favicon.py | 18 ++++++++++++++---- archivebox/extractors/headers.py | 14 ++++++++++---- archivebox/extractors/singlefile.py | 25 +++++++++---------------- archivebox/extractors/title.py | 13 ++++++++++--- archivebox/extractors/wget.py | 15 +++++++++++---- archivebox/util.py | 17 +++++++++++++++++ 8 files changed, 88 insertions(+), 35 deletions(-) diff --git a/archivebox/config.py b/archivebox/config.py index 1edd2eeb..ebb939a4 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -187,12 +187,15 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { '--no-parent', '-e', 'robots=off', ]}, + 'WGET_EXTRA_ARGS': {'type': list, 'default': None}, 'CURL_ARGS': {'type': list, 'default': ['--silent', '--location', '--compressed' ]}, + 'CURL_EXTRA_ARGS': {'type': list, 'default': None}, 'GIT_ARGS': {'type': list, 'default': ['--recursive']}, - 'SINGLEFILE_ARGS': {'type': list, 'default' : None}, + 'SINGLEFILE_ARGS': {'type': list, 'default': None}, + 'SINGLEFILE_EXTRA_ARGS': {'type': list, 'default': None}, 'FAVICON_PROVIDER': {'type': str, 'default': 'https://www.google.com/s2/favicons?domain={}'}, }, @@ -530,6 +533,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = { 'CURL_VERSION': {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None}, 'CURL_USER_AGENT': {'default': lambda c: c['CURL_USER_AGENT'].format(**c)}, 'CURL_ARGS': {'default': lambda c: c['CURL_ARGS'] or []}, + 'CURL_EXTRA_ARGS': {'default': lambda c: c['CURL_EXTRA_ARGS'] or []}, 'SAVE_FAVICON': {'default': lambda c: c['USE_CURL'] and c['SAVE_FAVICON']}, 'SAVE_ARCHIVE_DOT_ORG': {'default': lambda c: c['USE_CURL'] and c['SAVE_ARCHIVE_DOT_ORG']}, @@ -540,12 +544,14 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = { 'SAVE_WGET': {'default': lambda c: c['USE_WGET'] and c['SAVE_WGET']}, 'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']}, 'WGET_ARGS': {'default': lambda c: c['WGET_ARGS'] or []}, + 'WGET_EXTRA_ARGS': {'default': lambda c: c['WGET_EXTRA_ARGS'] or []}, 'RIPGREP_VERSION': {'default': lambda c: bin_version(c['RIPGREP_BINARY']) if c['USE_RIPGREP'] else None}, 'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']}, 'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None}, 'SINGLEFILE_ARGS': {'default': lambda c: c['SINGLEFILE_ARGS'] or []}, + 'SINGLEFILE_EXTRA_ARGS': {'default': lambda c: c['SINGLEFILE_EXTRA_ARGS'] or []}, 'USE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['SAVE_READABILITY']}, 'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None}, diff --git a/archivebox/extractors/archive_org.py b/archivebox/extractors/archive_org.py index a0883113..93730f26 100644 --- a/archivebox/extractors/archive_org.py +++ b/archivebox/extractors/archive_org.py @@ -10,10 +10,12 @@ from ..system import run, chmod_file from ..util import ( enforce_types, is_static_file, + dedupe, ) from ..config import ( TIMEOUT, CURL_ARGS, + CURL_EXTRA_ARGS, CHECK_SSL_VALIDITY, SAVE_ARCHIVE_DOT_ORG, CURL_BINARY, @@ -44,13 +46,18 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int= output: ArchiveOutput = 'archive.org.txt' archive_org_url = None submit_url = 'https://web.archive.org/save/{}'.format(link.url) - cmd = [ - CURL_BINARY, - *CURL_ARGS, + # earlier options take precedence + options = [ '--head', '--max-time', str(timeout), *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []), *([] if CHECK_SSL_VALIDITY else ['--insecure']), + *CURL_EXTRA_ARGS, + *CURL_ARGS, + ] + cmd = [ + CURL_BINARY, + *dedupe(*options), submit_url, ] status = 'succeeded' diff --git a/archivebox/extractors/favicon.py b/archivebox/extractors/favicon.py index 5baafc17..3b41f349 100644 --- a/archivebox/extractors/favicon.py +++ b/archivebox/extractors/favicon.py @@ -6,13 +6,18 @@ from typing import Optional from ..index.schema import Link, ArchiveResult, ArchiveOutput from ..system import chmod_file, run -from ..util import enforce_types, domain +from ..util import ( + enforce_types, + domain, + dedupe, +) from ..config import ( TIMEOUT, SAVE_FAVICON, FAVICON_PROVIDER, CURL_BINARY, CURL_ARGS, + CURL_EXTRA_ARGS, CURL_VERSION, CHECK_SSL_VALIDITY, CURL_USER_AGENT, @@ -34,13 +39,18 @@ def save_favicon(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) out_dir = out_dir or link.link_dir output: ArchiveOutput = 'favicon.ico' - cmd = [ - CURL_BINARY, - *CURL_ARGS, + # earlier options take precedence + options = [ '--max-time', str(timeout), '--output', str(output), *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []), *([] if CHECK_SSL_VALIDITY else ['--insecure']), + *CURL_EXTRA_ARGS, + *CURL_ARGS, + ] + cmd = [ + CURL_BINARY, + *dedupe(*options), FAVICON_PROVIDER.format(domain(link.url)), ] status = 'failed' diff --git a/archivebox/extractors/headers.py b/archivebox/extractors/headers.py index 91dcb8e3..3828de93 100644 --- a/archivebox/extractors/headers.py +++ b/archivebox/extractors/headers.py @@ -9,11 +9,13 @@ from ..system import atomic_write from ..util import ( enforce_types, get_headers, + dedupe, ) from ..config import ( TIMEOUT, CURL_BINARY, CURL_ARGS, + CURL_EXTRA_ARGS, CURL_USER_AGENT, CURL_VERSION, CHECK_SSL_VALIDITY, @@ -40,14 +42,18 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) status = 'succeeded' timer = TimedProgress(timeout, prefix=' ') - - cmd = [ - CURL_BINARY, - *CURL_ARGS, + # earlier options take precedence + options = [ '--head', '--max-time', str(timeout), *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []), *([] if CHECK_SSL_VALIDITY else ['--insecure']), + *CURL_EXTRA_ARGS, + *CURL_ARGS, + ] + cmd = [ + CURL_BINARY, + *dedupe(*options), link.url, ] try: diff --git a/archivebox/extractors/singlefile.py b/archivebox/extractors/singlefile.py index e50b3932..b2119119 100644 --- a/archivebox/extractors/singlefile.py +++ b/archivebox/extractors/singlefile.py @@ -11,6 +11,7 @@ from ..util import ( enforce_types, is_static_file, chrome_args, + dedupe, ) from ..config import ( TIMEOUT, @@ -18,6 +19,7 @@ from ..config import ( DEPENDENCIES, SINGLEFILE_VERSION, SINGLEFILE_ARGS, + SINGLEFILE_EXTRA_ARGS, CHROME_BINARY, ) from ..logging_util import TimedProgress @@ -46,11 +48,6 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO # SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:])) - options = [ - *SINGLEFILE_ARGS, - '--browser-executable-path={}'.format(CHROME_BINARY), - browser_args, - ] # Deduplicate options (single-file doesn't like when you use the same option two times) # @@ -58,19 +55,15 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO # My logic is SINGLEFILE_ARGS is the option that affects the singlefile command with most # specificity, therefore the user sets it with a lot intent, therefore it should take precedence # kind of like the ergonomic principle of lexical scope in programming languages. - seen_option_names = [] - def test_seen(argument): - option_name = argument.split("=")[0] - if option_name in seen_option_names: - return False - else: - seen_option_names.append(option_name) - return True - deduped_options = list(filter(test_seen, options)) - + options = [ + '--browser-executable-path={}'.format(CHROME_BINARY), + browser_args, + *SINGLEFILE_EXTRA_ARGS, + *SINGLEFILE_ARGS, + ] cmd = [ DEPENDENCIES['SINGLEFILE_BINARY']['path'], - *deduped_options, + *dedupe(*options), link.url, output, ] diff --git a/archivebox/extractors/title.py b/archivebox/extractors/title.py index 6b0e37f6..b2b65af2 100644 --- a/archivebox/extractors/title.py +++ b/archivebox/extractors/title.py @@ -10,6 +10,7 @@ from ..util import ( enforce_types, download_url, htmldecode, + dedupe, ) from ..config import ( TIMEOUT, @@ -17,6 +18,7 @@ from ..config import ( SAVE_TITLE, CURL_BINARY, CURL_ARGS, + CURL_EXTRA_ARGS, CURL_VERSION, CURL_USER_AGENT, ) @@ -102,12 +104,17 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) - from core.models import Snapshot output: ArchiveOutput = None - cmd = [ - CURL_BINARY, - *CURL_ARGS, + # earlier options take precedence + options = [ '--max-time', str(timeout), *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []), *([] if CHECK_SSL_VALIDITY else ['--insecure']), + *CURL_EXTRA_ARGS, + *CURL_ARGS, + ] + cmd = [ + CURL_BINARY, + *dedupe(*options), link.url, ] status = 'succeeded' diff --git a/archivebox/extractors/wget.py b/archivebox/extractors/wget.py index f3057271..d50409b6 100644 --- a/archivebox/extractors/wget.py +++ b/archivebox/extractors/wget.py @@ -15,9 +15,11 @@ from ..util import ( path, domain, urldecode, + dedupe, ) from ..config import ( WGET_ARGS, + WGET_EXTRA_ARGS, TIMEOUT, SAVE_WGET, SAVE_WARC, @@ -55,10 +57,8 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html output: ArchiveOutput = None - cmd = [ - WGET_BINARY, - # '--server-response', # print headers for better error parsing - *WGET_ARGS, + # earlier options take precedence + options = [ '--timeout={}'.format(timeout), *(['--restrict-file-names={}'.format(RESTRICT_FILE_NAMES)] if RESTRICT_FILE_NAMES else []), *(['--warc-file={}'.format(str(warc_path))] if SAVE_WARC else []), @@ -68,6 +68,13 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> *(['--compression=auto'] if WGET_AUTO_COMPRESSION else []), *([] if SAVE_WARC else ['--timestamping']), *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']), + # '--server-response', # print headers for better error parsing + *WGET_EXTRA_ARGS, + *WGET_ARGS, + ] + cmd = [ + WGET_BINARY, + *dedupe(*options), link.url, ] diff --git a/archivebox/util.py b/archivebox/util.py index 5321081c..6b31c86e 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -317,6 +317,23 @@ def ansi_to_html(text): return COLOR_REGEX.sub(single_sub, text) +@enforce_types +def dedupe(*options: List[str]) -> List[str]: + """ + Deduplicates the given options. Options that come earlier in the list clobber + later conflicting options. + """ + seen_option_names = [] + def test_seen(argument): + option_name = argument.split("=")[0] + if option_name in seen_option_names: + return False + else: + seen_option_names.append(option_name) + return True + return list(filter(test_seen, options)) + + class AttributeDict(dict): """Helper to allow accessing dict values via Example.key or Example['key']""" From ab8f395e0a4104dd01385be3d8fcea082a6987ee Mon Sep 17 00:00:00 2001 From: Ben Muthalaly Date: Fri, 23 Feb 2024 15:40:31 -0600 Subject: [PATCH 2/6] Add `YOUTUBEDL_EXTRA_ARGS` --- archivebox/config.py | 1 + archivebox/extractors/media.py | 12 +++++++++--- archivebox/extractors/wget.py | 2 +- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/archivebox/config.py b/archivebox/config.py index ebb939a4..00e3b9f0 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -176,6 +176,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { '--add-metadata', '--format=(bv*+ba/b)[filesize<={}][filesize_approx<=?{}]/(bv*+ba/b)'.format(c['MEDIA_MAX_SIZE'], c['MEDIA_MAX_SIZE']), ]}, + 'YOUTUBEDL_EXTRA_ARGS': {'type': list, 'default': None}, 'WGET_ARGS': {'type': list, 'default': ['--no-verbose', diff --git a/archivebox/extractors/media.py b/archivebox/extractors/media.py index 7d73024f..862bb758 100644 --- a/archivebox/extractors/media.py +++ b/archivebox/extractors/media.py @@ -8,11 +8,13 @@ from ..system import run, chmod_file from ..util import ( enforce_types, is_static_file, + dedupe, ) from ..config import ( MEDIA_TIMEOUT, SAVE_MEDIA, YOUTUBEDL_ARGS, + YOUTUBEDL_EXTRA_ARGS, YOUTUBEDL_BINARY, YOUTUBEDL_VERSION, CHECK_SSL_VALIDITY @@ -39,11 +41,15 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME output: ArchiveOutput = 'media' output_path = out_dir / output output_path.mkdir(exist_ok=True) - cmd = [ - YOUTUBEDL_BINARY, - *YOUTUBEDL_ARGS, + options = [ *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']), # TODO: add --cookies-from-browser={CHROME_USER_DATA_DIR} + *YOUTUBEDL_EXTRA_ARGS, + *YOUTUBEDL_ARGS, + ] + cmd = [ + YOUTUBEDL_BINARY, + *dedupe(*options), link.url, ] status = 'succeeded' diff --git a/archivebox/extractors/wget.py b/archivebox/extractors/wget.py index d50409b6..5209cde9 100644 --- a/archivebox/extractors/wget.py +++ b/archivebox/extractors/wget.py @@ -69,7 +69,7 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> *([] if SAVE_WARC else ['--timestamping']), *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']), # '--server-response', # print headers for better error parsing - *WGET_EXTRA_ARGS, + *WGET_EXTRA_ARGS, *WGET_ARGS, ] cmd = [ From 4d9c5a7b4b0bc0f490b6d8928878853fad363d16 Mon Sep 17 00:00:00 2001 From: Ben Muthalaly Date: Fri, 23 Feb 2024 18:40:03 -0600 Subject: [PATCH 3/6] Add `CHROME_EXTRA_ARGS` Also fix `YOUTUBEDL_EXTRA_ARGS`. --- archivebox/config.py | 4 ++++ archivebox/util.py | 12 +++++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/archivebox/config.py b/archivebox/config.py index 00e3b9f0..f8e56036 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -152,6 +152,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { 'CHROME_TIMEOUT': {'type': int, 'default': 0}, 'CHROME_HEADLESS': {'type': bool, 'default': True}, 'CHROME_SANDBOX': {'type': bool, 'default': lambda c: not c['IN_DOCKER']}, + 'CHROME_EXTRA_ARGS': {'type': list, 'default': None}, + 'YOUTUBEDL_ARGS': {'type': list, 'default': lambda c: [ '--restrict-filenames', '--trim-filenames', '128', @@ -568,6 +570,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = { 'YOUTUBEDL_VERSION': {'default': lambda c: bin_version(c['YOUTUBEDL_BINARY']) if c['USE_YOUTUBEDL'] else None}, 'SAVE_MEDIA': {'default': lambda c: c['USE_YOUTUBEDL'] and c['SAVE_MEDIA']}, 'YOUTUBEDL_ARGS': {'default': lambda c: c['YOUTUBEDL_ARGS'] or []}, + 'YOUTUBEDL_EXTRA_ARGS': {'default': lambda c: c['YOUTUBEDL_EXTRA_ARGS'] or []}, 'CHROME_BINARY': {'default': lambda c: c['CHROME_BINARY'] or find_chrome_binary()}, 'USE_CHROME': {'default': lambda c: c['USE_CHROME'] and c['CHROME_BINARY'] and (c['SAVE_PDF'] or c['SAVE_SCREENSHOT'] or c['SAVE_DOM'] or c['SAVE_SINGLEFILE'])}, @@ -589,6 +592,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = { 'EXTERNAL_LOCATIONS': {'default': lambda c: get_external_locations(c)}, 'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)}, 'CHROME_OPTIONS': {'default': lambda c: get_chrome_info(c)}, + 'CHROME_EXTRA_ARGS': {'default': lambda c: c['CHROME_EXTRA_ARGS'] or []}, 'SAVE_ALLOWLIST_PTN': {'default': lambda c: c['SAVE_ALLOWLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_ALLOWLIST'].items()}}, 'SAVE_DENYLIST_PTN': {'default': lambda c: c['SAVE_DENYLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_DENYLIST'].items()}}, } diff --git a/archivebox/util.py b/archivebox/util.py index 6b31c86e..18ca08aa 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -227,7 +227,11 @@ def chrome_args(**options) -> List[str]: # Chrome CLI flag documentation: https://peter.sh/experiments/chromium-command-line-switches/ - from .config import CHROME_OPTIONS, CHROME_VERSION + from .config import ( + CHROME_OPTIONS, + CHROME_VERSION, + CHROME_EXTRA_ARGS, + ) options = {**CHROME_OPTIONS, **options} @@ -279,8 +283,10 @@ def chrome_args(**options) -> List[str]: if options['CHROME_USER_DATA_DIR']: cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR'])) - - return cmd_args + + cmd_args += CHROME_EXTRA_ARGS + + return dedupe(*cmd_args) def chrome_cleanup(): """ From d74ddd42ae104004e656929036c55f972a9d63d4 Mon Sep 17 00:00:00 2001 From: Ben Muthalaly Date: Fri, 1 Mar 2024 14:50:32 -0600 Subject: [PATCH 4/6] Flip dedupe precedence order --- archivebox/extractors/archive_org.py | 6 +++--- archivebox/extractors/favicon.py | 6 +++--- archivebox/extractors/headers.py | 6 +++--- archivebox/extractors/media.py | 5 +++-- archivebox/extractors/singlefile.py | 14 ++++---------- archivebox/extractors/title.py | 6 +++--- archivebox/extractors/wget.py | 6 +++--- archivebox/util.py | 24 +++++++++++------------- 8 files changed, 33 insertions(+), 40 deletions(-) diff --git a/archivebox/extractors/archive_org.py b/archivebox/extractors/archive_org.py index 93730f26..0d45534a 100644 --- a/archivebox/extractors/archive_org.py +++ b/archivebox/extractors/archive_org.py @@ -46,14 +46,14 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int= output: ArchiveOutput = 'archive.org.txt' archive_org_url = None submit_url = 'https://web.archive.org/save/{}'.format(link.url) - # earlier options take precedence + # later options take precedence options = [ + *CURL_ARGS, + *CURL_EXTRA_ARGS, '--head', '--max-time', str(timeout), *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []), *([] if CHECK_SSL_VALIDITY else ['--insecure']), - *CURL_EXTRA_ARGS, - *CURL_ARGS, ] cmd = [ CURL_BINARY, diff --git a/archivebox/extractors/favicon.py b/archivebox/extractors/favicon.py index 3b41f349..fffa3d16 100644 --- a/archivebox/extractors/favicon.py +++ b/archivebox/extractors/favicon.py @@ -39,14 +39,14 @@ def save_favicon(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) out_dir = out_dir or link.link_dir output: ArchiveOutput = 'favicon.ico' - # earlier options take precedence + # later options take precedence options = [ + *CURL_ARGS, + *CURL_EXTRA_ARGS, '--max-time', str(timeout), '--output', str(output), *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []), *([] if CHECK_SSL_VALIDITY else ['--insecure']), - *CURL_EXTRA_ARGS, - *CURL_ARGS, ] cmd = [ CURL_BINARY, diff --git a/archivebox/extractors/headers.py b/archivebox/extractors/headers.py index 3828de93..9be14331 100644 --- a/archivebox/extractors/headers.py +++ b/archivebox/extractors/headers.py @@ -42,14 +42,14 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) status = 'succeeded' timer = TimedProgress(timeout, prefix=' ') - # earlier options take precedence + # later options take precedence options = [ + *CURL_ARGS, + *CURL_EXTRA_ARGS, '--head', '--max-time', str(timeout), *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []), *([] if CHECK_SSL_VALIDITY else ['--insecure']), - *CURL_EXTRA_ARGS, - *CURL_ARGS, ] cmd = [ CURL_BINARY, diff --git a/archivebox/extractors/media.py b/archivebox/extractors/media.py index 862bb758..a6d4e81f 100644 --- a/archivebox/extractors/media.py +++ b/archivebox/extractors/media.py @@ -41,11 +41,12 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME output: ArchiveOutput = 'media' output_path = out_dir / output output_path.mkdir(exist_ok=True) + # later options take precedence options = [ + *YOUTUBEDL_ARGS, + *YOUTUBEDL_EXTRA_ARGS, *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']), # TODO: add --cookies-from-browser={CHROME_USER_DATA_DIR} - *YOUTUBEDL_EXTRA_ARGS, - *YOUTUBEDL_ARGS, ] cmd = [ YOUTUBEDL_BINARY, diff --git a/archivebox/extractors/singlefile.py b/archivebox/extractors/singlefile.py index b2119119..5021a6cc 100644 --- a/archivebox/extractors/singlefile.py +++ b/archivebox/extractors/singlefile.py @@ -48,18 +48,12 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO # SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:])) - - # Deduplicate options (single-file doesn't like when you use the same option two times) - # - # NOTE: Options names that come first clobber conflicting names that come later - # My logic is SINGLEFILE_ARGS is the option that affects the singlefile command with most - # specificity, therefore the user sets it with a lot intent, therefore it should take precedence - # kind of like the ergonomic principle of lexical scope in programming languages. + # later options take precedence options = [ - '--browser-executable-path={}'.format(CHROME_BINARY), - browser_args, - *SINGLEFILE_EXTRA_ARGS, *SINGLEFILE_ARGS, + *SINGLEFILE_EXTRA_ARGS, + browser_args, + '--browser-executable-path={}'.format(CHROME_BINARY), ] cmd = [ DEPENDENCIES['SINGLEFILE_BINARY']['path'], diff --git a/archivebox/extractors/title.py b/archivebox/extractors/title.py index b2b65af2..4f34ca81 100644 --- a/archivebox/extractors/title.py +++ b/archivebox/extractors/title.py @@ -104,13 +104,13 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) - from core.models import Snapshot output: ArchiveOutput = None - # earlier options take precedence + # later options take precedence options = [ + *CURL_ARGS, + *CURL_EXTRA_ARGS, '--max-time', str(timeout), *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []), *([] if CHECK_SSL_VALIDITY else ['--insecure']), - *CURL_EXTRA_ARGS, - *CURL_ARGS, ] cmd = [ CURL_BINARY, diff --git a/archivebox/extractors/wget.py b/archivebox/extractors/wget.py index 5209cde9..885e31f5 100644 --- a/archivebox/extractors/wget.py +++ b/archivebox/extractors/wget.py @@ -57,8 +57,10 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html output: ArchiveOutput = None - # earlier options take precedence + # later options take precedence options = [ + *WGET_ARGS, + *WGET_EXTRA_ARGS, '--timeout={}'.format(timeout), *(['--restrict-file-names={}'.format(RESTRICT_FILE_NAMES)] if RESTRICT_FILE_NAMES else []), *(['--warc-file={}'.format(str(warc_path))] if SAVE_WARC else []), @@ -69,8 +71,6 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> *([] if SAVE_WARC else ['--timestamping']), *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']), # '--server-response', # print headers for better error parsing - *WGET_EXTRA_ARGS, - *WGET_ARGS, ] cmd = [ WGET_BINARY, diff --git a/archivebox/util.py b/archivebox/util.py index 18ca08aa..10ceebd4 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -240,6 +240,8 @@ def chrome_args(**options) -> List[str]: cmd_args = [options['CHROME_BINARY']] + cmd_args += CHROME_EXTRA_ARGS + if options['CHROME_HEADLESS']: chrome_major_version = int(re.search(r'\s(\d+)\.\d', CHROME_VERSION)[1]) if chrome_major_version >= 111: @@ -284,7 +286,6 @@ def chrome_args(**options) -> List[str]: if options['CHROME_USER_DATA_DIR']: cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR'])) - cmd_args += CHROME_EXTRA_ARGS return dedupe(*cmd_args) @@ -324,20 +325,17 @@ def ansi_to_html(text): @enforce_types -def dedupe(*options: List[str]) -> List[str]: +def dedupe(*options: str) -> List[str]: """ - Deduplicates the given options. Options that come earlier in the list clobber - later conflicting options. + Deduplicates the given options. Options that come later clobber earlier + conflicting options. """ - seen_option_names = [] - def test_seen(argument): - option_name = argument.split("=")[0] - if option_name in seen_option_names: - return False - else: - seen_option_names.append(option_name) - return True - return list(filter(test_seen, options)) + deduped = {} + + for option in options: + deduped[option.split('=')[0]] = option + + return list(deduped.values()) class AttributeDict(dict): From d8cf09c21e2d6e3ece8a7e5c93d537596c3687d0 Mon Sep 17 00:00:00 2001 From: Ben Muthalaly Date: Tue, 5 Mar 2024 21:13:45 -0600 Subject: [PATCH 5/6] Remove unnecessary variable length args for dedupe --- archivebox/extractors/archive_org.py | 2 +- archivebox/extractors/favicon.py | 2 +- archivebox/extractors/headers.py | 2 +- archivebox/extractors/media.py | 2 +- archivebox/extractors/singlefile.py | 2 +- archivebox/extractors/title.py | 2 +- archivebox/extractors/wget.py | 2 +- archivebox/util.py | 4 ++-- 8 files changed, 9 insertions(+), 9 deletions(-) diff --git a/archivebox/extractors/archive_org.py b/archivebox/extractors/archive_org.py index 0d45534a..245315f1 100644 --- a/archivebox/extractors/archive_org.py +++ b/archivebox/extractors/archive_org.py @@ -57,7 +57,7 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int= ] cmd = [ CURL_BINARY, - *dedupe(*options), + *dedupe(options), submit_url, ] status = 'succeeded' diff --git a/archivebox/extractors/favicon.py b/archivebox/extractors/favicon.py index fffa3d16..f793f8df 100644 --- a/archivebox/extractors/favicon.py +++ b/archivebox/extractors/favicon.py @@ -50,7 +50,7 @@ def save_favicon(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ] cmd = [ CURL_BINARY, - *dedupe(*options), + *dedupe(options), FAVICON_PROVIDER.format(domain(link.url)), ] status = 'failed' diff --git a/archivebox/extractors/headers.py b/archivebox/extractors/headers.py index 9be14331..975787ad 100644 --- a/archivebox/extractors/headers.py +++ b/archivebox/extractors/headers.py @@ -53,7 +53,7 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) ] cmd = [ CURL_BINARY, - *dedupe(*options), + *dedupe(options), link.url, ] try: diff --git a/archivebox/extractors/media.py b/archivebox/extractors/media.py index a6d4e81f..ad4c9c4b 100644 --- a/archivebox/extractors/media.py +++ b/archivebox/extractors/media.py @@ -50,7 +50,7 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME ] cmd = [ YOUTUBEDL_BINARY, - *dedupe(*options), + *dedupe(options), link.url, ] status = 'succeeded' diff --git a/archivebox/extractors/singlefile.py b/archivebox/extractors/singlefile.py index 5021a6cc..553c9f8d 100644 --- a/archivebox/extractors/singlefile.py +++ b/archivebox/extractors/singlefile.py @@ -57,7 +57,7 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO ] cmd = [ DEPENDENCIES['SINGLEFILE_BINARY']['path'], - *dedupe(*options), + *dedupe(options), link.url, output, ] diff --git a/archivebox/extractors/title.py b/archivebox/extractors/title.py index 4f34ca81..5decc52c 100644 --- a/archivebox/extractors/title.py +++ b/archivebox/extractors/title.py @@ -114,7 +114,7 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) - ] cmd = [ CURL_BINARY, - *dedupe(*options), + *dedupe(options), link.url, ] status = 'succeeded' diff --git a/archivebox/extractors/wget.py b/archivebox/extractors/wget.py index 885e31f5..07471e29 100644 --- a/archivebox/extractors/wget.py +++ b/archivebox/extractors/wget.py @@ -74,7 +74,7 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ] cmd = [ WGET_BINARY, - *dedupe(*options), + *dedupe(options), link.url, ] diff --git a/archivebox/util.py b/archivebox/util.py index 10ceebd4..e1707049 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -287,7 +287,7 @@ def chrome_args(**options) -> List[str]: cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR'])) - return dedupe(*cmd_args) + return dedupe(cmd_args) def chrome_cleanup(): """ @@ -325,7 +325,7 @@ def ansi_to_html(text): @enforce_types -def dedupe(*options: str) -> List[str]: +def dedupe(options: List[str]) -> List[str]: """ Deduplicates the given options. Options that come later clobber earlier conflicting options. From f4deb97f59abffae4faa5f93a5108c9f28cb09f3 Mon Sep 17 00:00:00 2001 From: Ben Muthalaly Date: Tue, 5 Mar 2024 21:15:38 -0600 Subject: [PATCH 6/6] Add `ARGS` and `EXTRA_ARGS` for Mercury extractor --- archivebox/config.py | 4 ++++ archivebox/extractors/mercury.py | 14 ++++++++++---- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/archivebox/config.py b/archivebox/config.py index f8e56036..64b07931 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -199,6 +199,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { 'GIT_ARGS': {'type': list, 'default': ['--recursive']}, 'SINGLEFILE_ARGS': {'type': list, 'default': None}, 'SINGLEFILE_EXTRA_ARGS': {'type': list, 'default': None}, + 'MERCURY_ARGS': {'type': list, 'default': ['--format=text']}, + 'MERCURY_EXTRA_ARGS': {'type': list, 'default': None}, 'FAVICON_PROVIDER': {'type': str, 'default': 'https://www.google.com/s2/favicons?domain={}'}, }, @@ -561,6 +563,8 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = { 'USE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']}, 'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None}, # mercury doesnt expose version info until this is merged https://github.com/postlight/parser/pull/750 + 'MERCURY_ARGS': {'default': lambda c: c['MERCURY_ARGS'] or []}, + 'MERCURY_EXTRA_ARGS': {'default': lambda c: c['MERCURY_EXTRA_ARGS'] or []}, 'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']}, 'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None}, diff --git a/archivebox/extractors/mercury.py b/archivebox/extractors/mercury.py index e7d20362..a0f38434 100644 --- a/archivebox/extractors/mercury.py +++ b/archivebox/extractors/mercury.py @@ -11,13 +11,15 @@ from ..system import run, atomic_write from ..util import ( enforce_types, is_static_file, - + dedupe, ) from ..config import ( TIMEOUT, SAVE_MERCURY, DEPENDENCIES, MERCURY_VERSION, + MERCURY_ARGS, + MERCURY_EXTRA_ARGS, ) from ..logging_util import TimedProgress @@ -60,12 +62,16 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) timer = TimedProgress(timeout, prefix=' ') try: output_folder.mkdir(exist_ok=True) - - # Get plain text version of article + # later options take precedence + options = [ + *MERCURY_ARGS, + *MERCURY_EXTRA_ARGS, + ] + # By default, get plain text version of article cmd = [ DEPENDENCIES['MERCURY_BINARY']['path'], link.url, - "--format=text" + *dedupe(options) ] result = run(cmd, cwd=out_dir, timeout=timeout) try: