diff --git a/archivebox/extractors/archive_org.py b/archivebox/extractors/archive_org.py index 93730f26..0d45534a 100644 --- a/archivebox/extractors/archive_org.py +++ b/archivebox/extractors/archive_org.py @@ -46,14 +46,14 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int= output: ArchiveOutput = 'archive.org.txt' archive_org_url = None submit_url = 'https://web.archive.org/save/{}'.format(link.url) - # earlier options take precedence + # later options take precedence options = [ + *CURL_ARGS, + *CURL_EXTRA_ARGS, '--head', '--max-time', str(timeout), *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []), *([] if CHECK_SSL_VALIDITY else ['--insecure']), - *CURL_EXTRA_ARGS, - *CURL_ARGS, ] cmd = [ CURL_BINARY, diff --git a/archivebox/extractors/favicon.py b/archivebox/extractors/favicon.py index 3b41f349..fffa3d16 100644 --- a/archivebox/extractors/favicon.py +++ b/archivebox/extractors/favicon.py @@ -39,14 +39,14 @@ def save_favicon(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) out_dir = out_dir or link.link_dir output: ArchiveOutput = 'favicon.ico' - # earlier options take precedence + # later options take precedence options = [ + *CURL_ARGS, + *CURL_EXTRA_ARGS, '--max-time', str(timeout), '--output', str(output), *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []), *([] if CHECK_SSL_VALIDITY else ['--insecure']), - *CURL_EXTRA_ARGS, - *CURL_ARGS, ] cmd = [ CURL_BINARY, diff --git a/archivebox/extractors/headers.py b/archivebox/extractors/headers.py index 3828de93..9be14331 100644 --- a/archivebox/extractors/headers.py +++ b/archivebox/extractors/headers.py @@ -42,14 +42,14 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) status = 'succeeded' timer = TimedProgress(timeout, prefix=' ') - # earlier options take precedence + # later options take precedence options = [ + *CURL_ARGS, + *CURL_EXTRA_ARGS, '--head', '--max-time', str(timeout), *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []), *([] if CHECK_SSL_VALIDITY else ['--insecure']), - *CURL_EXTRA_ARGS, - *CURL_ARGS, ] cmd = [ CURL_BINARY, diff --git a/archivebox/extractors/media.py b/archivebox/extractors/media.py index 862bb758..a6d4e81f 100644 --- a/archivebox/extractors/media.py +++ b/archivebox/extractors/media.py @@ -41,11 +41,12 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME output: ArchiveOutput = 'media' output_path = out_dir / output output_path.mkdir(exist_ok=True) + # later options take precedence options = [ + *YOUTUBEDL_ARGS, + *YOUTUBEDL_EXTRA_ARGS, *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']), # TODO: add --cookies-from-browser={CHROME_USER_DATA_DIR} - *YOUTUBEDL_EXTRA_ARGS, - *YOUTUBEDL_ARGS, ] cmd = [ YOUTUBEDL_BINARY, diff --git a/archivebox/extractors/singlefile.py b/archivebox/extractors/singlefile.py index b2119119..5021a6cc 100644 --- a/archivebox/extractors/singlefile.py +++ b/archivebox/extractors/singlefile.py @@ -48,18 +48,12 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO # SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:])) - - # Deduplicate options (single-file doesn't like when you use the same option two times) - # - # NOTE: Options names that come first clobber conflicting names that come later - # My logic is SINGLEFILE_ARGS is the option that affects the singlefile command with most - # specificity, therefore the user sets it with a lot intent, therefore it should take precedence - # kind of like the ergonomic principle of lexical scope in programming languages. + # later options take precedence options = [ - '--browser-executable-path={}'.format(CHROME_BINARY), - browser_args, - *SINGLEFILE_EXTRA_ARGS, *SINGLEFILE_ARGS, + *SINGLEFILE_EXTRA_ARGS, + browser_args, + '--browser-executable-path={}'.format(CHROME_BINARY), ] cmd = [ DEPENDENCIES['SINGLEFILE_BINARY']['path'], diff --git a/archivebox/extractors/title.py b/archivebox/extractors/title.py index b2b65af2..4f34ca81 100644 --- a/archivebox/extractors/title.py +++ b/archivebox/extractors/title.py @@ -104,13 +104,13 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) - from core.models import Snapshot output: ArchiveOutput = None - # earlier options take precedence + # later options take precedence options = [ + *CURL_ARGS, + *CURL_EXTRA_ARGS, '--max-time', str(timeout), *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []), *([] if CHECK_SSL_VALIDITY else ['--insecure']), - *CURL_EXTRA_ARGS, - *CURL_ARGS, ] cmd = [ CURL_BINARY, diff --git a/archivebox/extractors/wget.py b/archivebox/extractors/wget.py index 5209cde9..885e31f5 100644 --- a/archivebox/extractors/wget.py +++ b/archivebox/extractors/wget.py @@ -57,8 +57,10 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html output: ArchiveOutput = None - # earlier options take precedence + # later options take precedence options = [ + *WGET_ARGS, + *WGET_EXTRA_ARGS, '--timeout={}'.format(timeout), *(['--restrict-file-names={}'.format(RESTRICT_FILE_NAMES)] if RESTRICT_FILE_NAMES else []), *(['--warc-file={}'.format(str(warc_path))] if SAVE_WARC else []), @@ -69,8 +71,6 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> *([] if SAVE_WARC else ['--timestamping']), *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']), # '--server-response', # print headers for better error parsing - *WGET_EXTRA_ARGS, - *WGET_ARGS, ] cmd = [ WGET_BINARY, diff --git a/archivebox/util.py b/archivebox/util.py index 18ca08aa..10ceebd4 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -240,6 +240,8 @@ def chrome_args(**options) -> List[str]: cmd_args = [options['CHROME_BINARY']] + cmd_args += CHROME_EXTRA_ARGS + if options['CHROME_HEADLESS']: chrome_major_version = int(re.search(r'\s(\d+)\.\d', CHROME_VERSION)[1]) if chrome_major_version >= 111: @@ -284,7 +286,6 @@ def chrome_args(**options) -> List[str]: if options['CHROME_USER_DATA_DIR']: cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR'])) - cmd_args += CHROME_EXTRA_ARGS return dedupe(*cmd_args) @@ -324,20 +325,17 @@ def ansi_to_html(text): @enforce_types -def dedupe(*options: List[str]) -> List[str]: +def dedupe(*options: str) -> List[str]: """ - Deduplicates the given options. Options that come earlier in the list clobber - later conflicting options. + Deduplicates the given options. Options that come later clobber earlier + conflicting options. """ - seen_option_names = [] - def test_seen(argument): - option_name = argument.split("=")[0] - if option_name in seen_option_names: - return False - else: - seen_option_names.append(option_name) - return True - return list(filter(test_seen, options)) + deduped = {} + + for option in options: + deduped[option.split('=')[0]] = option + + return list(deduped.values()) class AttributeDict(dict):