From 4e69d2c9e14bbbc4597731fdc349f5461a726b54 Mon Sep 17 00:00:00 2001
From: Ben Muthalaly <benmuthalaly@gmail.com>
Date: Wed, 21 Feb 2024 15:13:06 -0600
Subject: [PATCH 1/6] Add `EXTRA_*_ARGS` for wget, curl, and singlefile

---
 archivebox/config.py                 |  8 +++++++-
 archivebox/extractors/archive_org.py | 13 ++++++++++---
 archivebox/extractors/favicon.py     | 18 ++++++++++++++----
 archivebox/extractors/headers.py     | 14 ++++++++++----
 archivebox/extractors/singlefile.py  | 25 +++++++++----------------
 archivebox/extractors/title.py       | 13 ++++++++++---
 archivebox/extractors/wget.py        | 15 +++++++++++----
 archivebox/util.py                   | 17 +++++++++++++++++
 8 files changed, 88 insertions(+), 35 deletions(-)

diff --git a/archivebox/config.py b/archivebox/config.py
index 1edd2eeb..ebb939a4 100644
--- a/archivebox/config.py
+++ b/archivebox/config.py
@@ -187,12 +187,15 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
                                                                 '--no-parent',
                                                                 '-e', 'robots=off',
                                                                 ]},
+        'WGET_EXTRA_ARGS':          {'type': list,  'default': None},
         'CURL_ARGS':                {'type': list,  'default': ['--silent',
                                                                 '--location',
                                                                 '--compressed'
                                                                ]},
+        'CURL_EXTRA_ARGS':          {'type': list,  'default': None},
         'GIT_ARGS':                 {'type': list,  'default': ['--recursive']},
-        'SINGLEFILE_ARGS':          {'type': list,  'default' : None},
+        'SINGLEFILE_ARGS':          {'type': list,  'default': None},
+        'SINGLEFILE_EXTRA_ARGS':    {'type': list,  'default': None},
         'FAVICON_PROVIDER':         {'type': str,   'default': 'https://www.google.com/s2/favicons?domain={}'},
     },
 
@@ -530,6 +533,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'CURL_VERSION':             {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None},
     'CURL_USER_AGENT':          {'default': lambda c: c['CURL_USER_AGENT'].format(**c)},
     'CURL_ARGS':                {'default': lambda c: c['CURL_ARGS'] or []},
+    'CURL_EXTRA_ARGS':          {'default': lambda c: c['CURL_EXTRA_ARGS'] or []},
     'SAVE_FAVICON':             {'default': lambda c: c['USE_CURL'] and c['SAVE_FAVICON']},
     'SAVE_ARCHIVE_DOT_ORG':     {'default': lambda c: c['USE_CURL'] and c['SAVE_ARCHIVE_DOT_ORG']},
 
@@ -540,12 +544,14 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'SAVE_WGET':                {'default': lambda c: c['USE_WGET'] and c['SAVE_WGET']},
     'SAVE_WARC':                {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']},
     'WGET_ARGS':                {'default': lambda c: c['WGET_ARGS'] or []},
+    'WGET_EXTRA_ARGS':          {'default': lambda c: c['WGET_EXTRA_ARGS'] or []},
 
     'RIPGREP_VERSION':          {'default': lambda c: bin_version(c['RIPGREP_BINARY']) if c['USE_RIPGREP'] else None},
 
     'USE_SINGLEFILE':           {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
     'SINGLEFILE_VERSION':       {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},
     'SINGLEFILE_ARGS':          {'default': lambda c: c['SINGLEFILE_ARGS'] or []},
+    'SINGLEFILE_EXTRA_ARGS':    {'default': lambda c: c['SINGLEFILE_EXTRA_ARGS'] or []},
 
     'USE_READABILITY':          {'default': lambda c: c['USE_READABILITY'] and c['SAVE_READABILITY']},
     'READABILITY_VERSION':      {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None},
diff --git a/archivebox/extractors/archive_org.py b/archivebox/extractors/archive_org.py
index a0883113..93730f26 100644
--- a/archivebox/extractors/archive_org.py
+++ b/archivebox/extractors/archive_org.py
@@ -10,10 +10,12 @@ from ..system import run, chmod_file
 from ..util import (
     enforce_types,
     is_static_file,
+    dedupe,
 )
 from ..config import (
     TIMEOUT,
     CURL_ARGS,
+    CURL_EXTRA_ARGS,
     CHECK_SSL_VALIDITY,
     SAVE_ARCHIVE_DOT_ORG,
     CURL_BINARY,
@@ -44,13 +46,18 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=
     output: ArchiveOutput = 'archive.org.txt'
     archive_org_url = None
     submit_url = 'https://web.archive.org/save/{}'.format(link.url)
-    cmd = [
-        CURL_BINARY,
-        *CURL_ARGS,
+    # earlier options take precedence
+    options = [
         '--head',
         '--max-time', str(timeout),
         *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
         *([] if CHECK_SSL_VALIDITY else ['--insecure']),
+        *CURL_EXTRA_ARGS,
+        *CURL_ARGS,
+    ]
+    cmd = [
+        CURL_BINARY,
+        *dedupe(*options),
         submit_url,
     ]
     status = 'succeeded'
diff --git a/archivebox/extractors/favicon.py b/archivebox/extractors/favicon.py
index 5baafc17..3b41f349 100644
--- a/archivebox/extractors/favicon.py
+++ b/archivebox/extractors/favicon.py
@@ -6,13 +6,18 @@ from typing import Optional
 
 from ..index.schema import Link, ArchiveResult, ArchiveOutput
 from ..system import chmod_file, run
-from ..util import enforce_types, domain
+from ..util import (
+    enforce_types,
+     domain,
+     dedupe,
+)
 from ..config import (
     TIMEOUT,
     SAVE_FAVICON,
     FAVICON_PROVIDER,
     CURL_BINARY,
     CURL_ARGS,
+    CURL_EXTRA_ARGS,
     CURL_VERSION,
     CHECK_SSL_VALIDITY,
     CURL_USER_AGENT,
@@ -34,13 +39,18 @@ def save_favicon(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
 
     out_dir = out_dir or link.link_dir
     output: ArchiveOutput = 'favicon.ico'
-    cmd = [
-        CURL_BINARY,
-        *CURL_ARGS,
+    # earlier options take precedence
+    options = [
         '--max-time', str(timeout),
         '--output', str(output),
         *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
         *([] if CHECK_SSL_VALIDITY else ['--insecure']),
+        *CURL_EXTRA_ARGS,
+        *CURL_ARGS,
+    ]
+    cmd = [
+        CURL_BINARY,
+        *dedupe(*options),
         FAVICON_PROVIDER.format(domain(link.url)),
     ]
     status = 'failed'
diff --git a/archivebox/extractors/headers.py b/archivebox/extractors/headers.py
index 91dcb8e3..3828de93 100644
--- a/archivebox/extractors/headers.py
+++ b/archivebox/extractors/headers.py
@@ -9,11 +9,13 @@ from ..system import atomic_write
 from ..util import (
     enforce_types,
     get_headers,
+    dedupe,
 )
 from ..config import (
     TIMEOUT,
     CURL_BINARY,
     CURL_ARGS,
+    CURL_EXTRA_ARGS,
     CURL_USER_AGENT,
     CURL_VERSION,
     CHECK_SSL_VALIDITY,
@@ -40,14 +42,18 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
 
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix='      ')
-
-    cmd = [
-        CURL_BINARY,
-        *CURL_ARGS,
+    # earlier options take precedence
+    options = [
         '--head',
         '--max-time', str(timeout),
         *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
         *([] if CHECK_SSL_VALIDITY else ['--insecure']),
+        *CURL_EXTRA_ARGS,
+        *CURL_ARGS,
+    ]
+    cmd = [
+        CURL_BINARY,
+        *dedupe(*options),
         link.url,
     ]
     try:
diff --git a/archivebox/extractors/singlefile.py b/archivebox/extractors/singlefile.py
index e50b3932..b2119119 100644
--- a/archivebox/extractors/singlefile.py
+++ b/archivebox/extractors/singlefile.py
@@ -11,6 +11,7 @@ from ..util import (
     enforce_types,
     is_static_file,
     chrome_args,
+    dedupe,
 )
 from ..config import (
     TIMEOUT,
@@ -18,6 +19,7 @@ from ..config import (
     DEPENDENCIES,
     SINGLEFILE_VERSION,
     SINGLEFILE_ARGS,
+    SINGLEFILE_EXTRA_ARGS,
     CHROME_BINARY,
 )
 from ..logging_util import TimedProgress
@@ -46,11 +48,6 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
 
     # SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli
     browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:]))
-    options = [
-        *SINGLEFILE_ARGS,
-        '--browser-executable-path={}'.format(CHROME_BINARY),
-        browser_args,
-    ]
 
     # Deduplicate options (single-file doesn't like when you use the same option two times)
     #
@@ -58,19 +55,15 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
     # My logic is SINGLEFILE_ARGS is the option that affects the singlefile command with most 
     # specificity, therefore the user sets it with a lot intent, therefore it should take precedence 
     # kind of like the ergonomic principle of lexical scope in programming languages.
-    seen_option_names = []
-    def test_seen(argument):
-        option_name = argument.split("=")[0]
-        if option_name in seen_option_names:
-            return False
-        else:
-            seen_option_names.append(option_name)
-            return True
-    deduped_options = list(filter(test_seen, options))
-
+    options = [
+        '--browser-executable-path={}'.format(CHROME_BINARY),
+        browser_args,
+        *SINGLEFILE_EXTRA_ARGS,
+        *SINGLEFILE_ARGS,
+    ]
     cmd = [
         DEPENDENCIES['SINGLEFILE_BINARY']['path'],
-        *deduped_options,
+        *dedupe(*options),
         link.url,
         output,
     ]
diff --git a/archivebox/extractors/title.py b/archivebox/extractors/title.py
index 6b0e37f6..b2b65af2 100644
--- a/archivebox/extractors/title.py
+++ b/archivebox/extractors/title.py
@@ -10,6 +10,7 @@ from ..util import (
     enforce_types,
     download_url,
     htmldecode,
+    dedupe,
 )
 from ..config import (
     TIMEOUT,
@@ -17,6 +18,7 @@ from ..config import (
     SAVE_TITLE,
     CURL_BINARY,
     CURL_ARGS,
+    CURL_EXTRA_ARGS,
     CURL_VERSION,
     CURL_USER_AGENT,
 )
@@ -102,12 +104,17 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -
     from core.models import Snapshot
 
     output: ArchiveOutput = None
-    cmd = [
-        CURL_BINARY,
-        *CURL_ARGS,
+    # earlier options take precedence
+    options = [
         '--max-time', str(timeout),
         *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
         *([] if CHECK_SSL_VALIDITY else ['--insecure']),
+        *CURL_EXTRA_ARGS,
+        *CURL_ARGS,
+    ]
+    cmd = [
+        CURL_BINARY,
+        *dedupe(*options),
         link.url,
     ]
     status = 'succeeded'
diff --git a/archivebox/extractors/wget.py b/archivebox/extractors/wget.py
index f3057271..d50409b6 100644
--- a/archivebox/extractors/wget.py
+++ b/archivebox/extractors/wget.py
@@ -15,9 +15,11 @@ from ..util import (
     path,
     domain,
     urldecode,
+    dedupe,
 )
 from ..config import (
     WGET_ARGS,
+    WGET_EXTRA_ARGS,
     TIMEOUT,
     SAVE_WGET,
     SAVE_WARC,
@@ -55,10 +57,8 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
 
     # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
     output: ArchiveOutput = None
-    cmd = [
-        WGET_BINARY,
-        # '--server-response',  # print headers for better error parsing
-        *WGET_ARGS,
+    # earlier options take precedence
+    options = [
         '--timeout={}'.format(timeout),
         *(['--restrict-file-names={}'.format(RESTRICT_FILE_NAMES)] if RESTRICT_FILE_NAMES else []),
         *(['--warc-file={}'.format(str(warc_path))] if SAVE_WARC else []),
@@ -68,6 +68,13 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
         *(['--compression=auto'] if WGET_AUTO_COMPRESSION else []),
         *([] if SAVE_WARC else ['--timestamping']),
         *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
+        # '--server-response',  # print headers for better error parsing
+        *WGET_EXTRA_ARGS, 
+        *WGET_ARGS,
+    ]
+    cmd = [
+        WGET_BINARY,
+        *dedupe(*options),
         link.url,
     ]
 
diff --git a/archivebox/util.py b/archivebox/util.py
index 5321081c..6b31c86e 100644
--- a/archivebox/util.py
+++ b/archivebox/util.py
@@ -317,6 +317,23 @@ def ansi_to_html(text):
     return COLOR_REGEX.sub(single_sub, text)
 
 
+@enforce_types
+def dedupe(*options: List[str]) -> List[str]:
+    """
+    Deduplicates the given options. Options that come earlier in the list clobber
+    later conflicting options.
+    """
+    seen_option_names = []
+    def test_seen(argument):
+        option_name = argument.split("=")[0]
+        if option_name in seen_option_names:
+            return False
+        else:
+            seen_option_names.append(option_name)
+            return True
+    return list(filter(test_seen, options))
+
+
 class AttributeDict(dict):
     """Helper to allow accessing dict values via Example.key or Example['key']"""
 

From ab8f395e0a4104dd01385be3d8fcea082a6987ee Mon Sep 17 00:00:00 2001
From: Ben Muthalaly <benmuthalaly@gmail.com>
Date: Fri, 23 Feb 2024 15:40:31 -0600
Subject: [PATCH 2/6] Add `YOUTUBEDL_EXTRA_ARGS`

---
 archivebox/config.py           |  1 +
 archivebox/extractors/media.py | 12 +++++++++---
 archivebox/extractors/wget.py  |  2 +-
 3 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/archivebox/config.py b/archivebox/config.py
index ebb939a4..00e3b9f0 100644
--- a/archivebox/config.py
+++ b/archivebox/config.py
@@ -176,6 +176,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
                                                                 '--add-metadata',
                                                                 '--format=(bv*+ba/b)[filesize<={}][filesize_approx<=?{}]/(bv*+ba/b)'.format(c['MEDIA_MAX_SIZE'], c['MEDIA_MAX_SIZE']),
                                                                 ]},
+        'YOUTUBEDL_EXTRA_ARGS':     {'type': list,  'default': None},
 
 
         'WGET_ARGS':                {'type': list,  'default': ['--no-verbose',
diff --git a/archivebox/extractors/media.py b/archivebox/extractors/media.py
index 7d73024f..862bb758 100644
--- a/archivebox/extractors/media.py
+++ b/archivebox/extractors/media.py
@@ -8,11 +8,13 @@ from ..system import run, chmod_file
 from ..util import (
     enforce_types,
     is_static_file,
+    dedupe,
 )
 from ..config import (
     MEDIA_TIMEOUT,
     SAVE_MEDIA,
     YOUTUBEDL_ARGS,
+    YOUTUBEDL_EXTRA_ARGS,
     YOUTUBEDL_BINARY,
     YOUTUBEDL_VERSION,
     CHECK_SSL_VALIDITY
@@ -39,11 +41,15 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME
     output: ArchiveOutput = 'media'
     output_path = out_dir / output
     output_path.mkdir(exist_ok=True)
-    cmd = [
-        YOUTUBEDL_BINARY,
-        *YOUTUBEDL_ARGS,
+    options = [
         *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']),
         # TODO: add --cookies-from-browser={CHROME_USER_DATA_DIR}
+        *YOUTUBEDL_EXTRA_ARGS,
+        *YOUTUBEDL_ARGS,
+    ]
+    cmd = [
+        YOUTUBEDL_BINARY,
+        *dedupe(*options),
         link.url,
     ]
     status = 'succeeded'
diff --git a/archivebox/extractors/wget.py b/archivebox/extractors/wget.py
index d50409b6..5209cde9 100644
--- a/archivebox/extractors/wget.py
+++ b/archivebox/extractors/wget.py
@@ -69,7 +69,7 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
         *([] if SAVE_WARC else ['--timestamping']),
         *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
         # '--server-response',  # print headers for better error parsing
-        *WGET_EXTRA_ARGS, 
+        *WGET_EXTRA_ARGS,
         *WGET_ARGS,
     ]
     cmd = [

From 4d9c5a7b4b0bc0f490b6d8928878853fad363d16 Mon Sep 17 00:00:00 2001
From: Ben Muthalaly <benmuthalaly@gmail.com>
Date: Fri, 23 Feb 2024 18:40:03 -0600
Subject: [PATCH 3/6] Add `CHROME_EXTRA_ARGS`

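Also fix `YOUTUBEDL_EXTRA_ARGS` by adding its missing `DYNAMIC_CONFIG_SCHEMA`
entry, so an unset value falls back to `[]` instead of `None`.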
---
 archivebox/config.py |  4 ++++
 archivebox/util.py   | 12 +++++++++---
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/archivebox/config.py b/archivebox/config.py
index 00e3b9f0..f8e56036 100644
--- a/archivebox/config.py
+++ b/archivebox/config.py
@@ -152,6 +152,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         'CHROME_TIMEOUT':           {'type': int,   'default': 0},
         'CHROME_HEADLESS':          {'type': bool,  'default': True},
         'CHROME_SANDBOX':           {'type': bool,  'default': lambda c: not c['IN_DOCKER']},
+        'CHROME_EXTRA_ARGS':        {'type': list,  'default': None},
+
         'YOUTUBEDL_ARGS':           {'type': list,  'default': lambda c: [
                                                                 '--restrict-filenames',
                                                                 '--trim-filenames', '128',
@@ -568,6 +570,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'YOUTUBEDL_VERSION':        {'default': lambda c: bin_version(c['YOUTUBEDL_BINARY']) if c['USE_YOUTUBEDL'] else None},
     'SAVE_MEDIA':               {'default': lambda c: c['USE_YOUTUBEDL'] and c['SAVE_MEDIA']},
     'YOUTUBEDL_ARGS':           {'default': lambda c: c['YOUTUBEDL_ARGS'] or []},
+    'YOUTUBEDL_EXTRA_ARGS':     {'default': lambda c: c['YOUTUBEDL_EXTRA_ARGS'] or []},
 
     'CHROME_BINARY':            {'default': lambda c: c['CHROME_BINARY'] or find_chrome_binary()},
     'USE_CHROME':               {'default': lambda c: c['USE_CHROME'] and c['CHROME_BINARY'] and (c['SAVE_PDF'] or c['SAVE_SCREENSHOT'] or c['SAVE_DOM'] or c['SAVE_SINGLEFILE'])},
@@ -589,6 +592,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'EXTERNAL_LOCATIONS':       {'default': lambda c: get_external_locations(c)},
     'DATA_LOCATIONS':           {'default': lambda c: get_data_locations(c)},
     'CHROME_OPTIONS':           {'default': lambda c: get_chrome_info(c)},
+    'CHROME_EXTRA_ARGS':        {'default': lambda c: c['CHROME_EXTRA_ARGS'] or []},
     'SAVE_ALLOWLIST_PTN':       {'default': lambda c: c['SAVE_ALLOWLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_ALLOWLIST'].items()}},
     'SAVE_DENYLIST_PTN':        {'default': lambda c: c['SAVE_DENYLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_DENYLIST'].items()}},
 }
diff --git a/archivebox/util.py b/archivebox/util.py
index 6b31c86e..18ca08aa 100644
--- a/archivebox/util.py
+++ b/archivebox/util.py
@@ -227,7 +227,11 @@ def chrome_args(**options) -> List[str]:
 
     # Chrome CLI flag documentation: https://peter.sh/experiments/chromium-command-line-switches/
 
-    from .config import CHROME_OPTIONS, CHROME_VERSION
+    from .config import (
+        CHROME_OPTIONS,
+        CHROME_VERSION,
+        CHROME_EXTRA_ARGS,
+    )
 
     options = {**CHROME_OPTIONS, **options}
 
@@ -279,8 +283,10 @@ def chrome_args(**options) -> List[str]:
 
     if options['CHROME_USER_DATA_DIR']:
         cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR']))
-    
-    return cmd_args
+
+    cmd_args += CHROME_EXTRA_ARGS
+
+    return dedupe(*cmd_args)
 
 def chrome_cleanup():
     """

From d74ddd42ae104004e656929036c55f972a9d63d4 Mon Sep 17 00:00:00 2001
From: Ben Muthalaly <benmuthalaly@gmail.com>
Date: Fri, 1 Mar 2024 14:50:32 -0600
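Subject: [PATCH 4/6] Flip dedupe precedence order

Option lists now put the configurable `*_ARGS` and `*_EXTRA_ARGS` first and
the extractor-supplied flags last, and `dedupe` keeps the last value seen for
each option name (the text before `=`) instead of the first.

Note: `dedupe` keys every list element on its own, so a value passed as a
separate token (e.g. `--max-time`, `10`) is not paired with its flag.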
---
 archivebox/extractors/archive_org.py |  6 +++---
 archivebox/extractors/favicon.py     |  6 +++---
 archivebox/extractors/headers.py     |  6 +++---
 archivebox/extractors/media.py       |  5 +++--
 archivebox/extractors/singlefile.py  | 14 ++++----------
 archivebox/extractors/title.py       |  6 +++---
 archivebox/extractors/wget.py        |  6 +++---
 archivebox/util.py                   | 24 +++++++++++-------------
 8 files changed, 33 insertions(+), 40 deletions(-)

diff --git a/archivebox/extractors/archive_org.py b/archivebox/extractors/archive_org.py
index 93730f26..0d45534a 100644
--- a/archivebox/extractors/archive_org.py
+++ b/archivebox/extractors/archive_org.py
@@ -46,14 +46,14 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=
     output: ArchiveOutput = 'archive.org.txt'
     archive_org_url = None
     submit_url = 'https://web.archive.org/save/{}'.format(link.url)
-    # earlier options take precedence
+    # later options take precedence
     options = [
+        *CURL_ARGS,
+        *CURL_EXTRA_ARGS,
         '--head',
         '--max-time', str(timeout),
         *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
         *([] if CHECK_SSL_VALIDITY else ['--insecure']),
-        *CURL_EXTRA_ARGS,
-        *CURL_ARGS,
     ]
     cmd = [
         CURL_BINARY,
diff --git a/archivebox/extractors/favicon.py b/archivebox/extractors/favicon.py
index 3b41f349..fffa3d16 100644
--- a/archivebox/extractors/favicon.py
+++ b/archivebox/extractors/favicon.py
@@ -39,14 +39,14 @@ def save_favicon(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
 
     out_dir = out_dir or link.link_dir
     output: ArchiveOutput = 'favicon.ico'
-    # earlier options take precedence
+    # later options take precedence
     options = [
+        *CURL_ARGS,
+        *CURL_EXTRA_ARGS,
         '--max-time', str(timeout),
         '--output', str(output),
         *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
         *([] if CHECK_SSL_VALIDITY else ['--insecure']),
-        *CURL_EXTRA_ARGS,
-        *CURL_ARGS,
     ]
     cmd = [
         CURL_BINARY,
diff --git a/archivebox/extractors/headers.py b/archivebox/extractors/headers.py
index 3828de93..9be14331 100644
--- a/archivebox/extractors/headers.py
+++ b/archivebox/extractors/headers.py
@@ -42,14 +42,14 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
 
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix='      ')
-    # earlier options take precedence
+    # later options take precedence
     options = [
+        *CURL_ARGS,
+        *CURL_EXTRA_ARGS,
         '--head',
         '--max-time', str(timeout),
         *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
         *([] if CHECK_SSL_VALIDITY else ['--insecure']),
-        *CURL_EXTRA_ARGS,
-        *CURL_ARGS,
     ]
     cmd = [
         CURL_BINARY,
diff --git a/archivebox/extractors/media.py b/archivebox/extractors/media.py
index 862bb758..a6d4e81f 100644
--- a/archivebox/extractors/media.py
+++ b/archivebox/extractors/media.py
@@ -41,11 +41,12 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME
     output: ArchiveOutput = 'media'
     output_path = out_dir / output
     output_path.mkdir(exist_ok=True)
+    # later options take precedence
     options = [
+        *YOUTUBEDL_ARGS,
+        *YOUTUBEDL_EXTRA_ARGS,
         *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']),
         # TODO: add --cookies-from-browser={CHROME_USER_DATA_DIR}
-        *YOUTUBEDL_EXTRA_ARGS,
-        *YOUTUBEDL_ARGS,
     ]
     cmd = [
         YOUTUBEDL_BINARY,
diff --git a/archivebox/extractors/singlefile.py b/archivebox/extractors/singlefile.py
index b2119119..5021a6cc 100644
--- a/archivebox/extractors/singlefile.py
+++ b/archivebox/extractors/singlefile.py
@@ -48,18 +48,12 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
 
     # SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli
     browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:]))
-
-    # Deduplicate options (single-file doesn't like when you use the same option two times)
-    #
-    # NOTE: Options names that come first clobber conflicting names that come later
-    # My logic is SINGLEFILE_ARGS is the option that affects the singlefile command with most 
-    # specificity, therefore the user sets it with a lot intent, therefore it should take precedence 
-    # kind of like the ergonomic principle of lexical scope in programming languages.
+    # later options take precedence
     options = [
-        '--browser-executable-path={}'.format(CHROME_BINARY),
-        browser_args,
-        *SINGLEFILE_EXTRA_ARGS,
         *SINGLEFILE_ARGS,
+        *SINGLEFILE_EXTRA_ARGS,
+        browser_args,
+        '--browser-executable-path={}'.format(CHROME_BINARY),
     ]
     cmd = [
         DEPENDENCIES['SINGLEFILE_BINARY']['path'],
diff --git a/archivebox/extractors/title.py b/archivebox/extractors/title.py
index b2b65af2..4f34ca81 100644
--- a/archivebox/extractors/title.py
+++ b/archivebox/extractors/title.py
@@ -104,13 +104,13 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -
     from core.models import Snapshot
 
     output: ArchiveOutput = None
-    # earlier options take precedence
+    # later options take precedence
     options = [
+        *CURL_ARGS,
+        *CURL_EXTRA_ARGS,
         '--max-time', str(timeout),
         *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
         *([] if CHECK_SSL_VALIDITY else ['--insecure']),
-        *CURL_EXTRA_ARGS,
-        *CURL_ARGS,
     ]
     cmd = [
         CURL_BINARY,
diff --git a/archivebox/extractors/wget.py b/archivebox/extractors/wget.py
index 5209cde9..885e31f5 100644
--- a/archivebox/extractors/wget.py
+++ b/archivebox/extractors/wget.py
@@ -57,8 +57,10 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
 
     # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
     output: ArchiveOutput = None
-    # earlier options take precedence
+    # later options take precedence
     options = [
+        *WGET_ARGS,
+        *WGET_EXTRA_ARGS,
         '--timeout={}'.format(timeout),
         *(['--restrict-file-names={}'.format(RESTRICT_FILE_NAMES)] if RESTRICT_FILE_NAMES else []),
         *(['--warc-file={}'.format(str(warc_path))] if SAVE_WARC else []),
@@ -69,8 +71,6 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
         *([] if SAVE_WARC else ['--timestamping']),
         *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
         # '--server-response',  # print headers for better error parsing
-        *WGET_EXTRA_ARGS,
-        *WGET_ARGS,
     ]
     cmd = [
         WGET_BINARY,
diff --git a/archivebox/util.py b/archivebox/util.py
index 18ca08aa..10ceebd4 100644
--- a/archivebox/util.py
+++ b/archivebox/util.py
@@ -240,6 +240,8 @@ def chrome_args(**options) -> List[str]:
 
     cmd_args = [options['CHROME_BINARY']]
 
+    cmd_args += CHROME_EXTRA_ARGS
+
     if options['CHROME_HEADLESS']:
         chrome_major_version = int(re.search(r'\s(\d+)\.\d', CHROME_VERSION)[1])
         if chrome_major_version >= 111:
@@ -284,7 +286,6 @@ def chrome_args(**options) -> List[str]:
     if options['CHROME_USER_DATA_DIR']:
         cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR']))
 
-    cmd_args += CHROME_EXTRA_ARGS
 
     return dedupe(*cmd_args)
 
@@ -324,20 +325,17 @@ def ansi_to_html(text):
 
 
 @enforce_types
-def dedupe(*options: List[str]) -> List[str]:
+def dedupe(*options: str) -> List[str]:
     """
-    Deduplicates the given options. Options that come earlier in the list clobber
-    later conflicting options.
+    Deduplicates the given options. Options that come later clobber earlier
+    conflicting options.
     """
-    seen_option_names = []
-    def test_seen(argument):
-        option_name = argument.split("=")[0]
-        if option_name in seen_option_names:
-            return False
-        else:
-            seen_option_names.append(option_name)
-            return True
-    return list(filter(test_seen, options))
+    deduped = {}
+
+    for option in options:
+        deduped[option.split('=')[0]] = option
+
+    return list(deduped.values())
 
 
 class AttributeDict(dict):

From d8cf09c21e2d6e3ece8a7e5c93d537596c3687d0 Mon Sep 17 00:00:00 2001
From: Ben Muthalaly <benmuthalaly@gmail.com>
Date: Tue, 5 Mar 2024 21:13:45 -0600
Subject: [PATCH 5/6] Remove unnecessary variable-length args for dedupe

---
 archivebox/extractors/archive_org.py | 2 +-
 archivebox/extractors/favicon.py     | 2 +-
 archivebox/extractors/headers.py     | 2 +-
 archivebox/extractors/media.py       | 2 +-
 archivebox/extractors/singlefile.py  | 2 +-
 archivebox/extractors/title.py       | 2 +-
 archivebox/extractors/wget.py        | 2 +-
 archivebox/util.py                   | 4 ++--
 8 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/archivebox/extractors/archive_org.py b/archivebox/extractors/archive_org.py
index 0d45534a..245315f1 100644
--- a/archivebox/extractors/archive_org.py
+++ b/archivebox/extractors/archive_org.py
@@ -57,7 +57,7 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=
     ]
     cmd = [
         CURL_BINARY,
-        *dedupe(*options),
+        *dedupe(options),
         submit_url,
     ]
     status = 'succeeded'
diff --git a/archivebox/extractors/favicon.py b/archivebox/extractors/favicon.py
index fffa3d16..f793f8df 100644
--- a/archivebox/extractors/favicon.py
+++ b/archivebox/extractors/favicon.py
@@ -50,7 +50,7 @@ def save_favicon(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
     ]
     cmd = [
         CURL_BINARY,
-        *dedupe(*options),
+        *dedupe(options),
         FAVICON_PROVIDER.format(domain(link.url)),
     ]
     status = 'failed'
diff --git a/archivebox/extractors/headers.py b/archivebox/extractors/headers.py
index 9be14331..975787ad 100644
--- a/archivebox/extractors/headers.py
+++ b/archivebox/extractors/headers.py
@@ -53,7 +53,7 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
     ]
     cmd = [
         CURL_BINARY,
-        *dedupe(*options),
+        *dedupe(options),
         link.url,
     ]
     try:
diff --git a/archivebox/extractors/media.py b/archivebox/extractors/media.py
index a6d4e81f..ad4c9c4b 100644
--- a/archivebox/extractors/media.py
+++ b/archivebox/extractors/media.py
@@ -50,7 +50,7 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME
     ]
     cmd = [
         YOUTUBEDL_BINARY,
-        *dedupe(*options),
+        *dedupe(options),
         link.url,
     ]
     status = 'succeeded'
diff --git a/archivebox/extractors/singlefile.py b/archivebox/extractors/singlefile.py
index 5021a6cc..553c9f8d 100644
--- a/archivebox/extractors/singlefile.py
+++ b/archivebox/extractors/singlefile.py
@@ -57,7 +57,7 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
     ]
     cmd = [
         DEPENDENCIES['SINGLEFILE_BINARY']['path'],
-        *dedupe(*options),
+        *dedupe(options),
         link.url,
         output,
     ]
diff --git a/archivebox/extractors/title.py b/archivebox/extractors/title.py
index 4f34ca81..5decc52c 100644
--- a/archivebox/extractors/title.py
+++ b/archivebox/extractors/title.py
@@ -114,7 +114,7 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -
     ]
     cmd = [
         CURL_BINARY,
-        *dedupe(*options),
+        *dedupe(options),
         link.url,
     ]
     status = 'succeeded'
diff --git a/archivebox/extractors/wget.py b/archivebox/extractors/wget.py
index 885e31f5..07471e29 100644
--- a/archivebox/extractors/wget.py
+++ b/archivebox/extractors/wget.py
@@ -74,7 +74,7 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
     ]
     cmd = [
         WGET_BINARY,
-        *dedupe(*options),
+        *dedupe(options),
         link.url,
     ]
 
diff --git a/archivebox/util.py b/archivebox/util.py
index 10ceebd4..e1707049 100644
--- a/archivebox/util.py
+++ b/archivebox/util.py
@@ -287,7 +287,7 @@ def chrome_args(**options) -> List[str]:
         cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR']))
 
 
-    return dedupe(*cmd_args)
+    return dedupe(cmd_args)
 
 def chrome_cleanup():
     """
@@ -325,7 +325,7 @@ def ansi_to_html(text):
 
 
 @enforce_types
-def dedupe(*options: str) -> List[str]:
+def dedupe(options: List[str]) -> List[str]:
     """
     Deduplicates the given options. Options that come later clobber earlier
     conflicting options.

From f4deb97f59abffae4faa5f93a5108c9f28cb09f3 Mon Sep 17 00:00:00 2001
From: Ben Muthalaly <benmuthalaly@gmail.com>
Date: Tue, 5 Mar 2024 21:15:38 -0600
Subject: [PATCH 6/6] Add `ARGS` and `EXTRA_ARGS` for Mercury extractor

---
 archivebox/config.py             |  4 ++++
 archivebox/extractors/mercury.py | 14 ++++++++++----
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/archivebox/config.py b/archivebox/config.py
index f8e56036..64b07931 100644
--- a/archivebox/config.py
+++ b/archivebox/config.py
@@ -199,6 +199,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         'GIT_ARGS':                 {'type': list,  'default': ['--recursive']},
         'SINGLEFILE_ARGS':          {'type': list,  'default': None},
         'SINGLEFILE_EXTRA_ARGS':    {'type': list,  'default': None},
+        'MERCURY_ARGS':             {'type': list,  'default': ['--format=text']},
+        'MERCURY_EXTRA_ARGS':       {'type': list,  'default': None},
         'FAVICON_PROVIDER':         {'type': str,   'default': 'https://www.google.com/s2/favicons?domain={}'},
     },
 
@@ -561,6 +563,8 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
 
     'USE_MERCURY':              {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']},
     'MERCURY_VERSION':          {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None},  # mercury doesnt expose version info until this is merged https://github.com/postlight/parser/pull/750
+    'MERCURY_ARGS':             {'default': lambda c: c['MERCURY_ARGS'] or []},
+    'MERCURY_EXTRA_ARGS':       {'default': lambda c: c['MERCURY_EXTRA_ARGS'] or []},
 
     'USE_GIT':                  {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
     'GIT_VERSION':              {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
diff --git a/archivebox/extractors/mercury.py b/archivebox/extractors/mercury.py
index e7d20362..a0f38434 100644
--- a/archivebox/extractors/mercury.py
+++ b/archivebox/extractors/mercury.py
@@ -11,13 +11,15 @@ from ..system import run, atomic_write
 from ..util import (
     enforce_types,
     is_static_file,
-
+    dedupe,
 )
 from ..config import (
     TIMEOUT,
     SAVE_MERCURY,
     DEPENDENCIES,
     MERCURY_VERSION,
+    MERCURY_ARGS,
+    MERCURY_EXTRA_ARGS,
 )
 from ..logging_util import TimedProgress
 
@@ -60,12 +62,16 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
     timer = TimedProgress(timeout, prefix='      ')
     try:
         output_folder.mkdir(exist_ok=True)
-
-        # Get plain text version of article
+        # later options take precedence
+        options = [
+            *MERCURY_ARGS,
+            *MERCURY_EXTRA_ARGS,
+        ]
+        # By default, get plain text version of article
         cmd = [
             DEPENDENCIES['MERCURY_BINARY']['path'],
             link.url,
-            "--format=text"
+            *dedupe(options)
         ]
         result = run(cmd, cwd=out_dir, timeout=timeout)
         try: