diff --git a/archivebox/config.py b/archivebox/config.py index 0da43fb0..44512562 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -177,6 +177,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { '--compressed' ]}, 'GIT_ARGS': {'type': list, 'default': ['--recursive']}, + 'SINGLEFILE_ARGS': {'type': list, 'default' : None} }, 'SEARCH_BACKEND_CONFIG' : { @@ -402,6 +403,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = { 'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']}, 'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None}, + 'SINGLEFILE_ARGS': {'default': lambda c: c['SINGLEFILE_ARGS'] or []}, 'USE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['SAVE_READABILITY']}, 'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None}, diff --git a/archivebox/extractors/singlefile.py b/archivebox/extractors/singlefile.py index 3279960e..f7b1b686 100644 --- a/archivebox/extractors/singlefile.py +++ b/archivebox/extractors/singlefile.py @@ -17,6 +17,7 @@ from ..config import ( SAVE_SINGLEFILE, DEPENDENCIES, SINGLEFILE_VERSION, + SINGLEFILE_ARGS, CHROME_BINARY, ) from ..logging_util import TimedProgress @@ -45,10 +46,31 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO # SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:])) - cmd = [ - DEPENDENCIES['SINGLEFILE_BINARY']['path'], + options = [ + *SINGLEFILE_ARGS, '--browser-executable-path={}'.format(CHROME_BINARY), browser_args, + ] + + # Deduplicate options (single-file doesn't like when you use the same option two times) + # + # NOTE: Options names that come first clobber conflicting names that come later + # My logic is SINGLEFILE_ARGS is the option that affects the singlefile command with most + # specificity, 
# therefore the user sets it with a lot of intent, therefore it should take precedence
# kind of like the ergonomic principle of lexical scope in programming languages.
#
# Only tokens that start with "-" are treated as option names. Any other token
# is the value of the option before it (`--option value` form) and is kept or
# dropped together with that option, so two different options may share a
# value such as "10" without the second value being mistaken for a duplicate.
def dedupe_options(arguments):
    """Return `arguments` without options whose name was already given.

    Args:
        arguments: Command-line tokens (strings) in priority order, e.g.
            ['--foo=1', '--bar', '2', '--foo=3'].

    Returns:
        A new list in the original order. The first occurrence of each option
        name (the text before any "=") is kept; later occurrences are removed
        together with the value tokens that directly follow them.

    Note:
        A value that itself begins with "-" (e.g. a negative number) is
        indistinguishable from an option here and is treated as one.
    """
    seen_option_names = set()
    kept = []
    keep_current = True  # tokens that precede the first option are always kept
    for argument in arguments:
        if argument.startswith('-'):
            option_name = argument.partition('=')[0]
            keep_current = option_name not in seen_option_names
            seen_option_names.add(option_name)
        # Non-option tokens inherit the decision made for their option.
        if keep_current:
            kept.append(argument)
    return kept

deduped_options = dedupe_options(options)

cmd = [
    DEPENDENCIES['SINGLEFILE_BINARY']['path'],
    *deduped_options,
    link.url,
    output,
]