From 8899fe0b9259748da2ef19d37028c317f39f37d3 Mon Sep 17 00:00:00 2001 From: renaisun <43090234+renaisun@users.noreply.github.com> Date: Thu, 9 Jun 2022 14:35:48 +0800 Subject: [PATCH 1/3] Add SINGLEFILE_ARGS to control single-file arguments --- archivebox/config.py | 2 ++ archivebox/extractors/singlefile.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/archivebox/config.py b/archivebox/config.py index cfe41b53..bd3a1688 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -177,6 +177,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { '--compressed' ]}, 'GIT_ARGS': {'type': list, 'default': ['--recursive']}, + 'SINGLEFILE_ARGS': {'type': list, 'default' : None} }, 'SEARCH_BACKEND_CONFIG' : { @@ -391,6 +392,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = { 'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']}, 'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None}, + 'SINGLEFILE_ARGS': {'default': lambda c: c['SINGLEFILE_ARGS'] or []}, 'USE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['SAVE_READABILITY']}, 'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None}, diff --git a/archivebox/extractors/singlefile.py b/archivebox/extractors/singlefile.py index 3279960e..80ad90b1 100644 --- a/archivebox/extractors/singlefile.py +++ b/archivebox/extractors/singlefile.py @@ -17,6 +17,7 @@ from ..config import ( SAVE_SINGLEFILE, DEPENDENCIES, SINGLEFILE_VERSION, + SINGLEFILE_ARGS, CHROME_BINARY, ) from ..logging_util import TimedProgress @@ -47,6 +48,7 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:])) cmd = [ DEPENDENCIES['SINGLEFILE_BINARY']['path'], + *SINGLEFILE_ARGS, '--browser-executable-path={}'.format(CHROME_BINARY), browser_args, link.url, From 
40659b5e9d345309515873f61e07c213f6b21ac8 Mon Sep 17 00:00:00 2001 From: notevenaperson <66701832+notevenaperson@users.noreply.github.com> Date: Sun, 11 Sep 2022 17:23:15 +0000 Subject: [PATCH 2/3] singlefile.py: Code to ensure options are deduplicated --- archivebox/extractors/singlefile.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/archivebox/extractors/singlefile.py b/archivebox/extractors/singlefile.py index 80ad90b1..f29d59c3 100644 --- a/archivebox/extractors/singlefile.py +++ b/archivebox/extractors/singlefile.py @@ -46,11 +46,31 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO # SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:])) - cmd = [ - DEPENDENCIES['SINGLEFILE_BINARY']['path'], + options = [ *SINGLEFILE_ARGS, '--browser-executable-path={}'.format(CHROME_BINARY), browser_args, + ] + + # Deduplicate options (single-file doesn't like when you use the same option two times) + # + # NOTE: Option names that come first clobber conflicting names that come later + # My logic is SINGLEFILE_ARGS is the option that affects the singlefile command with the most + # specificity, therefore the user sets it with a lot of intent, therefore it should take precedence + # kind of like the ergonomic principle of lexical scope in programming languages. 
+ seen_option_names = [] + def test_seen(argument): + option_name = argument.split("=")[0] + if option_name in seen_option_names: + return False + else: + seen_option_names.append(option_name) + return True + deduped_options = list(filter(test_seen, options)) + + cmd = [ + DEPENDENCIES['SINGLEFILE_BINARY']['path'], + *deduped_options link.url, output, ] From 0ea955b3edbb9d8fdf60f40b448b7653ff20ada3 Mon Sep 17 00:00:00 2001 From: renaisun <43090234+renaisun@users.noreply.github.com> Date: Mon, 12 Sep 2022 08:54:15 +0800 Subject: [PATCH 3/3] add a missing comma --- archivebox/extractors/singlefile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivebox/extractors/singlefile.py b/archivebox/extractors/singlefile.py index f29d59c3..f7b1b686 100644 --- a/archivebox/extractors/singlefile.py +++ b/archivebox/extractors/singlefile.py @@ -70,7 +70,7 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO cmd = [ DEPENDENCIES['SINGLEFILE_BINARY']['path'], - *deduped_options + *deduped_options, link.url, output, ]