From f4deb97f59abffae4faa5f93a5108c9f28cb09f3 Mon Sep 17 00:00:00 2001 From: Ben Muthalaly Date: Tue, 5 Mar 2024 21:15:38 -0600 Subject: [PATCH] Add `ARGS` and `EXTRA_ARGS` for Mercury extractor --- archivebox/config.py | 4 ++++ archivebox/extractors/mercury.py | 14 ++++++++++---- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/archivebox/config.py b/archivebox/config.py index f8e56036..64b07931 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -199,6 +199,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { 'GIT_ARGS': {'type': list, 'default': ['--recursive']}, 'SINGLEFILE_ARGS': {'type': list, 'default': None}, 'SINGLEFILE_EXTRA_ARGS': {'type': list, 'default': None}, + 'MERCURY_ARGS': {'type': list, 'default': ['--format=text']}, + 'MERCURY_EXTRA_ARGS': {'type': list, 'default': None}, 'FAVICON_PROVIDER': {'type': str, 'default': 'https://www.google.com/s2/favicons?domain={}'}, }, @@ -561,6 +563,8 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = { 'USE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']}, 'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None}, # mercury doesnt expose version info until this is merged https://github.com/postlight/parser/pull/750 + 'MERCURY_ARGS': {'default': lambda c: c['MERCURY_ARGS'] or []}, + 'MERCURY_EXTRA_ARGS': {'default': lambda c: c['MERCURY_EXTRA_ARGS'] or []}, 'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']}, 'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None}, diff --git a/archivebox/extractors/mercury.py b/archivebox/extractors/mercury.py index e7d20362..a0f38434 100644 --- a/archivebox/extractors/mercury.py +++ b/archivebox/extractors/mercury.py @@ -11,13 +11,15 @@ from ..system import run, atomic_write from ..util import ( enforce_types, is_static_file, - + dedupe, ) from ..config import ( TIMEOUT, SAVE_MERCURY, DEPENDENCIES, MERCURY_VERSION, + MERCURY_ARGS, + MERCURY_EXTRA_ARGS, ) from ..logging_util import TimedProgress @@ -60,12 +62,16 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) timer = TimedProgress(timeout, prefix=' ') try: output_folder.mkdir(exist_ok=True) - - # Get plain text version of article + # later options take precedence + options = [ + *MERCURY_ARGS, + *MERCURY_EXTRA_ARGS, + ] + # By default, get plain text version of article cmd = [ DEPENDENCIES['MERCURY_BINARY']['path'], link.url, - "--format=text" + *dedupe(options) ] result = run(cmd, cwd=out_dir, timeout=timeout) try: