From ab8f395e0a4104dd01385be3d8fcea082a6987ee Mon Sep 17 00:00:00 2001 From: Ben Muthalaly Date: Fri, 23 Feb 2024 15:40:31 -0600 Subject: [PATCH] Add `YOUTUBEDL_EXTRA_ARGS` --- archivebox/config.py | 1 + archivebox/extractors/media.py | 12 +++++++++--- archivebox/extractors/wget.py | 2 +- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/archivebox/config.py b/archivebox/config.py index ebb939a4..00e3b9f0 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -176,6 +176,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { '--add-metadata', '--format=(bv*+ba/b)[filesize<={}][filesize_approx<=?{}]/(bv*+ba/b)'.format(c['MEDIA_MAX_SIZE'], c['MEDIA_MAX_SIZE']), ]}, + 'YOUTUBEDL_EXTRA_ARGS': {'type': list, 'default': None}, 'WGET_ARGS': {'type': list, 'default': ['--no-verbose', diff --git a/archivebox/extractors/media.py b/archivebox/extractors/media.py index 7d73024f..862bb758 100644 --- a/archivebox/extractors/media.py +++ b/archivebox/extractors/media.py @@ -8,11 +8,13 @@ from ..system import run, chmod_file from ..util import ( enforce_types, is_static_file, + dedupe, ) from ..config import ( MEDIA_TIMEOUT, SAVE_MEDIA, YOUTUBEDL_ARGS, + YOUTUBEDL_EXTRA_ARGS, YOUTUBEDL_BINARY, YOUTUBEDL_VERSION, CHECK_SSL_VALIDITY @@ -39,11 +41,15 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME output: ArchiveOutput = 'media' output_path = out_dir / output output_path.mkdir(exist_ok=True) - cmd = [ - YOUTUBEDL_BINARY, - *YOUTUBEDL_ARGS, + options = [ *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']), # TODO: add --cookies-from-browser={CHROME_USER_DATA_DIR} + *YOUTUBEDL_EXTRA_ARGS, + *YOUTUBEDL_ARGS, + ] + cmd = [ + YOUTUBEDL_BINARY, + *dedupe(*options), link.url, ] status = 'succeeded' diff --git a/archivebox/extractors/wget.py b/archivebox/extractors/wget.py index d50409b6..5209cde9 100644 --- a/archivebox/extractors/wget.py +++ b/archivebox/extractors/wget.py @@ -69,7 +69,7 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> *([] if SAVE_WARC else ['--timestamping']), *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']), # '--server-response', # print headers for better error parsing - *WGET_EXTRA_ARGS, + *WGET_EXTRA_ARGS, *WGET_ARGS, ] cmd = [