diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index 5c370fa5..b4e65231 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -62,10 +62,17 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional help="Re-archive URLs from scratch, overwriting any existing files" ) parser.add_argument( - '--init', #'-i', + "--init", #'-i', action='store_true', help="Init/upgrade the curent data directory before adding", ) + parser.add_argument( + "--extract", + type=str, + help="Pass a list of the extractors to be used. If the method name is not correct, it will be ignored. \ + This does not take precedence over the configuration", + default="" + ) command = parser.parse_args(args or ()) urls = command.urls stdin_urls = accept_stdin(stdin) @@ -83,6 +90,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional overwrite=command.overwrite, init=command.init, out_dir=pwd or OUTPUT_DIR, + extractors=command.extract, ) diff --git a/archivebox/main.py b/archivebox/main.py index 44ee6b14..e27dff96 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -525,11 +525,14 @@ def add(urls: Union[str, List[str]], index_only: bool=False, overwrite: bool=False, init: bool=False, - out_dir: Path=OUTPUT_DIR) -> List[Link]: + out_dir: Path=OUTPUT_DIR, + extractors: str="") -> List[Link]: """Add a new URL or list of URLs to your archive""" assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)' + extractors = extractors.split(",") if extractors else [] + if init: run_subcommand('init', stdin=None, pwd=out_dir) @@ -567,12 +570,17 @@ def add(urls: Union[str, List[str]], return all_links # Run the archive methods for each link + archive_kwargs = { + "out_dir": out_dir, + } + if extractors: + archive_kwargs["methods"] = extractors if update_all: - archive_links(all_links, overwrite=overwrite, out_dir=out_dir) + archive_links(all_links, overwrite=overwrite, **archive_kwargs) elif overwrite: - archive_links(imported_links, overwrite=True, out_dir=out_dir) + archive_links(imported_links, overwrite=True, **archive_kwargs) elif new_links: - archive_links(new_links, overwrite=False, out_dir=out_dir) + archive_links(new_links, overwrite=False, **archive_kwargs) return all_links diff --git a/tests/test_add.py b/tests/test_add.py index 5e672e8d..bb15e51b 100644 --- a/tests/test_add.py +++ b/tests/test_add.py @@ -81,4 +81,13 @@ def test_add_updates_history_json_index(tmp_path, process, disable_extractors_di with open(archived_item_path / "index.json", "r") as f: output_json = json.load(f) - assert output_json["history"] != {} \ No newline at end of file + assert output_json["history"] != {} + +def test_extract_input_uses_only_passed_extractors(tmp_path, process): + subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--extract", "wget"], + capture_output=True) + + archived_item_path = list(tmp_path.glob('archive/**/*'))[0] + + assert (archived_item_path / "warc").exists() + assert not (archived_item_path / "singlefile.html").exists() \ No newline at end of file