From d5fc13b34e0f29c67b52c05a3ba098f049830e60 Mon Sep 17 00:00:00 2001 From: Cristian Date: Tue, 7 Jul 2020 08:36:58 -0500 Subject: [PATCH 01/10] refactor: Move pytest fixtures to their own file --- tests/__init__.py | 0 tests/fixtures.py | 10 ++++++++++ tests/test_args.py | 0 tests/test_init.py | 9 +-------- 4 files changed, 11 insertions(+), 8 deletions(-) create mode 100644 tests/__init__.py create mode 100644 tests/fixtures.py create mode 100644 tests/test_args.py diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/fixtures.py b/tests/fixtures.py new file mode 100644 index 00000000..9bf2640a --- /dev/null +++ b/tests/fixtures.py @@ -0,0 +1,10 @@ +import os +import subprocess + +import pytest + +@pytest.fixture +def process(tmp_path): + os.chdir(tmp_path) + process = subprocess.run(['archivebox', 'init'], capture_output=True) + return process \ No newline at end of file diff --git a/tests/test_args.py b/tests/test_args.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_init.py b/tests/test_init.py index b870a599..1b80bb1b 100644 --- a/tests/test_init.py +++ b/tests/test_init.py @@ -6,14 +6,7 @@ import subprocess from pathlib import Path import json -import pytest - -@pytest.fixture -def process(tmp_path): - os.chdir(tmp_path) - process = subprocess.run(['archivebox', 'init'], capture_output=True) - return process - +from .fixtures import * def test_init(tmp_path, process): assert "Initializing a new ArchiveBox collection in this folder..." 
in process.stdout.decode("utf-8") From 8b22a2a7dd2507e164f0780fa38d73ba36912144 Mon Sep 17 00:00:00 2001 From: Cristian Date: Tue, 7 Jul 2020 09:10:36 -0500 Subject: [PATCH 02/10] feat: Enable --depth flag (still does nothing) --- archivebox/cli/archivebox_add.py | 13 +++++++------ tests/test_args.py | 7 +++++++ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index 272fe5cf..77a11bd0 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -45,6 +45,13 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional ' ~/Desktop/sites_list.csv\n' ) ) + parser.add_argument( + "--depth", + action="store", + default=0, + type=int, + help="Recursively archive all linked pages up to this many hops away" + ) command = parser.parse_args(args or ()) import_str = accept_stdin(stdin) add( @@ -63,12 +70,6 @@ if __name__ == '__main__': # TODO: Implement these # # parser.add_argument( -# '--depth', #'-d', -# type=int, -# help='Recursively archive all linked pages up to this many hops away', -# default=0, -# ) -# parser.add_argument( # '--mirror', #'-m', # action='store_true', # help='Archive an entire site (finding all linked pages below it on the same domain)', diff --git a/tests/test_args.py b/tests/test_args.py index e69de29b..b8df1941 100644 --- a/tests/test_args.py +++ b/tests/test_args.py @@ -0,0 +1,7 @@ +import subprocess + +from .fixtures import * + +def test_depth_flag_is_accepted(tmp_path, process): + arg_process = subprocess.run(["archivebox", "add", "https://example.com", "--depth=0"], capture_output=True) + assert 'unrecognized arguments: --depth' not in arg_process.stderr.decode('utf-8') \ No newline at end of file From 2db03245398f0a6c7fcda77a3ebc5688e3836396 Mon Sep 17 00:00:00 2001 From: Cristian Date: Tue, 7 Jul 2020 09:49:28 -0500 Subject: [PATCH 03/10] feat: depth=0 crawls the current page only --- 
archivebox/cli/archivebox_add.py | 14 +++++++++++--- tests/test_args.py | 12 ++++++++++-- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index 77a11bd0..5bbccb19 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -53,14 +53,22 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional help="Recursively archive all linked pages up to this many hops away" ) command = parser.parse_args(args or ()) - import_str = accept_stdin(stdin) + #import_str = accept_stdin(stdin) add( - import_str=import_str, - import_path=command.import_path, + import_str=command.import_path, + import_path=None, update_all=command.update_all, index_only=command.index_only, out_dir=pwd or OUTPUT_DIR, ) + #if command.depth == 1: + # add( + # import_str=None, + # import_path=command.import_path, + # update_all=command.update_all, + # index_only=command.index_only, + # out_dir=pwd or OUTPUT_DIR, + # ) if __name__ == '__main__': diff --git a/tests/test_args.py b/tests/test_args.py index b8df1941..59d43fee 100644 --- a/tests/test_args.py +++ b/tests/test_args.py @@ -1,7 +1,15 @@ import subprocess +import json from .fixtures import * -def test_depth_flag_is_accepted(tmp_path, process): +def test_depth_flag_is_accepted(process): arg_process = subprocess.run(["archivebox", "add", "https://example.com", "--depth=0"], capture_output=True) - assert 'unrecognized arguments: --depth' not in arg_process.stderr.decode('utf-8') \ No newline at end of file + assert 'unrecognized arguments: --depth' not in arg_process.stderr.decode('utf-8') + +def test_depth_flag_0_crawls_only_the_arg_page(tmp_path, process): + arg_process = subprocess.run(["archivebox", "add", "https://example.com", "--depth=0"], capture_output=True) + archived_item_path = list(tmp_path.glob('archive/**/*'))[0] + with open(archived_item_path / "index.json", "r") as f: + output_json = json.load(f) + 
assert output_json["base_url"] == "example.com" \ No newline at end of file From 32e790979e2f37c3615b52e0ed858603abd429a5 Mon Sep 17 00:00:00 2001 From: Cristian Date: Tue, 7 Jul 2020 10:07:44 -0500 Subject: [PATCH 04/10] feat: Enable depth=1 functionality --- archivebox/cli/archivebox_add.py | 16 ++++++++-------- tests/test_args.py | 9 ++++++++- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index 5bbccb19..65335679 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -61,14 +61,14 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional index_only=command.index_only, out_dir=pwd or OUTPUT_DIR, ) - #if command.depth == 1: - # add( - # import_str=None, - # import_path=command.import_path, - # update_all=command.update_all, - # index_only=command.index_only, - # out_dir=pwd or OUTPUT_DIR, - # ) + if command.depth == 1: + add( + import_str=None, + import_path=command.import_path, + update_all=command.update_all, + index_only=command.index_only, + out_dir=pwd or OUTPUT_DIR, + ) if __name__ == '__main__': diff --git a/tests/test_args.py b/tests/test_args.py index 59d43fee..e0c6020e 100644 --- a/tests/test_args.py +++ b/tests/test_args.py @@ -12,4 +12,11 @@ def test_depth_flag_0_crawls_only_the_arg_page(tmp_path, process): archived_item_path = list(tmp_path.glob('archive/**/*'))[0] with open(archived_item_path / "index.json", "r") as f: output_json = json.load(f) - assert output_json["base_url"] == "example.com" \ No newline at end of file + assert output_json["base_url"] == "example.com" + +def test_depth_flag_1_crawls_the_page_AND_links(tmp_path, process): + arg_process = subprocess.run(["archivebox", "add", "https://example.com", "--depth=1"], capture_output=True) + with open(tmp_path / "index.json", "r") as f: + archive_file = f.read() + assert "https://example.com" in archive_file + assert 
"https://www.iana.org/domains/example" in archive_file \ No newline at end of file From a6940092bbf37123e68e2c22418584fa9b4a2d88 Mon Sep 17 00:00:00 2001 From: Cristian Date: Tue, 7 Jul 2020 10:25:02 -0500 Subject: [PATCH 05/10] feat: Make sure that depth can only be either 1 or 0 --- archivebox/cli/archivebox_add.py | 2 +- tests/test_args.py | 10 ++++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index 65335679..2f77f754 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -49,11 +49,11 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional "--depth", action="store", default=0, + choices=[0,1], type=int, help="Recursively archive all linked pages up to this many hops away" ) command = parser.parse_args(args or ()) - #import_str = accept_stdin(stdin) add( import_str=command.import_path, import_path=None, diff --git a/tests/test_args.py b/tests/test_args.py index e0c6020e..91264ef2 100644 --- a/tests/test_args.py +++ b/tests/test_args.py @@ -5,7 +5,13 @@ from .fixtures import * def test_depth_flag_is_accepted(process): arg_process = subprocess.run(["archivebox", "add", "https://example.com", "--depth=0"], capture_output=True) - assert 'unrecognized arguments: --depth' not in arg_process.stderr.decode('utf-8') + assert 'unrecognized arguments: --depth' not in arg_process.stderr.decode("utf-8") + +def test_depth_flag_fails_if_it_is_not_0_or_1(process): + arg_process = subprocess.run(["archivebox", "add", "https://example.com", "--depth=5"], capture_output=True) + assert 'invalid choice' in arg_process.stderr.decode("utf-8") + arg_process = subprocess.run(["archivebox", "add", "https://example.com", "--depth=-1"], capture_output=True) + assert 'invalid choice' in arg_process.stderr.decode("utf-8") def test_depth_flag_0_crawls_only_the_arg_page(tmp_path, process): arg_process = subprocess.run(["archivebox", "add", 
"https://example.com", "--depth=0"], capture_output=True) @@ -19,4 +25,4 @@ def test_depth_flag_1_crawls_the_page_AND_links(tmp_path, process): with open(tmp_path / "index.json", "r") as f: archive_file = f.read() assert "https://example.com" in archive_file - assert "https://www.iana.org/domains/example" in archive_file \ No newline at end of file + assert "https://www.iana.org/domains/example" in archive_file From bca6a06f6035e7a10c9726ef40e7aed4b4b7ee34 Mon Sep 17 00:00:00 2001 From: Cristian Date: Tue, 7 Jul 2020 11:53:02 -0500 Subject: [PATCH 06/10] test: Fix test to reflect new API changes --- tests/test_init.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_init.py b/tests/test_init.py index 1b80bb1b..c5627a2f 100644 --- a/tests/test_init.py +++ b/tests/test_init.py @@ -25,9 +25,9 @@ def test_add_link(tmp_path, process): with open(archived_item_path / "index.json", "r") as f: output_json = json.load(f) - assert "IANA — IANA-managed Reserved Domains" == output_json['history']['title'][0]['output'] + assert "Example Domain" == output_json['history']['title'][0]['output'] with open(tmp_path / "index.html", "r") as f: output_html = f.read() - assert "IANA — IANA-managed Reserved Domains" in output_html + assert "Example Domain" in output_html From b68c13918f28246e8521080a03486dcbb7ff8537 Mon Sep 17 00:00:00 2001 From: Cristian Date: Tue, 7 Jul 2020 12:39:36 -0500 Subject: [PATCH 07/10] feat: Disable stdin from archivebox add --- archivebox/cli/archivebox_add.py | 6 ++++-- archivebox/main.py | 3 +-- tests/test_init.py | 6 ++++++ 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index 2f77f754..c729e9fb 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -10,7 +10,7 @@ from typing import List, Optional, IO from ..main import add, docstring from ..config import OUTPUT_DIR, ONLY_NEW -from .logging import 
SmartFormatter, accept_stdin +from .logging import SmartFormatter, reject_stdin @docstring(add.__doc__) @@ -38,9 +38,10 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional type=str, default=None, help=( - 'URL or path to local file containing a list of links to import. e.g.:\n' + 'URL or path to local file containing a page or list of links to import. e.g.:\n' ' https://getpocket.com/users/USERNAME/feed/all\n' ' https://example.com/some/rss/feed.xml\n' + ' https://example.com\n' ' ~/Downloads/firefox_bookmarks_export.html\n' ' ~/Desktop/sites_list.csv\n' ) @@ -54,6 +55,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional help="Recursively archive all linked pages up to this many hops away" ) command = parser.parse_args(args or ()) + reject_stdin(__command__, stdin) add( import_str=command.import_path, import_path=None, diff --git a/archivebox/main.py b/archivebox/main.py index f1fb98ce..3f05a385 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -507,8 +507,7 @@ def add(import_str: Optional[str]=None, if (import_str and import_path) or (not import_str and not import_path): stderr( - '[X] You should pass either an import path as an argument, ' - 'or pass a list of links via stdin, but not both.\n', + '[X] You should pass an import path or a page url as an argument\n', color='red', ) raise SystemExit(2) diff --git a/tests/test_init.py b/tests/test_init.py index c5627a2f..d592b0a1 100644 --- a/tests/test_init.py +++ b/tests/test_init.py @@ -31,3 +31,9 @@ def test_add_link(tmp_path, process): output_html = f.read() assert "Example Domain" in output_html +def test_add_link_does_not_support_stdin(tmp_path, process): + os.chdir(tmp_path) + stdin_process = subprocess.Popen(["archivebox", "add"], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + output = stdin_process.communicate(input="example.com".encode())[0] + assert "does not accept stdin" in output.decode("utf-8") + From 
c1d8a74e4f2673047e31b96aa303fbd300dccc50 Mon Sep 17 00:00:00 2001 From: Cristian Date: Tue, 7 Jul 2020 15:46:45 -0500 Subject: [PATCH 08/10] feat: Make input sent via stdin behave the same as using args --- archivebox/cli/archivebox_add.py | 19 +++++++++++++++---- tests/test_init.py | 12 +++++++++--- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index c729e9fb..c692750b 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -10,7 +10,7 @@ from typing import List, Optional, IO from ..main import add, docstring from ..config import OUTPUT_DIR, ONLY_NEW -from .logging import SmartFormatter, reject_stdin +from .logging import SmartFormatter, accept_stdin @docstring(add.__doc__) @@ -55,9 +55,20 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional help="Recursively archive all linked pages up to this many hops away" ) command = parser.parse_args(args or ()) - reject_stdin(__command__, stdin) + import_string = accept_stdin(stdin) + if import_string and command.import_path: + stderr( + '[X] You should pass an import path or a page url as an argument or in stdin but not both\n', + color='red', + ) + raise SystemExit(2) + elif import_string: + import_path = import_string + else: + import_path = command.import_path + add( - import_str=command.import_path, + import_str=import_path, import_path=None, update_all=command.update_all, index_only=command.index_only, @@ -66,7 +77,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional if command.depth == 1: add( import_str=None, - import_path=command.import_path, + import_path=import_path, update_all=command.update_all, index_only=command.index_only, out_dir=pwd or OUTPUT_DIR, diff --git a/tests/test_init.py b/tests/test_init.py index d592b0a1..97870459 100644 --- a/tests/test_init.py +++ b/tests/test_init.py @@ -31,9 +31,15 @@ def test_add_link(tmp_path, 
process): output_html = f.read() assert "Example Domain" in output_html -def test_add_link_does_not_support_stdin(tmp_path, process): +def test_add_link_support_stdin(tmp_path, process): os.chdir(tmp_path) stdin_process = subprocess.Popen(["archivebox", "add"], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - output = stdin_process.communicate(input="example.com".encode())[0] - assert "does not accept stdin" in output.decode("utf-8") + stdin_process.communicate(input="http://example.com".encode()) + archived_item_path = list(tmp_path.glob('archive/**/*'))[0] + + assert "index.json" in [x.name for x in archived_item_path.iterdir()] + + with open(archived_item_path / "index.json", "r") as f: + output_json = json.load(f) + assert "Example Domain" == output_json['history']['title'][0]['output'] From f12bfeb3229345b2d4cd7c1670ba050ca1111e7c Mon Sep 17 00:00:00 2001 From: Cristian Date: Wed, 8 Jul 2020 08:17:47 -0500 Subject: [PATCH 09/10] refactor: Change add() to receive url and depth instead of import_str and import_path --- archivebox/cli/archivebox_add.py | 12 ++---------- archivebox/core/views.py | 8 +++----- archivebox/main.py | 25 ++++++++++--------------- 3 files changed, 15 insertions(+), 30 deletions(-) diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index c692750b..8f491d42 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -68,20 +68,12 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional import_path = command.import_path add( - import_str=import_path, - import_path=None, + url=import_path, + depth=command.depth, update_all=command.update_all, index_only=command.index_only, out_dir=pwd or OUTPUT_DIR, ) - if command.depth == 1: - add( - import_str=None, - import_path=import_path, - update_all=command.update_all, - index_only=command.index_only, - out_dir=pwd or OUTPUT_DIR, - ) if __name__ == '__main__': diff --git 
a/archivebox/core/views.py b/archivebox/core/views.py index 0c5efff2..a721b992 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -66,12 +66,10 @@ class AddLinks(View): if form.is_valid(): url = form.cleaned_data["url"] print(f'[+] Adding URL: {url}') - if form.cleaned_data["source"] == "url": - key = "import_str" - else: - key = "import_path" + depth = 0 if form.cleaned_data["source"] == "url" else 1 input_kwargs = { - key: url, + "url": url, + "depth": depth, "update_all": False, "out_dir": OUTPUT_DIR, } diff --git a/archivebox/main.py b/archivebox/main.py index 3f05a385..a96c4250 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -496,8 +496,8 @@ def status(out_dir: str=OUTPUT_DIR) -> None: @enforce_types -def add(import_str: Optional[str]=None, - import_path: Optional[str]=None, +def add(url: str, + depth: int=0, update_all: bool=not ONLY_NEW, index_only: bool=False, out_dir: str=OUTPUT_DIR) -> List[Link]: @@ -505,17 +505,9 @@ def add(import_str: Optional[str]=None, check_data_folder(out_dir=out_dir) - if (import_str and import_path) or (not import_str and not import_path): - stderr( - '[X] You should pass an import path or a page url as an argument\n', - color='red', - ) - raise SystemExit(2) - elif import_str: - import_path = save_stdin_to_sources(import_str, out_dir=out_dir) - elif import_path: - import_path = save_file_to_sources(import_path, out_dir=out_dir) - + base_path = save_stdin_to_sources(url, out_dir=out_dir) + if depth == 1: + depth_path = save_file_to_sources(url, out_dir=out_dir) check_dependencies() # Step 1: Load list of links from the existing index @@ -523,8 +515,11 @@ def add(import_str: Optional[str]=None, all_links: List[Link] = [] new_links: List[Link] = [] all_links = load_main_index(out_dir=out_dir) - if import_path: - all_links, new_links = import_new_links(all_links, import_path, out_dir=out_dir) + all_links, new_links = import_new_links(all_links, base_path, out_dir=out_dir) + if depth == 1: + 
all_links, new_links_depth = import_new_links(all_links, depth_path, out_dir=out_dir) + new_links = new_links + new_links_depth + # Step 2: Write updated index with deduped old and new links back to disk write_main_index(links=all_links, out_dir=out_dir) From 4ebf929606b50afcce94f2440a7ac363cc96a887 Mon Sep 17 00:00:00 2001 From: Cristian Date: Wed, 8 Jul 2020 08:30:07 -0500 Subject: [PATCH 10/10] refactor: Change wording on CLI help --- archivebox/cli/archivebox_add.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index 8f491d42..c4c78399 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -38,7 +38,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional type=str, default=None, help=( - 'URL or path to local file containing a page or list of links to import. e.g.:\n' + 'URL or path to local file to start the archiving process from. e.g.:\n' ' https://getpocket.com/users/USERNAME/feed/all\n' ' https://example.com/some/rss/feed.xml\n' ' https://example.com\n'