From 178e676e0f27704b1ead99c554f8a65426bc9ca8 Mon Sep 17 00:00:00 2001 From: jim winstead Date: Tue, 27 Feb 2024 14:48:19 -0800 Subject: [PATCH 1/3] Fix JSON parser by not always mangling the input Rather than by assuming the JSON file we are parsing has junk at the beginning (which maybe only used to happen?), try parsing it as-is first, and then fall back to trying again after skipping the first line Fixes #1347 --- archivebox/parsers/generic_json.py | 19 ++++++-- tests/mock_server/templates/example.json | 1 + tests/mock_server/templates/example.json.bad | 2 + tests/test_add.py | 50 ++++++++++++++++++++ 4 files changed, 68 insertions(+), 4 deletions(-) create mode 100644 tests/mock_server/templates/example.json create mode 100644 tests/mock_server/templates/example.json.bad diff --git a/archivebox/parsers/generic_json.py b/archivebox/parsers/generic_json.py index daebb7c4..d8df70c3 100644 --- a/archivebox/parsers/generic_json.py +++ b/archivebox/parsers/generic_json.py @@ -18,9 +18,16 @@ def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]: json_file.seek(0) - # sometimes the first line is a comment or filepath, so we get everything after the first { - json_file_json_str = '{' + json_file.read().split('{', 1)[-1] - links = json.loads(json_file_json_str) + try: + links = json.load(json_file) + except json.decoder.JSONDecodeError: + # sometimes the first line is a comment or other junk, so try without + json_file.seek(0) + first_line = json_file.readline() + #print(' > Trying JSON parser without first line: "', first_line.strip(), '"', sep= '') + links = json.load(json_file) + # we may fail again, which means we really don't know what to do + json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z') for link in links: @@ -59,11 +66,15 @@ def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]: elif link.get('name'): title = link['name'].strip() + tags = '' + if link.get('tags'): + tags = link.get('tags').replace(' ',',') + yield Link( url=htmldecode(url), timestamp=ts_str, title=htmldecode(title) or None, - tags=htmldecode(link.get('tags')) or '', + tags=htmldecode(tags), sources=[json_file.name], ) diff --git a/tests/mock_server/templates/example.json b/tests/mock_server/templates/example.json new file mode 100644 index 00000000..512febe5 --- /dev/null +++ b/tests/mock_server/templates/example.json @@ -0,0 +1 @@ +[{"href":"http://127.0.0.1:8080/static/example.com.html","description":"Example","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"Tag1 Tag2","trap":"http://www.example.com/should-not-exist"}] diff --git a/tests/mock_server/templates/example.json.bad b/tests/mock_server/templates/example.json.bad new file mode 100644 index 00000000..88d77757 --- /dev/null +++ b/tests/mock_server/templates/example.json.bad @@ -0,0 +1,2 @@ +this line would cause problems but --parser=json will actually skip it +[{"href":"http://127.0.0.1:8080/static/example.com.html","description":"Example","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"Tag1 Tag2","trap":"http://www.example.com/should-not-exist"}] diff --git a/tests/test_add.py b/tests/test_add.py index 331178fe..062de11e 100644 --- a/tests/test_add.py +++ b/tests/test_add.py @@ -91,3 +91,53 @@ def test_extract_input_uses_only_passed_extractors(tmp_path, process): assert (archived_item_path / "warc").exists() assert not (archived_item_path / "singlefile.html").exists() + +def test_json(tmp_path, process, disable_extractors_dict): + with open('../../mock_server/templates/example.json', 'r', encoding='utf-8') as f: + arg_process = subprocess.run( + ["archivebox", "add", "--index-only", "--parser=json"], + stdin=f, + capture_output=True, + env=disable_extractors_dict, + ) + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + urls = c.execute("SELECT url from core_snapshot").fetchall() + tags = c.execute("SELECT name from core_tag").fetchall() + conn.commit() + conn.close() + + urls = list(map(lambda x: x[0], urls)) + assert "http://127.0.0.1:8080/static/example.com.html" in urls + # if the following URL appears, we must have fallen back to another parser + assert not "http://www.example.com/should-not-exist" in urls + + tags = list(map(lambda x: x[0], tags)) + assert "Tag1" in tags + assert "Tag2" in tags + +def test_json_with_leading_garbage(tmp_path, process, disable_extractors_dict): + with open('../../mock_server/templates/example.json.bad', 'r', encoding='utf-8') as f: + arg_process = subprocess.run( + ["archivebox", "add", "--index-only", "--parser=json"], + stdin=f, + capture_output=True, + env=disable_extractors_dict, + ) + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + urls = c.execute("SELECT url from core_snapshot").fetchall() + tags = c.execute("SELECT name from core_tag").fetchall() + conn.commit() + conn.close() + + urls = list(map(lambda x: x[0], urls)) + assert "http://127.0.0.1:8080/static/example.com.html" in urls + # if the following URL appears, we must have fallen back to another parser + assert not "http://www.example.com/should-not-exist" in urls + + tags = list(map(lambda x: x[0], tags)) + assert "Tag1" in tags + assert "Tag2" in tags From ccabda4c7d17f064feb413e9268b7d0c4f02029f Mon Sep 17 00:00:00 2001 From: jim winstead Date: Wed, 28 Feb 2024 17:38:49 -0800 Subject: [PATCH 2/3] Handle list of tags in JSON, and be more clever about comma vs. space --- archivebox/parsers/generic_json.py | 11 ++++++++--- tests/mock_server/templates/example.json | 7 ++++++- tests/test_add.py | 7 +++++++ 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/archivebox/parsers/generic_json.py b/archivebox/parsers/generic_json.py index d8df70c3..9d12a4ef 100644 --- a/archivebox/parsers/generic_json.py +++ b/archivebox/parsers/generic_json.py @@ -66,9 +66,14 @@ def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]: elif link.get('name'): title = link['name'].strip() - tags = '' - if link.get('tags'): - tags = link.get('tags').replace(' ',',') + # if we have a list, join it with commas + tags = link.get('tags') + if type(tags) == list: + tags = ','.join(tags) + elif type(tags) == str: + # if there's no comma, assume it was space-separated + if ',' not in tags: + tags = tags.replace(' ', ',') yield Link( url=htmldecode(url), diff --git a/tests/mock_server/templates/example.json b/tests/mock_server/templates/example.json index 512febe5..6ee15597 100644 --- a/tests/mock_server/templates/example.json +++ b/tests/mock_server/templates/example.json @@ -1 +1,6 @@ -[{"href":"http://127.0.0.1:8080/static/example.com.html","description":"Example","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"Tag1 Tag2","trap":"http://www.example.com/should-not-exist"}] +[ +{"href":"http://127.0.0.1:8080/static/example.com.html","description":"Example","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"Tag1 Tag2","trap":"http://www.example.com/should-not-exist"}, +{"href":"http://127.0.0.1:8080/static/iana.org.html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:43Z","shared":"no","toread":"no","tags":"Tag3,Tag4 with Space"}, +{"href":"http://127.0.0.1:8080/static/shift_jis.html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:44Z","shared":"no","toread":"no","tags":["Tag5","Tag6 with Space"]}, +{"href":"http://127.0.0.1:8080/static/title_og_with_html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:45Z","shared":"no","toread":"no"} +] diff --git a/tests/test_add.py b/tests/test_add.py index 062de11e..dd1307bb 100644 --- a/tests/test_add.py +++ b/tests/test_add.py @@ -110,12 +110,19 @@ def test_json(tmp_path, process, disable_extractors_dict): urls = list(map(lambda x: x[0], urls)) assert "http://127.0.0.1:8080/static/example.com.html" in urls + assert "http://127.0.0.1:8080/static/iana.org.html" in urls + assert "http://127.0.0.1:8080/static/shift_jis.html" in urls + assert "http://127.0.0.1:8080/static/title_og_with_html" in urls # if the following URL appears, we must have fallen back to another parser assert not "http://www.example.com/should-not-exist" in urls tags = list(map(lambda x: x[0], tags)) assert "Tag1" in tags assert "Tag2" in tags + assert "Tag3" in tags + assert "Tag4 with Space" in tags + assert "Tag5" in tags + assert "Tag6 with Space" in tags def test_json_with_leading_garbage(tmp_path, process, disable_extractors_dict): with open('../../mock_server/templates/example.json.bad', 'r', encoding='utf-8') as f: From fe11e1c2f47487b419497bac38aafbd433ed689a Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Wed, 28 Feb 2024 18:19:44 -0800 Subject: [PATCH 3/3] check if COOKIE_FILE is file --- archivebox/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivebox/util.py b/archivebox/util.py index 2e1e4907..9b570ec9 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -174,7 +174,7 @@ def download_url(url: str, timeout: int=None) -> str: timeout = timeout or TIMEOUT cookie_jar = http.cookiejar.MozillaCookieJar() - if COOKIES_FILE is not None: + if COOKIES_FILE and Path(COOKIES_FILE).is_file(): cookie_jar.load(COOKIES_FILE, ignore_discard=True, ignore_expires=True) else: cookie_jar = None