From 5478d13d5254a2443a3a32645a6bb3118bfa7b8a Mon Sep 17 00:00:00 2001 From: jim winstead Date: Thu, 29 Feb 2024 18:15:06 -0800 Subject: [PATCH] Add generic_jsonl parser Resolves #1369 --- archivebox/parsers/__init__.py | 2 + archivebox/parsers/generic_json.py | 110 +++++++++--------- archivebox/parsers/generic_jsonl.py | 34 ++++++ .../templates/example-single.jsonl | 1 + tests/mock_server/templates/example.jsonl | 4 + tests/test_add.py | 70 +++++++++++ 6 files changed, 168 insertions(+), 53 deletions(-) create mode 100644 archivebox/parsers/generic_jsonl.py create mode 100644 tests/mock_server/templates/example-single.jsonl create mode 100644 tests/mock_server/templates/example.jsonl diff --git a/archivebox/parsers/__init__.py b/archivebox/parsers/__init__.py index c6f2f382..0cd39d8a 100644 --- a/archivebox/parsers/__init__.py +++ b/archivebox/parsers/__init__.py @@ -44,6 +44,7 @@ from . import medium_rss from . import netscape_html from . import generic_rss from . import generic_json +from . import generic_jsonl from . import generic_html from . import generic_txt from . 
# strptime format shared by every date-like key below ('%z' also accepts a trailing 'Z')
_JSON_DATE_FORMAT = '%Y-%m-%dT%H:%M:%S%z'

# Date-like keys consulted, in this priority order, once 'timestamp' and 'time' are absent
_JSON_DATE_KEYS = ('created_at', 'created', 'date', 'bookmarked', 'saved')


def _json_date_to_ts_str(value: str) -> str:
    """Parse a date string in _JSON_DATE_FORMAT and return its unix timestamp as a string."""
    return str(datetime.strptime(value, _JSON_DATE_FORMAT).timestamp())


# This gets used by generic_jsonl, too
def jsonObjectToLink(link: dict, source: str) -> 'Link':
    """Build a Link from one decoded JSON bookmark object.

    Args:
        link: a decoded JSON object (dict).  The URL is read from the first
            non-empty key among 'href', 'url', 'URL'.
        source: recorded as the only element of the Link's ``sources`` list.

    Returns:
        A Link whose url, title and tags have been passed through htmldecode.

    Raises:
        Exception: if ``link`` is not a dict, or if it has no URL.
        ValueError: if a date field does not match '%Y-%m-%dT%H:%M:%S%z'.
    """
    # example line
    # {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]

    # Fail with a readable message instead of an AttributeError on .get()
    if not isinstance(link, dict):
        raise Exception(
            'JSON entries must be objects like {"url": "http://...", ...}, got '
            + type(link).__name__
        )

    # Parse URL
    url = link.get('href') or link.get('url') or link.get('URL')
    if not url:
        raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]')

    # Parse the timestamp; fall back to "now" when no known date key is present
    ts_str = str(datetime.now(timezone.utc).timestamp())
    if link.get('timestamp'):
        # chrome/ff histories use a very precise timestamp
        ts_str = str(link['timestamp'] / 10000000)
    elif link.get('time'):
        # only the part before the first comma is parsed
        ts_str = _json_date_to_ts_str(link['time'].split(',', 1)[0])
    else:
        for date_key in _JSON_DATE_KEYS:
            if link.get(date_key):
                ts_str = _json_date_to_ts_str(link[date_key])
                break

    # Parse the title
    title = None
    if link.get('title'):
        title = link['title'].strip()
    elif link.get('description'):
        title = link['description'].replace(' — Readability', '').strip()
    elif link.get('name'):
        title = link['name'].strip()

    # if we have a list, join it with commas
    tags = link.get('tags')
    if isinstance(tags, list):
        tags = ','.join(tags)
    elif isinstance(tags, str):
        # if there's no comma, assume it was space-separated
        if ',' not in tags:
            tags = tags.replace(' ', ',')

    return Link(
        url=htmldecode(url),
        timestamp=ts_str,
        title=htmldecode(title) or None,
        tags=htmldecode(tags),
        sources=[source],
    )
def parse_line(line: str):
    """Decode a single line of a JSONL file.

    Returns the decoded JSON value, or None when the line is empty or
    contains only whitespace.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    if not line.strip():
        return None
    return json.loads(line)


@enforce_types
def parse_generic_jsonl_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:
    """Parse JSONL format bookmarks export files

    Rewinds ``json_file`` and yields one Link per line that decodes to a
    truthy JSON value; ``json_file.name`` is passed along as the link source.

    Lines are decoded one at a time rather than collected into a list first,
    so links that precede a malformed line are yielded before the
    json.JSONDecodeError for that line is raised.
    """

    json_file.seek(0)

    for line in json_file:
        entry = parse_line(line)
        # blank lines come back as None; other falsy values are skipped too
        if entry:
            yield jsonObjectToLink(entry, json_file.name)


KEY = 'jsonl'
NAME = 'Generic JSONL'
PARSER = parse_generic_jsonl_export
def _run_add_from_template(template_name, parser, env):
    """Feed a mock-server template to `archivebox add --index-only` on stdin.

    Returns the CompletedProcess, with stdout/stderr captured, so callers can
    inspect the output.
    """
    with open(f'../../mock_server/templates/{template_name}', 'r', encoding='utf-8') as f:
        return subprocess.run(
            ["archivebox", "add", "--index-only", f"--parser={parser}"],
            stdin=f,
            capture_output=True,
            env=env,
        )


def _index_urls_and_tags():
    """Return (snapshot urls, tag names) read from index.sqlite3 in the cwd."""
    conn = sqlite3.connect("index.sqlite3")
    try:
        c = conn.cursor()
        urls = [row[0] for row in c.execute("SELECT url from core_snapshot").fetchall()]
        tags = [row[0] for row in c.execute("SELECT name from core_tag").fetchall()]
    finally:
        # close even if a query raises, so the database file is not left open
        conn.close()
    return urls, tags


def test_jsonl(tmp_path, process, disable_extractors_dict):
    """All four entries of example.jsonl are imported by the jsonl parser."""
    _run_add_from_template('example.jsonl', 'jsonl', disable_extractors_dict)

    urls, tags = _index_urls_and_tags()

    assert "http://127.0.0.1:8080/static/example.com.html" in urls
    assert "http://127.0.0.1:8080/static/iana.org.html" in urls
    assert "http://127.0.0.1:8080/static/shift_jis.html" in urls
    assert "http://127.0.0.1:8080/static/title_og_with_html" in urls
    # if the following URL appears, we must have fallen back to another parser
    assert "http://www.example.com/should-not-exist" not in urls

    assert "Tag1" in tags
    assert "Tag2" in tags
    assert "Tag3" in tags
    assert "Tag4 with Space" in tags
    assert "Tag5" in tags
    assert "Tag6 with Space" in tags


def test_jsonl_single(tmp_path, process, disable_extractors_dict):
    """A one-line JSONL file is imported by the jsonl parser."""
    _run_add_from_template('example-single.jsonl', 'jsonl', disable_extractors_dict)

    urls, tags = _index_urls_and_tags()

    assert "http://127.0.0.1:8080/static/example.com.html" in urls
    # if the following URL appears, we must have fallen back to another parser
    assert "http://www.example.com/should-not-exist" not in urls

    assert "Tag1" in tags
    assert "Tag2" in tags


# make sure that JSON parser rejects a single line of JSONL which is valid
# JSON but not our expected format
def test_json_single(tmp_path, process, disable_extractors_dict):
    """The json parser reports that it expects a list when given one JSONL line."""
    arg_process = _run_add_from_template('example-single.jsonl', 'json', disable_extractors_dict)

    assert 'expects list of objects' in arg_process.stderr.decode("utf-8")