1
0
Fork 0
mirror of synced 2024-06-01 10:09:49 +12:00

Handle list of tags in JSON, and be more clever about comma vs. space

This commit is contained in:
jim winstead 2024-02-28 17:38:49 -08:00
parent 178e676e0f
commit ccabda4c7d
3 changed files with 21 additions and 4 deletions

View file

@@ -66,9 +66,14 @@ def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:
 elif link.get('name'):
     title = link['name'].strip()
-tags = ''
-if link.get('tags'):
-    tags = link.get('tags').replace(' ',',')
+# if we have a list, join it with commas
+tags = link.get('tags')
+if type(tags) == list:
+    tags = ','.join(tags)
+elif type(tags) == str:
+    # if there's no comma, assume it was space-separated
+    if ',' not in tags:
+        tags = tags.replace(' ', ',')
 yield Link(
     url=htmldecode(url),

View file

@@ -1 +1,6 @@
-[{"href":"http://127.0.0.1:8080/static/example.com.html","description":"Example","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"Tag1 Tag2","trap":"http://www.example.com/should-not-exist"}]
+[
+{"href":"http://127.0.0.1:8080/static/example.com.html","description":"Example","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"Tag1 Tag2","trap":"http://www.example.com/should-not-exist"},
+{"href":"http://127.0.0.1:8080/static/iana.org.html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:43Z","shared":"no","toread":"no","tags":"Tag3,Tag4 with Space"},
+{"href":"http://127.0.0.1:8080/static/shift_jis.html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:44Z","shared":"no","toread":"no","tags":["Tag5","Tag6 with Space"]},
+{"href":"http://127.0.0.1:8080/static/title_og_with_html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:45Z","shared":"no","toread":"no"}
+]

View file

@@ -110,12 +110,19 @@ def test_json(tmp_path, process, disable_extractors_dict):
     urls = list(map(lambda x: x[0], urls))
     assert "http://127.0.0.1:8080/static/example.com.html" in urls
+    assert "http://127.0.0.1:8080/static/iana.org.html" in urls
+    assert "http://127.0.0.1:8080/static/shift_jis.html" in urls
+    assert "http://127.0.0.1:8080/static/title_og_with_html" in urls
     # if the following URL appears, we must have fallen back to another parser
     assert not "http://www.example.com/should-not-exist" in urls
     tags = list(map(lambda x: x[0], tags))
     assert "Tag1" in tags
     assert "Tag2" in tags
+    assert "Tag3" in tags
+    assert "Tag4 with Space" in tags
+    assert "Tag5" in tags
+    assert "Tag6 with Space" in tags
 def test_json_with_leading_garbage(tmp_path, process, disable_extractors_dict):
     with open('../../mock_server/templates/example.json.bad', 'r', encoding='utf-8') as f: