From 178e676e0f27704b1ead99c554f8a65426bc9ca8 Mon Sep 17 00:00:00 2001
From: jim winstead <jimw@trainedmonkey.com>
Date: Tue, 27 Feb 2024 14:48:19 -0800
Subject: [PATCH 1/3] Fix JSON parser by not always mangling the input

Rather than by assuming the JSON file we are parsing has junk at the beginning
(which maybe only used to happen?), try parsing it as-is first, and then fall
back to trying again after skipping the first line

Fixes #1347
---
 archivebox/parsers/generic_json.py           | 19 ++++++--
 tests/mock_server/templates/example.json     |  1 +
 tests/mock_server/templates/example.json.bad |  2 +
 tests/test_add.py                            | 50 ++++++++++++++++++++
 4 files changed, 68 insertions(+), 4 deletions(-)
 create mode 100644 tests/mock_server/templates/example.json
 create mode 100644 tests/mock_server/templates/example.json.bad

diff --git a/archivebox/parsers/generic_json.py b/archivebox/parsers/generic_json.py
index daebb7c4..d8df70c3 100644
--- a/archivebox/parsers/generic_json.py
+++ b/archivebox/parsers/generic_json.py
@@ -18,9 +18,16 @@ def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:
 
     json_file.seek(0)
 
-    # sometimes the first line is a comment or filepath, so we get everything after the first {
-    json_file_json_str = '{' + json_file.read().split('{', 1)[-1]
-    links = json.loads(json_file_json_str)
+    try:
+        links = json.load(json_file)
+    except json.decoder.JSONDecodeError:
+        # sometimes the first line is a comment or other junk, so try without
+        json_file.seek(0)
+        first_line = json_file.readline()
+        #print('      > Trying JSON parser without first line: "', first_line.strip(), '"', sep= '')
+        links = json.load(json_file)
+        # we may fail again, which means we really don't know what to do
+
     json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')
 
     for link in links:
@@ -59,11 +66,15 @@ def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:
             elif link.get('name'):
                 title = link['name'].strip()
 
+            tags = ''
+            if link.get('tags'):
+                tags = link.get('tags').replace(' ',',')
+
             yield Link(
                 url=htmldecode(url),
                 timestamp=ts_str,
                 title=htmldecode(title) or None,
-                tags=htmldecode(link.get('tags')) or '',
+                tags=htmldecode(tags),
                 sources=[json_file.name],
             )
 
diff --git a/tests/mock_server/templates/example.json b/tests/mock_server/templates/example.json
new file mode 100644
index 00000000..512febe5
--- /dev/null
+++ b/tests/mock_server/templates/example.json
@@ -0,0 +1 @@
+[{"href":"http://127.0.0.1:8080/static/example.com.html","description":"Example","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"Tag1 Tag2","trap":"http://www.example.com/should-not-exist"}]
diff --git a/tests/mock_server/templates/example.json.bad b/tests/mock_server/templates/example.json.bad
new file mode 100644
index 00000000..88d77757
--- /dev/null
+++ b/tests/mock_server/templates/example.json.bad
@@ -0,0 +1,2 @@
+this line would cause problems but --parser=json will actually skip it
+[{"href":"http://127.0.0.1:8080/static/example.com.html","description":"Example","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"Tag1 Tag2","trap":"http://www.example.com/should-not-exist"}]
diff --git a/tests/test_add.py b/tests/test_add.py
index 331178fe..062de11e 100644
--- a/tests/test_add.py
+++ b/tests/test_add.py
@@ -91,3 +91,53 @@ def test_extract_input_uses_only_passed_extractors(tmp_path, process):
 
     assert (archived_item_path / "warc").exists()
     assert not (archived_item_path / "singlefile.html").exists()
+
+def test_json(tmp_path, process, disable_extractors_dict):
+    with open('../../mock_server/templates/example.json', 'r', encoding='utf-8') as f:
+        arg_process = subprocess.run(
+            ["archivebox", "add", "--index-only", "--parser=json"],
+            stdin=f,
+            capture_output=True,
+            env=disable_extractors_dict,
+        )
+
+    conn = sqlite3.connect("index.sqlite3")
+    c = conn.cursor()
+    urls = c.execute("SELECT url from core_snapshot").fetchall()
+    tags = c.execute("SELECT name from core_tag").fetchall()
+    conn.commit()
+    conn.close()
+
+    urls = list(map(lambda x: x[0], urls))
+    assert "http://127.0.0.1:8080/static/example.com.html" in urls
+    # if the following URL appears, we must have fallen back to another parser
+    assert not "http://www.example.com/should-not-exist" in urls
+
+    tags = list(map(lambda x: x[0], tags))
+    assert "Tag1" in tags
+    assert "Tag2" in tags
+
+def test_json_with_leading_garbage(tmp_path, process, disable_extractors_dict):
+    with open('../../mock_server/templates/example.json.bad', 'r', encoding='utf-8') as f:
+        arg_process = subprocess.run(
+            ["archivebox", "add", "--index-only", "--parser=json"],
+            stdin=f,
+            capture_output=True,
+            env=disable_extractors_dict,
+        )
+
+    conn = sqlite3.connect("index.sqlite3")
+    c = conn.cursor()
+    urls = c.execute("SELECT url from core_snapshot").fetchall()
+    tags = c.execute("SELECT name from core_tag").fetchall()
+    conn.commit()
+    conn.close()
+
+    urls = list(map(lambda x: x[0], urls))
+    assert "http://127.0.0.1:8080/static/example.com.html" in urls
+    # if the following URL appears, we must have fallen back to another parser
+    assert not "http://www.example.com/should-not-exist" in urls
+
+    tags = list(map(lambda x: x[0], tags))
+    assert "Tag1" in tags
+    assert "Tag2" in tags

From ccabda4c7d17f064feb413e9268b7d0c4f02029f Mon Sep 17 00:00:00 2001
From: jim winstead <jimw@trainedmonkey.com>
Date: Wed, 28 Feb 2024 17:38:49 -0800
Subject: [PATCH 2/3] Handle list of tags in JSON, and be more clever about
 comma vs. space

---
 archivebox/parsers/generic_json.py       | 11 ++++++++---
 tests/mock_server/templates/example.json |  7 ++++++-
 tests/test_add.py                        |  7 +++++++
 3 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/archivebox/parsers/generic_json.py b/archivebox/parsers/generic_json.py
index d8df70c3..9d12a4ef 100644
--- a/archivebox/parsers/generic_json.py
+++ b/archivebox/parsers/generic_json.py
@@ -66,9 +66,14 @@ def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:
             elif link.get('name'):
                 title = link['name'].strip()
 
-            tags = ''
-            if link.get('tags'):
-                tags = link.get('tags').replace(' ',',')
+            # if we have a list, join it with commas
+            tags = link.get('tags')
+            if type(tags) == list:
+                tags = ','.join(tags)
+            elif type(tags) == str:
+                # if there's no comma, assume it was space-separated
+                if ',' not in tags:
+                    tags = tags.replace(' ', ',')
 
             yield Link(
                 url=htmldecode(url),
diff --git a/tests/mock_server/templates/example.json b/tests/mock_server/templates/example.json
index 512febe5..6ee15597 100644
--- a/tests/mock_server/templates/example.json
+++ b/tests/mock_server/templates/example.json
@@ -1 +1,6 @@
-[{"href":"http://127.0.0.1:8080/static/example.com.html","description":"Example","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"Tag1 Tag2","trap":"http://www.example.com/should-not-exist"}]
+[
+{"href":"http://127.0.0.1:8080/static/example.com.html","description":"Example","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"Tag1 Tag2","trap":"http://www.example.com/should-not-exist"},
+{"href":"http://127.0.0.1:8080/static/iana.org.html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:43Z","shared":"no","toread":"no","tags":"Tag3,Tag4 with Space"},
+{"href":"http://127.0.0.1:8080/static/shift_jis.html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:44Z","shared":"no","toread":"no","tags":["Tag5","Tag6 with Space"]},
+{"href":"http://127.0.0.1:8080/static/title_og_with_html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:45Z","shared":"no","toread":"no"}
+]
diff --git a/tests/test_add.py b/tests/test_add.py
index 062de11e..dd1307bb 100644
--- a/tests/test_add.py
+++ b/tests/test_add.py
@@ -110,12 +110,19 @@ def test_json(tmp_path, process, disable_extractors_dict):
 
     urls = list(map(lambda x: x[0], urls))
     assert "http://127.0.0.1:8080/static/example.com.html" in urls
+    assert "http://127.0.0.1:8080/static/iana.org.html" in urls
+    assert "http://127.0.0.1:8080/static/shift_jis.html" in urls
+    assert "http://127.0.0.1:8080/static/title_og_with_html" in urls
     # if the following URL appears, we must have fallen back to another parser
     assert not "http://www.example.com/should-not-exist" in urls
 
     tags = list(map(lambda x: x[0], tags))
     assert "Tag1" in tags
     assert "Tag2" in tags
+    assert "Tag3" in tags
+    assert "Tag4 with Space" in tags
+    assert "Tag5" in tags
+    assert "Tag6 with Space" in tags
 
 def test_json_with_leading_garbage(tmp_path, process, disable_extractors_dict):
     with open('../../mock_server/templates/example.json.bad', 'r', encoding='utf-8') as f:

From fe11e1c2f47487b419497bac38aafbd433ed689a Mon Sep 17 00:00:00 2001
From: Nick Sweeting <git@sweeting.me>
Date: Wed, 28 Feb 2024 18:19:44 -0800
Subject: [PATCH 3/3] check if COOKIE_FILE is file

---
 archivebox/util.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/archivebox/util.py b/archivebox/util.py
index 2e1e4907..9b570ec9 100644
--- a/archivebox/util.py
+++ b/archivebox/util.py
@@ -174,7 +174,7 @@ def download_url(url: str, timeout: int=None) -> str:
     timeout = timeout or TIMEOUT
 
     cookie_jar = http.cookiejar.MozillaCookieJar()
-    if COOKIES_FILE is not None:
+    if COOKIES_FILE and Path(COOKIES_FILE).is_file():
         cookie_jar.load(COOKIES_FILE, ignore_discard=True, ignore_expires=True)
     else:
         cookie_jar = None