Use feedparser for RSS parsing (#1362)

Fixes #1171 Fixes #870 (probably, would need to test against a Wallabag Atom file to Fixes #135 Fixes #123 Fixes #106
2024-04-29 18:14:38 +12:00 · 2024-03-14 01:51:45 -07:00 · 2024-03-14 01:51:45 -07:00 · 099f7d00fe
parent 3512dc7e60 0f402df42f
commit 099f7d00fe
6 changed files with 158 additions and 50 deletions
--- a/archivebox/parsers/generic_rss.py
+++ b/archivebox/parsers/generic_rss.py
@ -2,13 +2,13 @@ __package__ = 'archivebox.parsers'


 from typing import IO, Iterable
-from datetime import datetime
+from time import mktime
+from feedparser import parse as feedparser

 from ..index.schema import Link
 from ..util import (
    htmldecode,
-    enforce_types,
-    str_between,
+    enforce_types
 )

@enforce_types
@ -16,35 +16,27 @@ def parse_generic_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
    """Parse RSS XML-format files into links"""

    rss_file.seek(0)
-    items = rss_file.read().split('<item>')
-    items = items[1:] if items else []
-    for item in items:
-        # example item:
-        # <item>
-        # <title><![CDATA[How JavaScript works: inside the V8 engine]]></title>
-        # <category>Unread</category>
-        # <link>https://blog.sessionstack.com/how-javascript-works-inside</link>
-        # <guid>https://blog.sessionstack.com/how-javascript-works-inside</guid>
-        # <pubDate>Mon, 21 Aug 2017 14:21:58 -0500</pubDate>
-        # </item>
+    feed = feedparser(rss_file.read())
+    for item in feed.entries:
+        url = item.link
+        title = item.title
+        time = mktime(item.updated_parsed)

-        trailing_removed = item.split('</item>', 1)[0]
-        leading_removed = trailing_removed.split('<item>', 1)[-1].strip()
-        rows = leading_removed.split('\n')
+        try:
+            tags = ','.join(map(lambda tag: tag.term, item.tags))
+        except AttributeError:
+            tags = ''

-        def get_row(key):
-            return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0]
-
-        url = str_between(get_row('link'), '<link>', '</link>')
-        ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
-        time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")
-        title = str_between(get_row('title'), '<![CDATA[', ']]').strip()
+        if url is None:
+            # Yielding a Link with no URL will
+            # crash on a URL validation assertion
+            continue

        yield Link(
            url=htmldecode(url),
-            timestamp=str(time.timestamp()),
+            timestamp=str(time),
            title=htmldecode(title) or None,
-            tags=None,
+            tags=tags,
            sources=[rss_file.name],
        )

--- a/archivebox/parsers/pinboard_rss.py
+++ b/archivebox/parsers/pinboard_rss.py
@ -2,50 +2,41 @@ __package__ = 'archivebox.parsers'


 from typing import IO, Iterable
-from datetime import datetime, timezone
-
-from xml.etree import ElementTree
+from time import mktime
+from feedparser import parse as feedparser

 from ..index.schema import Link
 from ..util import (
    htmldecode,
-    enforce_types,
+    enforce_types
 )

-
@enforce_types
 def parse_pinboard_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
    """Parse Pinboard RSS feed files into links"""

    rss_file.seek(0)
-    root = ElementTree.parse(rss_file).getroot()
-    items = root.findall("{http://purl.org/rss/1.0/}item")
-    for item in items:
-        find = lambda p: item.find(p).text.strip() if item.find(p) is not None else None    # type: ignore
+    feed = feedparser(rss_file.read())
+    for item in feed.entries:
+        url = item.link
+        # title will start with "[priv] " if pin was marked private. useful?
+        title = item.title
+        time = mktime(item.updated_parsed)

-        url = find("{http://purl.org/rss/1.0/}link")
-        tags = find("{http://purl.org/dc/elements/1.1/}subject")
-        title = find("{http://purl.org/rss/1.0/}title")
-        ts_str = find("{http://purl.org/dc/elements/1.1/}date")
+        # all tags are in one entry.tags with spaces in it. annoying!
+        try:
+            tags = item.tags[0].term.replace(' ', ',')
+        except AttributeError:
+            tags = ''
        
        if url is None:
            # Yielding a Link with no URL will
            # crash on a URL validation assertion
            continue

-        # Pinboard includes a colon in its date stamp timezone offsets, which
-        # Python can't parse. Remove it:
-        if ts_str and ts_str[-3:-2] == ":":
-            ts_str = ts_str[:-3]+ts_str[-2:]
-
-        if ts_str:
-            time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
-        else:
-            time = datetime.now(timezone.utc)
-
        yield Link(
            url=htmldecode(url),
-            timestamp=str(time.timestamp()),
+            timestamp=str(time),
            title=htmldecode(title) or None,
            tags=htmldecode(tags) or None,
            sources=[rss_file.name],
--- a/pyproject.toml
+++ b/pyproject.toml
@ -15,6 +15,7 @@ dependencies = [
    "dateparser>=1.0.0",
    "django-extensions>=3.0.3",
    "django>=3.1.3,<3.2",
+    "feedparser>=6.0.11",
    "ipython>5.0.0",
    "mypy-extensions>=0.4.3",
    "python-crontab>=2.5.1",
--- a/tests/mock_server/templates/example.atom
+++ b/tests/mock_server/templates/example.atom
@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="utf-8"?>
+<feed
+ xml:lang="en"
+ xmlns="http://www.w3.org/2005/Atom"
+>
+ <id>http://www.example.com/</id>
+ <title>Example of an Atom feed</title>
+ <link rel="self" type="application/atom+xml" href="http://www.example.com/index.atom" />
+ <link rel="alternate" type="text/html" href="http://www.example.com/" />
+ <author>
+  <name>Jim Winstead</name>
+ </author>
+ <updated>2024-02-26T03:18:26Z</updated>
+ <entry>
+  <title>Example</title>
+  <link rel="alternate" type="text/html" href="http://127.0.0.1:8080/static/example.com.html" />
+  <id>tag:example.com,2024-02-25:3319</id>
+  <updated>2024-02-26T03:18:26Z</updated>
+  <published>2024-02-25T19:18:25-08:00</published>
+  <category term="Tag1" scheme="http://example.com/archive" />
+  <category term="Tag2" scheme="http://example.com/archive" />
+  <content type="html">This is some &lt;b&gt;content&lt;/b&gt;</content>
+ </entry>
+</feed>
--- a/tests/mock_server/templates/example.rss
+++ b/tests/mock_server/templates/example.rss
@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="utf-8"?>
+<rss version="2.0"
+     xmlns:dc="http://purl.org/dc/elements/1.1/"
+     xmlns:admin="http://webns.net/mvcb/"
+     xmlns:content="http://purl.org/rss/1.0/modules/content/"
+     xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
+<channel>
+  <title>Sample Feed</title>
+  <link>http://example.org/</link>
+  <description>For documentation only</description>
+  <dc:language>en-us</dc:language>
+  <dc:creator>Nobody (nobody@example.org)</dc:creator>
+  <dc:rights>Public domain</dc:rights>
+  <dc:date>2024-02-26T17:28:12-08:00</dc:date>
+  <admin:generatorAgent rdf:resource="http://www.example.org/"/>
+  <admin:errorReportsTo rdf:resource="mailto:nobody@example.org"/>
+
+  <item>
+    <title>First!</title>
+    <link>http://127.0.0.1:8080/static/example.com.html</link>
+    <guid isPermaLink="false">just-an@example.org</guid>
+    <description>
+      This has a description.
+    </description>
+    <dc:subject>Tag1 Tag2</dc:subject>
+    <dc:date>2024-02-26T17:28:12-08:00</dc:date>
+    <content:encoded><![CDATA[
+      This has a <b>description</b>.]]>
+    </content:encoded>
+  </item>
+</channel>
+</rss>
--- a/tests/test_add.py
+++ b/tests/test_add.py
@ -148,3 +148,71 @@ def test_json_with_leading_garbage(tmp_path, process, disable_extractors_dict):
    tags = list(map(lambda x: x[0], tags))
    assert "Tag1" in tags
    assert "Tag2" in tags
+
+def test_generic_rss(tmp_path, process, disable_extractors_dict):
+    with open('../../mock_server/templates/example.rss', 'r', encoding='utf-8') as f:
+        arg_process = subprocess.run(
+            ["archivebox", "add", "--index-only", "--parser=rss"],
+            stdin=f,
+            capture_output=True,
+            env=disable_extractors_dict,
+        )
+
+    conn = sqlite3.connect("index.sqlite3")
+    c = conn.cursor()
+    urls = c.execute("SELECT url from core_snapshot").fetchall()
+    tags = c.execute("SELECT name from core_tag").fetchall()
+    conn.commit()
+    conn.close()
+
+    urls = list(map(lambda x: x[0], urls))
+    assert "http://127.0.0.1:8080/static/example.com.html" in urls
+    # if the following URL appears, we must have fallen back to another parser
+    assert not "http://purl.org/dc/elements/1.1/" in urls
+
+    tags = list(map(lambda x: x[0], tags))
+    assert "Tag1 Tag2" in tags
+
+def test_pinboard_rss(tmp_path, process, disable_extractors_dict):
+    with open('../../mock_server/templates/example.rss', 'r', encoding='utf-8') as f:
+        arg_process = subprocess.run(
+            ["archivebox", "add", "--index-only", "--parser=pinboard_rss"],
+            stdin=f,
+            capture_output=True,
+            env=disable_extractors_dict,
+        )
+
+    conn = sqlite3.connect("index.sqlite3")
+    c = conn.cursor()
+    tags = c.execute("SELECT name from core_tag").fetchall()
+    conn.commit()
+    conn.close()
+
+    tags = list(map(lambda x: x[0], tags))
+    assert "Tag1" in tags
+    assert "Tag2" in tags
+
+def test_atom(tmp_path, process, disable_extractors_dict):
+    with open('../../mock_server/templates/example.atom', 'r', encoding='utf-8') as f:
+        arg_process = subprocess.run(
+            ["archivebox", "add", "--index-only", "--parser=rss"],
+            stdin=f,
+            capture_output=True,
+            env=disable_extractors_dict,
+        )
+
+    conn = sqlite3.connect("index.sqlite3")
+    c = conn.cursor()
+    urls = c.execute("SELECT url from core_snapshot").fetchall()
+    tags = c.execute("SELECT name from core_tag").fetchall()
+    conn.commit()
+    conn.close()
+
+    urls = list(map(lambda x: x[0], urls))
+    assert "http://127.0.0.1:8080/static/example.com.html" in urls
+    # if the following URL appears, we must have fallen back to another parser
+    assert not "http://www.w3.org/2005/Atom" in urls
+
+    tags = list(map(lambda x: x[0], tags))
+    assert "Tag1" in tags
+    assert "Tag2" in tags