diff --git a/archivebox/parsers/generic_rss.py b/archivebox/parsers/generic_rss.py
index 4bd04967..005da688 100644
--- a/archivebox/parsers/generic_rss.py
+++ b/archivebox/parsers/generic_rss.py
@@ -2,13 +2,13 @@ __package__ = 'archivebox.parsers'
from typing import IO, Iterable
-from datetime import datetime
+from time import mktime
+from feedparser import parse as feedparser
from ..index.schema import Link
from ..util import (
htmldecode,
- enforce_types,
- str_between,
+ enforce_types
)
@enforce_types
@@ -16,35 +16,27 @@ def parse_generic_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
"""Parse RSS XML-format files into links"""
rss_file.seek(0)
- items = rss_file.read().split('- ')
- items = items[1:] if items else []
- for item in items:
- # example item:
- #
-
- #
- # Unread
- # https://blog.sessionstack.com/how-javascript-works-inside
- # https://blog.sessionstack.com/how-javascript-works-inside
- # Mon, 21 Aug 2017 14:21:58 -0500
- #
+ feed = feedparser(rss_file.read())
+ for item in feed.entries:
+ url = item.link
+ title = item.title
+ time = mktime(item.updated_parsed)
- trailing_removed = item.split(' ', 1)[0]
- leading_removed = trailing_removed.split('- ', 1)[-1].strip()
- rows = leading_removed.split('\n')
+ try:
+ tags = ','.join(map(lambda tag: tag.term, item.tags))
+ except AttributeError:
+ tags = ''
- def get_row(key):
- return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0]
-
- url = str_between(get_row('link'), '', '')
- ts_str = str_between(get_row('pubDate'), '', '')
- time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")
- title = str_between(get_row('title'), ' Iterable[Link]:
"""Parse Pinboard RSS feed files into links"""
rss_file.seek(0)
- root = ElementTree.parse(rss_file).getroot()
- items = root.findall("{http://purl.org/rss/1.0/}item")
- for item in items:
- find = lambda p: item.find(p).text.strip() if item.find(p) is not None else None # type: ignore
+ feed = feedparser(rss_file.read())
+ for item in feed.entries:
+ url = item.link
+ # title will start with "[priv] " if pin was marked private. useful?
+ title = item.title
+ time = mktime(item.updated_parsed)
- url = find("{http://purl.org/rss/1.0/}link")
- tags = find("{http://purl.org/dc/elements/1.1/}subject")
- title = find("{http://purl.org/rss/1.0/}title")
- ts_str = find("{http://purl.org/dc/elements/1.1/}date")
+ # all tags are in one entry.tags with spaces in it. annoying!
+ try:
+ tags = item.tags[0].term.replace(' ', ',')
+ except AttributeError:
+ tags = ''
if url is None:
# Yielding a Link with no URL will
# crash on a URL validation assertion
continue
- # Pinboard includes a colon in its date stamp timezone offsets, which
- # Python can't parse. Remove it:
- if ts_str and ts_str[-3:-2] == ":":
- ts_str = ts_str[:-3]+ts_str[-2:]
-
- if ts_str:
- time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
- else:
- time = datetime.now(timezone.utc)
-
yield Link(
url=htmldecode(url),
- timestamp=str(time.timestamp()),
+ timestamp=str(time),
title=htmldecode(title) or None,
tags=htmldecode(tags) or None,
sources=[rss_file.name],
diff --git a/pyproject.toml b/pyproject.toml
index f5f7dc4b..eedea90c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -15,6 +15,7 @@ dependencies = [
"dateparser>=1.0.0",
"django-extensions>=3.0.3",
"django>=3.1.3,<3.2",
+ "feedparser>=6.0.11",
"ipython>5.0.0",
"mypy-extensions>=0.4.3",
"python-crontab>=2.5.1",
diff --git a/tests/mock_server/templates/example.atom b/tests/mock_server/templates/example.atom
new file mode 100644
index 00000000..9d71abb1
--- /dev/null
+++ b/tests/mock_server/templates/example.atom
@@ -0,0 +1,24 @@
+
+
+ http://www.example.com/
+ Example of an Atom feed
+
+
+
+ Jim Winstead
+
+ 2024-02-26T03:18:26Z
+
+ Example
+
+ tag:example.com,2024-02-25:3319
+ 2024-02-26T03:18:26Z
+ 2024-02-25T19:18:25-08:00
+
+
+ This is some <b>content</b>
+
+
diff --git a/tests/mock_server/templates/example.rss b/tests/mock_server/templates/example.rss
new file mode 100644
index 00000000..d47a5a38
--- /dev/null
+++ b/tests/mock_server/templates/example.rss
@@ -0,0 +1,32 @@
+
+
+
+ Sample Feed
+ http://example.org/
+ For documentation only
+ en-us
+ Nobody (nobody@example.org)
+ Public domain
+ 2024-02-26T17:28:12-08:00
+
+
+
+
-
+ First!
+ http://127.0.0.1:8080/static/example.com.html
+ just-an@example.org
+
+ This has a description.
+
+ Tag1 Tag2
+ 2024-02-26T17:28:12-08:00
+ description.]]>
+
+
+
+
diff --git a/tests/test_add.py b/tests/test_add.py
index dd1307bb..972db2e8 100644
--- a/tests/test_add.py
+++ b/tests/test_add.py
@@ -148,3 +148,71 @@ def test_json_with_leading_garbage(tmp_path, process, disable_extractors_dict):
tags = list(map(lambda x: x[0], tags))
assert "Tag1" in tags
assert "Tag2" in tags
+
+def test_generic_rss(tmp_path, process, disable_extractors_dict):
+    """Feed example.rss to `archivebox add --parser=rss`; check the indexed URL and tag."""
+    with open('../../mock_server/templates/example.rss', 'r', encoding='utf-8') as f:
+        subprocess.run(
+            ["archivebox", "add", "--index-only", "--parser=rss"],
+            stdin=f,
+            capture_output=True,
+            env=disable_extractors_dict,
+        )
+
+    conn = sqlite3.connect("index.sqlite3")
+    c = conn.cursor()
+    urls = c.execute("SELECT url from core_snapshot").fetchall()
+    tags = c.execute("SELECT name from core_tag").fetchall()
+    conn.close()  # read-only queries: nothing to commit before closing
+
+    urls = [row[0] for row in urls]
+    assert "http://127.0.0.1:8080/static/example.com.html" in urls
+    # if the following URL appears, we must have fallen back to another parser
+    assert "http://purl.org/dc/elements/1.1/" not in urls
+
+    tags = [row[0] for row in tags]
+    assert "Tag1 Tag2" in tags
+
+def test_pinboard_rss(tmp_path, process, disable_extractors_dict):
+    """Feed example.rss to the pinboard_rss parser; expect Tag1 and Tag2 as separate tags."""
+    with open('../../mock_server/templates/example.rss', 'r', encoding='utf-8') as feed_file:
+        arg_process = subprocess.run(
+            ["archivebox", "add", "--index-only", "--parser=pinboard_rss"],
+            stdin=feed_file,
+            capture_output=True,
+            env=disable_extractors_dict,
+        )
+
+    db = sqlite3.connect("index.sqlite3")
+    tag_rows = db.cursor().execute("SELECT name from core_tag").fetchall()
+    db.commit()
+    db.close()
+
+    tag_names = [row[0] for row in tag_rows]
+    assert "Tag1" in tag_names
+    assert "Tag2" in tag_names
+
+def test_atom(tmp_path, process, disable_extractors_dict):
+    """Feed example.atom to `archivebox add --parser=rss`; check the indexed URL and tags."""
+    with open('../../mock_server/templates/example.atom', 'r', encoding='utf-8') as f:
+        subprocess.run(
+            ["archivebox", "add", "--index-only", "--parser=rss"],
+            stdin=f,
+            capture_output=True,
+            env=disable_extractors_dict,
+        )
+
+    conn = sqlite3.connect("index.sqlite3")
+    c = conn.cursor()
+    urls = c.execute("SELECT url from core_snapshot").fetchall()
+    tags = c.execute("SELECT name from core_tag").fetchall()
+    conn.close()  # read-only queries: nothing to commit before closing
+
+    urls = [row[0] for row in urls]
+    assert "http://127.0.0.1:8080/static/example.com.html" in urls
+    # if the following URL appears, we must have fallen back to another parser
+    assert "http://www.w3.org/2005/Atom" not in urls
+
+    tags = [row[0] for row in tags]
+    assert "Tag1" in tags
+    assert "Tag2" in tags