1
0
Fork 0
mirror of synced 2024-05-15 09:52:30 +12:00

Use feedparser for RSS parsing (#1362)

Fixes #1171
Fixes #870 (probably; would need to test against a Wallabag Atom file to be sure)
Fixes #135
Fixes #123
Fixes #106
This commit is contained in:
Nick Sweeting 2024-03-14 01:51:45 -07:00 committed by GitHub
commit 099f7d00fe
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 158 additions and 50 deletions

View file

__package__ = 'archivebox.parsers'

from typing import IO, Iterable
from time import mktime
from datetime import datetime, timezone

from feedparser import parse as feedparser

from ..index.schema import Link
from ..util import (
    htmldecode,
    enforce_types,
)


@enforce_types
def parse_generic_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
    """Parse RSS/Atom XML-format files into Links using feedparser.

    Args:
        rss_file: open text handle positioned anywhere in the feed XML
            (it is rewound before reading).

    Yields:
        One Link per feed entry that has a URL; entries without a URL are
        skipped because yielding a Link with no URL would crash on a URL
        validation assertion downstream.
    """
    rss_file.seek(0)
    feed = feedparser(rss_file.read())
    for item in feed.entries:
        # entries are not guaranteed to carry link/title/date fields,
        # so guard every attribute access instead of crashing mid-feed
        url = getattr(item, 'link', None)
        if not url:
            continue

        title = getattr(item, 'title', None)

        # fall back to "now" when the feed omits a usable date
        # (same fallback the old pinboard_rss parser used)
        time_struct = getattr(item, 'updated_parsed', None) or getattr(item, 'published_parsed', None)
        time = mktime(time_struct) if time_struct else datetime.now(timezone.utc).timestamp()

        # entries with no categories have no .tags attribute at all
        tags = ','.join(tag.term for tag in getattr(item, 'tags', []))

        yield Link(
            url=htmldecode(url),
            timestamp=str(time),
            title=htmldecode(title) or None,
            tags=tags,
            sources=[rss_file.name],
        )

View file

__package__ = 'archivebox.parsers'

from typing import IO, Iterable
from time import mktime
from datetime import datetime, timezone

from feedparser import parse as feedparser

from ..index.schema import Link
from ..util import (
    htmldecode,
    enforce_types,
)


@enforce_types
def parse_pinboard_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
    """Parse Pinboard RSS feed files into Links using feedparser.

    Args:
        rss_file: open text handle for the Pinboard RSS export
            (it is rewound before reading).

    Yields:
        One Link per feed entry that has a URL; entries without a URL are
        skipped because yielding a Link with no URL would crash on a URL
        validation assertion downstream.
    """
    rss_file.seek(0)
    feed = feedparser(rss_file.read())
    for item in feed.entries:
        url = getattr(item, 'link', None)
        if not url:
            continue

        # title will start with "[priv] " if pin was marked private. useful?
        title = getattr(item, 'title', None)

        # fall back to "now" when the feed omits a usable date
        # (matches the old ElementTree-based parser's behavior)
        time_struct = getattr(item, 'updated_parsed', None)
        time = mktime(time_struct) if time_struct else datetime.now(timezone.utc).timestamp()

        # Pinboard puts every tag into one entry.tags term, separated by
        # spaces. An entry with no tags may lack .tags entirely
        # (AttributeError) or have an empty list (IndexError) — the old
        # code only caught AttributeError and crashed on the latter.
        try:
            tags = item.tags[0].term.replace(' ', ',')
        except (AttributeError, IndexError):
            tags = ''

        yield Link(
            url=htmldecode(url),
            timestamp=str(time),
            title=htmldecode(title) or None,
            tags=htmldecode(tags) or None,
            sources=[rss_file.name],
        )
View file

@ -15,6 +15,7 @@ dependencies = [
"dateparser>=1.0.0", "dateparser>=1.0.0",
"django-extensions>=3.0.3", "django-extensions>=3.0.3",
"django>=3.1.3,<3.2", "django>=3.1.3,<3.2",
"feedparser>=6.0.11",
"ipython>5.0.0", "ipython>5.0.0",
"mypy-extensions>=0.4.3", "mypy-extensions>=0.4.3",
"python-crontab>=2.5.1", "python-crontab>=2.5.1",

View file

@ -0,0 +1,24 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- Test fixture: minimal Atom feed served by the mock server.
     test_atom asserts the entry link below ends up as a snapshot URL
     and that both category terms (Tag1, Tag2) become tags. -->
<feed
xml:lang="en"
xmlns="http://www.w3.org/2005/Atom"
>
<id>http://www.example.com/</id>
<title>Example of an Atom feed</title>
<link rel="self" type="application/atom+xml" href="http://www.example.com/index.atom" />
<link rel="alternate" type="text/html" href="http://www.example.com/" />
<author>
<name>Jim Winstead</name>
</author>
<updated>2024-02-26T03:18:26Z</updated>
<entry>
<title>Example</title>
<link rel="alternate" type="text/html" href="http://127.0.0.1:8080/static/example.com.html" />
<id>tag:example.com,2024-02-25:3319</id>
<updated>2024-02-26T03:18:26Z</updated>
<published>2024-02-25T19:18:25-08:00</published>
<category term="Tag1" scheme="http://example.com/archive" />
<category term="Tag2" scheme="http://example.com/archive" />
<content type="html">This is some &lt;b&gt;content&lt;/b&gt;</content>
</entry>
</feed>

View file

@ -0,0 +1,32 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- Test fixture: minimal RSS 2.0 feed with Dublin Core metadata.
     Used by test_generic_rss (expects the single space-separated
     dc:subject "Tag1 Tag2" as one tag) and test_pinboard_rss (expects
     it split into Tag1 and Tag2). -->
<rss version="2.0"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:admin="http://webns.net/mvcb/"
xmlns:content="http://purl.org/rss/1.0/modules/content/"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
<channel>
<title>Sample Feed</title>
<link>http://example.org/</link>
<description>For documentation only</description>
<dc:language>en-us</dc:language>
<dc:creator>Nobody (nobody@example.org)</dc:creator>
<dc:rights>Public domain</dc:rights>
<dc:date>2024-02-26T17:28:12-08:00</dc:date>
<admin:generatorAgent rdf:resource="http://www.example.org/"/>
<admin:errorReportsTo rdf:resource="mailto:nobody@example.org"/>
<item>
<title>First!</title>
<link>http://127.0.0.1:8080/static/example.com.html</link>
<guid isPermaLink="false">just-an@example.org</guid>
<description>
This has a description.
</description>
<dc:subject>Tag1 Tag2</dc:subject>
<dc:date>2024-02-26T17:28:12-08:00</dc:date>
<content:encoded><![CDATA[
This has a <b>description</b>.]]>
</content:encoded>
</item>
</channel>
</rss>

View file

@ -148,3 +148,71 @@ def test_json_with_leading_garbage(tmp_path, process, disable_extractors_dict):
tags = list(map(lambda x: x[0], tags)) tags = list(map(lambda x: x[0], tags))
assert "Tag1" in tags assert "Tag1" in tags
assert "Tag2" in tags assert "Tag2" in tags
def test_generic_rss(tmp_path, process, disable_extractors_dict):
    """Feed example.rss to `archivebox add --parser=rss` and verify the
    snapshot URL and the single space-separated tag land in the index."""
    with open('../../mock_server/templates/example.rss', 'r', encoding='utf-8') as f:
        add_process = subprocess.run(
            ["archivebox", "add", "--index-only", "--parser=rss"],
            stdin=f,
            capture_output=True,
            env=disable_extractors_dict,
        )
    # the old code captured the result but never checked it
    assert add_process.returncode == 0

    conn = sqlite3.connect("index.sqlite3")
    try:
        c = conn.cursor()
        urls = [row[0] for row in c.execute("SELECT url from core_snapshot").fetchall()]
        tags = [row[0] for row in c.execute("SELECT name from core_tag").fetchall()]
    finally:
        conn.close()

    assert "http://127.0.0.1:8080/static/example.com.html" in urls
    # if the following URL appears, we must have fallen back to another parser
    assert "http://purl.org/dc/elements/1.1/" not in urls

    # the generic parser keeps the dc:subject value as one tag
    assert "Tag1 Tag2" in tags
def test_pinboard_rss(tmp_path, process, disable_extractors_dict):
    """Feed example.rss to `archivebox add --parser=pinboard_rss` and verify
    the space-separated dc:subject is split into individual tags."""
    with open('../../mock_server/templates/example.rss', 'r', encoding='utf-8') as f:
        add_process = subprocess.run(
            ["archivebox", "add", "--index-only", "--parser=pinboard_rss"],
            stdin=f,
            capture_output=True,
            env=disable_extractors_dict,
        )
    # the old code captured the result but never checked it
    assert add_process.returncode == 0

    conn = sqlite3.connect("index.sqlite3")
    try:
        c = conn.cursor()
        tags = [row[0] for row in c.execute("SELECT name from core_tag").fetchall()]
    finally:
        conn.close()

    assert "Tag1" in tags
    assert "Tag2" in tags
def test_atom(tmp_path, process, disable_extractors_dict):
    """Feed example.atom to `archivebox add --parser=rss` and verify the
    entry URL and both category terms land in the index."""
    with open('../../mock_server/templates/example.atom', 'r', encoding='utf-8') as f:
        add_process = subprocess.run(
            ["archivebox", "add", "--index-only", "--parser=rss"],
            stdin=f,
            capture_output=True,
            env=disable_extractors_dict,
        )
    # the old code captured the result but never checked it
    assert add_process.returncode == 0

    conn = sqlite3.connect("index.sqlite3")
    try:
        c = conn.cursor()
        urls = [row[0] for row in c.execute("SELECT url from core_snapshot").fetchall()]
        tags = [row[0] for row in c.execute("SELECT name from core_tag").fetchall()]
    finally:
        conn.close()

    assert "http://127.0.0.1:8080/static/example.com.html" in urls
    # if the following URL appears, we must have fallen back to another parser
    assert "http://www.w3.org/2005/Atom" not in urls

    assert "Tag1" in tags
    assert "Tag2" in tags