1
0
Fork 0
mirror of synced 2024-05-15 18:02:46 +12:00

Use feedparser for RSS parsing in generic_rss and pinboard_rss parsers

The feedparser package has 20 years of history and is very good at parsing
RSS and Atom, so use that instead of ad-hoc regex and XML parsing.

The medium_rss and shaarli_rss parsers weren't touched because they are
probably unnecessary. (The special parser for pinboard is only needed because
of how its tags work.)

Doesn't include tests because I haven't figured out how to run them in the
docker development setup.

Fixes #1171
This commit is contained in:
jim winstead 2024-02-25 12:34:51 -08:00
parent 7b042c854a
commit 9f462a87a8
3 changed files with 34 additions and 50 deletions

View file

@ -2,13 +2,13 @@ __package__ = 'archivebox.parsers'
from typing import IO, Iterable from typing import IO, Iterable
from datetime import datetime from time import mktime
from feedparser import parse as feedparser
from ..index.schema import Link from ..index.schema import Link
from ..util import ( from ..util import (
htmldecode, htmldecode,
enforce_types, enforce_types
str_between,
) )
@enforce_types @enforce_types
@ -16,35 +16,27 @@ def parse_generic_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
"""Parse RSS XML-format files into links""" """Parse RSS XML-format files into links"""
rss_file.seek(0) rss_file.seek(0)
items = rss_file.read().split('<item>') feed = feedparser(rss_file.read())
items = items[1:] if items else [] for item in feed.entries:
for item in items: url = item.link
# example item: title = item.title
# <item> time = mktime(item.updated_parsed)
# <title><![CDATA[How JavaScript works: inside the V8 engine]]></title>
# <category>Unread</category>
# <link>https://blog.sessionstack.com/how-javascript-works-inside</link>
# <guid>https://blog.sessionstack.com/how-javascript-works-inside</guid>
# <pubDate>Mon, 21 Aug 2017 14:21:58 -0500</pubDate>
# </item>
trailing_removed = item.split('</item>', 1)[0] try:
leading_removed = trailing_removed.split('<item>', 1)[-1].strip() tags = ','.join(map(lambda tag: tag.term, item.tags))
rows = leading_removed.split('\n') except AttributeError:
tags = ''
def get_row(key): if url is None:
return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0] # Yielding a Link with no URL will
# crash on a URL validation assertion
url = str_between(get_row('link'), '<link>', '</link>') continue
ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")
title = str_between(get_row('title'), '<![CDATA[', ']]').strip()
yield Link( yield Link(
url=htmldecode(url), url=htmldecode(url),
timestamp=str(time.timestamp()), timestamp=str(time),
title=htmldecode(title) or None, title=htmldecode(title) or None,
tags=None, tags=tags,
sources=[rss_file.name], sources=[rss_file.name],
) )

View file

@ -2,50 +2,41 @@ __package__ = 'archivebox.parsers'
from typing import IO, Iterable from typing import IO, Iterable
from datetime import datetime, timezone from time import mktime
from feedparser import parse as feedparser
from xml.etree import ElementTree
from ..index.schema import Link from ..index.schema import Link
from ..util import ( from ..util import (
htmldecode, htmldecode,
enforce_types, enforce_types
) )
@enforce_types @enforce_types
def parse_pinboard_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]: def parse_pinboard_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
"""Parse Pinboard RSS feed files into links""" """Parse Pinboard RSS feed files into links"""
rss_file.seek(0) rss_file.seek(0)
root = ElementTree.parse(rss_file).getroot() feed = feedparser(rss_file.read())
items = root.findall("{http://purl.org/rss/1.0/}item") for item in feed.entries:
for item in items: url = item.link
find = lambda p: item.find(p).text.strip() if item.find(p) is not None else None # type: ignore # title will start with "[priv] " if pin was marked private. useful?
title = item.title
time = mktime(item.updated_parsed)
url = find("{http://purl.org/rss/1.0/}link") # all tags are in one entry.tags with spaces in it. annoying!
tags = find("{http://purl.org/dc/elements/1.1/}subject") try:
title = find("{http://purl.org/rss/1.0/}title") tags = item.tags[0].term.replace(' ', ',')
ts_str = find("{http://purl.org/dc/elements/1.1/}date") except AttributeError:
tags = ''
if url is None: if url is None:
# Yielding a Link with no URL will # Yielding a Link with no URL will
# crash on a URL validation assertion # crash on a URL validation assertion
continue continue
# Pinboard includes a colon in its date stamp timezone offsets, which
# Python can't parse. Remove it:
if ts_str and ts_str[-3:-2] == ":":
ts_str = ts_str[:-3]+ts_str[-2:]
if ts_str:
time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
else:
time = datetime.now(timezone.utc)
yield Link( yield Link(
url=htmldecode(url), url=htmldecode(url),
timestamp=str(time.timestamp()), timestamp=str(time),
title=htmldecode(title) or None, title=htmldecode(title) or None,
tags=htmldecode(tags) or None, tags=htmldecode(tags) or None,
sources=[rss_file.name], sources=[rss_file.name],

View file

@ -15,6 +15,7 @@ dependencies = [
"dateparser>=1.0.0", "dateparser>=1.0.0",
"django-extensions>=3.0.3", "django-extensions>=3.0.3",
"django>=3.1.3,<3.2", "django>=3.1.3,<3.2",
"feedparser>=6.0.11",
"ipython>5.0.0", "ipython>5.0.0",
"mypy-extensions>=0.4.3", "mypy-extensions>=0.4.3",
"python-crontab>=2.5.1", "python-crontab>=2.5.1",