diff --git a/archivebox/parsers/generic_rss.py b/archivebox/parsers/generic_rss.py index 4bd04967..005da688 100644 --- a/archivebox/parsers/generic_rss.py +++ b/archivebox/parsers/generic_rss.py @@ -2,13 +2,13 @@ __package__ = 'archivebox.parsers' from typing import IO, Iterable -from datetime import datetime +from time import mktime +from feedparser import parse as feedparser from ..index.schema import Link from ..util import ( htmldecode, - enforce_types, - str_between, + enforce_types ) @enforce_types @@ -16,35 +16,27 @@ def parse_generic_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]: """Parse RSS XML-format files into links""" rss_file.seek(0) - items = rss_file.read().split('') - items = items[1:] if items else [] - for item in items: - # example item: - # - # <![CDATA[How JavaScript works: inside the V8 engine]]> - # Unread - # https://blog.sessionstack.com/how-javascript-works-inside - # https://blog.sessionstack.com/how-javascript-works-inside - # Mon, 21 Aug 2017 14:21:58 -0500 - # + feed = feedparser(rss_file.read()) + for item in feed.entries: + url = item.link + title = item.title + time = mktime(item.updated_parsed) - trailing_removed = item.split('', 1)[0] - leading_removed = trailing_removed.split('', 1)[-1].strip() - rows = leading_removed.split('\n') + try: + tags = ','.join(map(lambda tag: tag.term, item.tags)) + except AttributeError: + tags = '' - def get_row(key): - return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0] - - url = str_between(get_row('link'), '', '') - ts_str = str_between(get_row('pubDate'), '', '') - time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z") - title = str_between(get_row('title'), ' Iterable[Link]: """Parse Pinboard RSS feed files into links""" rss_file.seek(0) - root = ElementTree.parse(rss_file).getroot() - items = root.findall("{http://purl.org/rss/1.0/}item") - for item in items: - find = lambda p: item.find(p).text.strip() if item.find(p) is not None else None # type: ignore + feed = feedparser(rss_file.read()) + for item in feed.entries: + url = item.link + # title will start with "[priv] " if pin was marked private. useful? + title = item.title + time = mktime(item.updated_parsed) - url = find("{http://purl.org/rss/1.0/}link") - tags = find("{http://purl.org/dc/elements/1.1/}subject") - title = find("{http://purl.org/rss/1.0/}title") - ts_str = find("{http://purl.org/dc/elements/1.1/}date") + # all tags are in one entry.tags with spaces in it. annoying! + try: + tags = item.tags[0].term.replace(' ', ',') + except AttributeError: + tags = '' if url is None: # Yielding a Link with no URL will # crash on a URL validation assertion continue - # Pinboard includes a colon in its date stamp timezone offsets, which - # Python can't parse. Remove it: - if ts_str and ts_str[-3:-2] == ":": - ts_str = ts_str[:-3]+ts_str[-2:] - - if ts_str: - time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z") - else: - time = datetime.now(timezone.utc) - yield Link( url=htmldecode(url), - timestamp=str(time.timestamp()), + timestamp=str(time), title=htmldecode(title) or None, tags=htmldecode(tags) or None, sources=[rss_file.name], diff --git a/pyproject.toml b/pyproject.toml index f5f7dc4b..eedea90c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,7 @@ dependencies = [ "dateparser>=1.0.0", "django-extensions>=3.0.3", "django>=3.1.3,<3.2", + "feedparser>=6.0.11", "ipython>5.0.0", "mypy-extensions>=0.4.3", "python-crontab>=2.5.1", diff --git a/tests/mock_server/templates/example.atom b/tests/mock_server/templates/example.atom new file mode 100644 index 00000000..9d71abb1 --- /dev/null +++ b/tests/mock_server/templates/example.atom @@ -0,0 +1,24 @@ + + + http://www.example.com/ + Example of an Atom feed + + + + Jim Winstead + + 2024-02-26T03:18:26Z + + Example + + tag:example.com,2024-02-25:3319 + 2024-02-26T03:18:26Z + 2024-02-25T19:18:25-08:00 + + + This is some <b>content</b> + + diff --git a/tests/mock_server/templates/example.rss b/tests/mock_server/templates/example.rss new file mode 100644 index 00000000..d47a5a38 --- /dev/null +++ b/tests/mock_server/templates/example.rss @@ -0,0 +1,32 @@ + + + + Sample Feed + http://example.org/ + For documentation only + en-us + Nobody (nobody@example.org) + Public domain + 2024-02-26T17:28:12-08:00 + + + + + First! + http://127.0.0.1:8080/static/example.com.html + just-an@example.org + + This has a description. + + Tag1 Tag2 + 2024-02-26T17:28:12-08:00 + description.]]> + + + + diff --git a/tests/test_add.py b/tests/test_add.py index dd1307bb..972db2e8 100644 --- a/tests/test_add.py +++ b/tests/test_add.py @@ -148,3 +148,71 @@ def test_json_with_leading_garbage(tmp_path, process, disable_extractors_dict): tags = list(map(lambda x: x[0], tags)) assert "Tag1" in tags assert "Tag2" in tags + +def test_generic_rss(tmp_path, process, disable_extractors_dict): + with open('../../mock_server/templates/example.rss', 'r', encoding='utf-8') as f: + arg_process = subprocess.run( + ["archivebox", "add", "--index-only", "--parser=rss"], + stdin=f, + capture_output=True, + env=disable_extractors_dict, + ) + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + urls = c.execute("SELECT url from core_snapshot").fetchall() + tags = c.execute("SELECT name from core_tag").fetchall() + conn.commit() + conn.close() + + urls = list(map(lambda x: x[0], urls)) + assert "http://127.0.0.1:8080/static/example.com.html" in urls + # if the following URL appears, we must have fallen back to another parser + assert not "http://purl.org/dc/elements/1.1/" in urls + + tags = list(map(lambda x: x[0], tags)) + assert "Tag1 Tag2" in tags + +def test_pinboard_rss(tmp_path, process, disable_extractors_dict): + with open('../../mock_server/templates/example.rss', 'r', encoding='utf-8') as f: + arg_process = subprocess.run( + ["archivebox", "add", "--index-only", "--parser=pinboard_rss"], + stdin=f, + capture_output=True, + env=disable_extractors_dict, + ) + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + tags = c.execute("SELECT name from core_tag").fetchall() + conn.commit() + conn.close() + + tags = list(map(lambda x: x[0], tags)) + assert "Tag1" in tags + assert "Tag2" in tags + +def test_atom(tmp_path, process, disable_extractors_dict): + with open('../../mock_server/templates/example.atom', 'r', encoding='utf-8') as f: + arg_process = subprocess.run( + ["archivebox", "add", "--index-only", "--parser=rss"], + stdin=f, + capture_output=True, + env=disable_extractors_dict, + ) + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + urls = c.execute("SELECT url from core_snapshot").fetchall() + tags = c.execute("SELECT name from core_tag").fetchall() + conn.commit() + conn.close() + + urls = list(map(lambda x: x[0], urls)) + assert "http://127.0.0.1:8080/static/example.com.html" in urls + # if the following URL appears, we must have fallen back to another parser + assert not "http://www.w3.org/2005/Atom" in urls + + tags = list(map(lambda x: x[0], tags)) + assert "Tag1" in tags + assert "Tag2" in tags