From aebc83659d866a36e1f52e335568c1bd6d9dea8d Mon Sep 17 00:00:00 2001
From: Emmanuel Hainry
Date: Sun, 18 Oct 2020 11:20:07 +0200
Subject: [PATCH] Add parser for Wallabag Atom feeds

---
 archivebox/parsers/__init__.py      |  2 +
 archivebox/parsers/wallabag_atom.py | 65 +++++++++++++++++++++++++++++
 2 files changed, 67 insertions(+)
 create mode 100644 archivebox/parsers/wallabag_atom.py

diff --git a/archivebox/parsers/__init__.py b/archivebox/parsers/__init__.py
index 5d0d5ca5..42b2464e 100644
--- a/archivebox/parsers/__init__.py
+++ b/archivebox/parsers/__init__.py
@@ -33,6 +33,7 @@ from ..logging_util import TimedProgress, log_source_saved
 
 from .pocket_html import parse_pocket_html_export
 from .pinboard_rss import parse_pinboard_rss_export
+from .wallabag_atom import parse_wallabag_atom_export
 from .shaarli_rss import parse_shaarli_rss_export
 from .medium_rss import parse_medium_rss_export
 from .netscape_html import parse_netscape_html_export
@@ -43,6 +44,7 @@ from .generic_txt import parse_generic_txt_export
 
 PARSERS = (
     # Specialized parsers
+    ('Wallabag ATOM', parse_wallabag_atom_export),
     ('Pocket HTML', parse_pocket_html_export),
     ('Pinboard RSS', parse_pinboard_rss_export),
     ('Shaarli RSS', parse_shaarli_rss_export),
diff --git a/archivebox/parsers/wallabag_atom.py b/archivebox/parsers/wallabag_atom.py
new file mode 100644
index 00000000..0d77869f
--- /dev/null
+++ b/archivebox/parsers/wallabag_atom.py
@@ -0,0 +1,65 @@
+__package__ = 'archivebox.parsers'
+
+
+from typing import IO, Iterable
+from datetime import datetime
+
+from ..index.schema import Link
+from ..util import (
+    htmldecode,
+    enforce_types,
+    str_between,
+)
+
+
+@enforce_types
+def parse_wallabag_atom_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
+    """Parse a Wallabag Atom feed export into links.
+
+    The feed is scanned textually rather than with an XML parser: the file
+    is split on ``<entry>`` tags and each field is read from the first line
+    of the entry that starts with the corresponding tag.
+
+    Yields one Link per entry. An entry with no ``<title>``,
+    ``<link rel="via">`` or ``<published>`` line raises IndexError; an
+    entry with no ``<category>`` line simply gets empty tags.
+    """
+
+    rss_file.seek(0)
+    entries = rss_file.read().split('<entry>')[1:]
+    for entry in entries:
+        # example entry (abridged):
+        # <entry>
+        #     <title><![CDATA[Orient Ray vs Mako: Is There Much Difference? - iknowwatches.com]]></title>
+        #     <link rel="via">https://iknowwatches.com/orient-ray-vs-mako/</link>
+        #     <id>wallabag:wallabag.drycat.fr:milosh:entry:14041</id>
+        #     <updated>2020-10-18T09:14:02+02:00</updated>
+        #     <published>2020-10-18T09:13:56+02:00</published>
+        #     <category term="..." label="..." />
+        # </entry>
+
+        trailing_removed = entry.split('</entry>', 1)[0]
+        leading_removed = trailing_removed.strip()
+        rows = leading_removed.split('\n')
+
+        def get_row(key):
+            # first stripped line opening with `<key`; IndexError if absent
+            return [r.strip() for r in rows if r.strip().startswith('<{}'.format(key))][0]
+
+        title = str_between(get_row('title'), '<![CDATA[', ']]>').strip()
+        url = str_between(get_row('link rel="via"'), '<link rel="via">', '</link>')
+        ts_str = str_between(get_row('published'), '<published>', '</published>')
+        time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
+        try:
+            tags = str_between(get_row('category'), 'label="', '" />')
+        except IndexError:
+            # the category element is optional
+            tags = None
+
+        yield Link(
+            url=htmldecode(url),
+            timestamp=str(time.timestamp()),
+            title=htmldecode(title) or None,
+            tags=tags or '',
+            sources=[rss_file.name],
+        )