# Recovered from the git patch "add dedicated shaarli rss parser"
# (commit ca8f57ef5ca1da75907fbd197a254e3376ab1004, Nick Sweeting,
# Mon, 4 Feb 2019 21:18:42 -0800), which adds this function to
# archivebox/parse.py.  The same patch also:
#   * registers ('shaarli_rss', parse_shaarli_rss_export) in get_parsers(),
#     between the 'pinboard_rss' and 'medium_rss' entries;
#   * adds sys.stdout.write('.') as the first statement of the try-block in
#     fetch_page_title() in archivebox/util.py.
#
# NOTE(review): the patch text had every angle-bracket token stripped, so the
# tag literals below ('<entry>', '<title>', '<link href="', ...) are
# reconstructed from the row keys the code looks up -- confirm against the
# upstream commit.
#
# str_between, domain, base_url, fetch_page_title, get_link_type and datetime
# are expected to be in scope in the enclosing module (archivebox/parse.py).


def _first_row(rows, key):
    """Return the first already-stripped row that starts with ``<key``.

    Raises IndexError when no row matches, which is what the original
    list-indexing lookup did for entries missing the element.
    """
    matches = [row for row in rows if row.startswith('<{}'.format(key))]
    return matches[0]


def _parse_atom_timestamp(ts_str):
    """Parse a ``YYYY-MM-DDTHH:MM:SS+HH:MM`` timestamp into an aware datetime.

    Raises ValueError if the string does not match the expected format.
    """
    # strptime's %z only accepts a colon inside the UTC offset from
    # Python 3.7 on; drop it so "+00:00" parses on older interpreters too.
    if len(ts_str) >= 6 and ts_str[-3] == ':' and ts_str[-6] in '+-':
        ts_str = ts_str[:-3] + ts_str[-2:]
    return datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")


def parse_shaarli_rss_export(rss_file):
    """Parse Shaarli-specific RSS XML-format files into links.

    The feed is split textually on ``<entry>`` and each entry is scanned line
    by line; no XML parser is involved, so every element that is read
    (title, link, published) must sit on its own line.

    Args:
        rss_file: seekable text file object.  It is rewound with ``seek(0)``
            and read in full; its ``name`` attribute is recorded under
            ``'sources'`` for every link produced.

    Yields:
        dict: one per entry, with the keys ``url``, ``domain``, ``base_url``,
        ``timestamp`` (``str`` of the POSIX timestamp), ``tags`` (always
        ``''``), ``title``, ``sources`` and ``type``.

    Raises:
        IndexError: an entry has no row starting with ``<title``, ``<link``
            or ``<published``.
        ValueError: the published timestamp is not in the expected format.
    """

    rss_file.seek(0)
    # Everything before the first '<entry>' is feed-level metadata.
    entries = rss_file.read().split('<entry>')[1:]
    for entry in entries:
        # example entry (markup restored; the link target is illustrative):
        # <entry>
        #   <title>Aktuelle Trojaner-Welle: Emotet lauert in gefälschten Rechnungsmails | heise online</title>
        #   <link href="https://example.com/article.html" />
        #   <id>https://demo.shaarli.org/?cEV4vw</id>
        #   <published>2019-01-30T06:06:01+00:00</published>
        #   <updated>2019-01-30T06:06:01+00:00</updated>
        #   <content type="html" xml:lang="en"><![CDATA[... Permalink ...]]></content>
        # </entry>

        # Cut off whatever follows the closing tag (e.g. the feed footer
        # after the last entry), then work on whitespace-trimmed rows.
        entry_body = entry.split('</entry>', 1)[0].strip()
        rows = [row.strip() for row in entry_body.split('\n')]

        # str_between is presumably "text between the two markers" -- it is
        # defined outside this block.
        title = str_between(_first_row(rows, 'title'), '<title>', '</title>').strip()
        url = str_between(_first_row(rows, 'link'), '<link href="', '" />')
        ts_str = str_between(_first_row(rows, 'published'), '<published>', '</published>')
        published_at = _parse_atom_timestamp(ts_str)

        info = {
            'url': url,
            'domain': domain(url),
            'base_url': base_url(url),
            'timestamp': str(published_at.timestamp()),
            'tags': '',
            # An empty feed title falls back to fetching the page itself.
            'title': title or fetch_page_title(url),
            'sources': [rss_file.name],
        }
        info['type'] = get_link_type(info)

        yield info