diff --git a/archivebox/parse.py b/archivebox/parse.py
index 7314a543..ea9579b7 100644
--- a/archivebox/parse.py
+++ b/archivebox/parse.py
@@ -44,6 +44,7 @@ def get_parsers(file):
('bookmarks', parse_bookmarks_export),
('rss', parse_rss_export),
('pinboard_rss', parse_pinboard_rss_feed),
+ ('shaarli_rss', parse_shaarli_rss_export),
('medium_rss', parse_medium_rss_feed),
('plain_text', parse_plain_text),
])
@@ -167,6 +168,48 @@ def parse_rss_export(rss_file):
yield info
+def parse_shaarli_rss_export(rss_file):
+ """Parse Shaarli-specific RSS XML-format files into links"""
+
+ rss_file.seek(0)
+    entries = rss_file.read().split('<entry>')[1:]
+ for entry in entries:
+        # example entry:
+        # <entry>
+        #   <title>Aktuelle Trojaner-Welle: Emotet lauert in gefälschten Rechnungsmails | heise online</title>
+        #   <link href="https://www.heise.de/security/meldung/Aktuelle-Trojaner-Welle-Emotet-lauert-in-gefaelschten-Rechnungsmails-4279408.html" />
+        #   <id>https://demo.shaarli.org/?cEV4vw</id>
+        #   <published>2019-01-30T06:06:01+00:00</published>
+        #   <updated>2019-01-30T06:06:01+00:00</updated>
+        #   <content type="html" xml:lang="en"><![CDATA[&#8212; <a href="https://demo.shaarli.org/?cEV4vw">Permalink</a>]]></content>
+        # </entry>
+
+        trailing_removed = entry.split('</entry>', 1)[0]
+ leading_removed = trailing_removed.strip()
+ rows = leading_removed.split('\n')
+
+ def get_row(key):
+ return [r.strip() for r in rows if r.strip().startswith('<{}'.format(key))][0]
+
+        title = str_between(get_row('title'), '<title>', '</title>').strip()
+        url = str_between(get_row('link'), '<link href="', '" />')
+        ts_str = str_between(get_row('published'), '<published>', '</published>')
+ time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
+
+
+ info = {
+ 'url': url,
+ 'domain': domain(url),
+ 'base_url': base_url(url),
+ 'timestamp': str(time.timestamp()),
+ 'tags': '',
+ 'title': title or fetch_page_title(url),
+ 'sources': [rss_file.name],
+ }
+ info['type'] = get_link_type(info)
+
+ yield info
+
def parse_bookmarks_export(html_file):
"""Parse netscape-format bookmarks export files (produced by all browsers)"""
diff --git a/archivebox/util.py b/archivebox/util.py
index 3eda5103..2a81bda9 100644
--- a/archivebox/util.py
+++ b/archivebox/util.py
@@ -233,8 +233,8 @@ def fetch_page_title(url, default=True):
default = url
try:
+ sys.stdout.write('.')
html_content = urllib.request.urlopen(url, timeout=10).read().decode('utf-8')
-
         match = re.search('<title>(.*?)</title>', html_content)
return match.group(1) if match else default or None
except Exception: