1
0
Fork 0
Mirror of the upstream repository, synced 2024-06-02 18:44:59 +12:00

handle new wallabag export format with newlines mid-tag attributes

This commit is contained in:
Nick Sweeting 2022-05-09 19:07:42 -07:00
parent 808ae1a351
commit acd53c854d

View file

@@ -34,13 +34,19 @@ def parse_wallabag_atom_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
trailing_removed = entry.split('</entry>', 1)[0] trailing_removed = entry.split('</entry>', 1)[0]
leading_removed = trailing_removed.strip() leading_removed = trailing_removed.strip()
rows = leading_removed.split('\n') splits_fixed = leading_removed.replace('"\n href="', '" href="')
rows = splits_fixed.split('\n')
def get_row(key): def get_row(prefix):
return [r.strip() for r in rows if r.strip().startswith('<{}'.format(key))][0] return [
row.strip()
for row in rows
if row.strip().startswith('<{}'.format(prefix))
][0]
title = str_between(get_row('title'), '<title><![CDATA[', ']]></title>').strip() title = str_between(get_row('title'), '<title><![CDATA[', ']]></title>').strip()
url = str_between(get_row('link rel="via"'), '<link rel="via">', '</link>') url_inside_link = str_between(get_row('link rel="via"'), '<link rel="via">', '</link>')
url_inside_attr = str_between(get_row('link rel="via"'), 'href="', '"/>')
ts_str = str_between(get_row('published'), '<published>', '</published>') ts_str = str_between(get_row('published'), '<published>', '</published>')
time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z") time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
try: try:
@@ -49,7 +55,7 @@ def parse_wallabag_atom_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
tags = None tags = None
yield Link( yield Link(
url=htmldecode(url), url=htmldecode(url_inside_attr or url_inside_link),
timestamp=str(time.timestamp()), timestamp=str(time.timestamp()),
title=htmldecode(title) or None, title=htmldecode(title) or None,
tags=tags or '', tags=tags or '',