# coding: utf-8
"""
Everything related to parsing links from bookmark services.
For a list of supported services, see the README.md.
For examples of supported files see examples/.
Parsed link schema: {
'url': 'https://example.com/example/?abc=123&xyc=345#lmnop',
'domain': 'example.com',
'base_url': 'example.com/example/',
'timestamp': '15442123124234',
'tags': 'abc,def',
'title': 'Example.com Page Title',
'sources': ['ril_export.html', 'downloads/getpocket.com.txt'],
}
"""
import re
import sys
import json
import urllib
from collections import OrderedDict
import xml.etree.ElementTree as etree
from datetime import datetime
from config import ANSI, SHOW_PROGRESS
from util import (
domain,
base_url,
str_between,
get_link_type,
fetch_page_title,
URL_REGEX,
)
def get_parsers(file):
    """Build the ordered registry of bookmark-export parsers.

    The `file` argument is accepted but not inspected here: every known
    parser is always returned. Insertion order is preserved, so iterating
    the result yields the parsers in the order listed below.

    Returns an OrderedDict mapping a human-readable format name to the
    function that parses that format.
    """
    registry = OrderedDict()
    registry['Pocket HTML'] = parse_pocket_html_export
    registry['Pinboard JSON'] = parse_pinboard_json_export
    registry['Netscape HTML'] = parse_netscape_html_export
    registry['RSS'] = parse_rss_export
    registry['Pinboard RSS'] = parse_pinboard_rss_export
    registry['Shaarli RSS'] = parse_shaarli_rss_export
    registry['Medium RSS'] = parse_medium_rss_export
    registry['Plain Text'] = parse_plain_text_export
    return registry
def parse_links(path):
    """Read a bookmark export file and extract its links.

    Opens `path` as UTF-8 text, announces the parse on stdout, then hands the
    open file to each parser from get_parsers() in turn, stopping at the
    first parser that produces any links.

    Returns a `(links, parser_name)` tuple. `parser_name` is the name of the
    last parser attempted: the matching format when `links` is non-empty,
    and simply the final parser tried when nothing matched.
    """
    links = []
    with open(path, 'r', encoding='utf-8') as file:
        started_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        # keep only the file name, i.e. whatever follows the last '/'
        source_name = path.rpartition('/')[2]
        banner = '{green}[*] [{}] Parsing new links from output/sources/{} and fetching titles...{reset}'
        print(banner.format(started_at, source_name, **ANSI))

        if SHOW_PROGRESS:
            sys.stdout.write(' ')

        for parser_name, parser_func in get_parsers(file).items():
            try:
                # Materialize the whole result before merging it, so a parser
                # that fails partway through contributes no links at all.
                parsed = list(parser_func(file))
            except Exception:
                # Parsers are expected to raise on formats they do not
                # support, so a failure just means "try the next one".
                # To see why a format was rejected, log the exception here.
                continue

            links.extend(parsed)
            if links:
                # first parser that yields anything wins
                break

    return links, parser_name
def parse_pocket_html_export(html_file):
"""Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)"""
html_file.seek(0)
pattern = re.compile("^\\s*