1
0
Fork 0
Mirror last synced 2024-06-28 02:50:24 +12:00

Readability URL and title fixes

This commit is contained in:
Nick Sweeting 2017-05-22 10:40:16 -05:00 committed by GitHub
parent 843d989382
commit 92f7b399ca

View file

@ -35,14 +35,16 @@ def parse_pocket_export(html):
for line in html: for line in html:
match = pattern.search(line) match = pattern.search(line)
if match: if match:
fixed_url = match.group(1).replace('http://www.readability.com/read?url=', '') # remove old readability prefixes to get original url
without_scheme = fixed_url.replace('http://', '').replace('https://', '')
yield { yield {
'url': match.group(1).replace('http://www.readability.com/read?url=', ''), 'url': fixed_url,
'domain': match.group(1).replace('http://', '').replace('https://', '').split('/')[0], 'domain': without_scheme.split('/')[0], # without pathname
'base_url': match.group(1).replace('https://', '').replace('http://', '').split('?')[0], 'base_url': without_scheme.split('?')[0], # without query args
'time': datetime.fromtimestamp(int(match.group(2))), 'time': datetime.fromtimestamp(int(match.group(2))),
'timestamp': match.group(2), 'timestamp': match.group(2),
'tags': match.group(3), 'tags': match.group(3),
'title': match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', ''), 'title': match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '') or without_scheme,
} }
def parse_pinboard_export(html): def parse_pinboard_export(html):
@ -51,13 +53,13 @@ def parse_pinboard_export(html):
if line: if line:
erg = line erg = line
yield { yield {
'url': erg['href'].replace('http://www.readability.com/read?url=', ''), 'url': erg['href'],
'domain': erg['href'].replace('http://', '').replace('https://', '').split('/')[0], 'domain': erg['href'].replace('http://', '').replace('https://', '').split('/')[0],
'base_url': erg['href'].replace('https://', '').replace('http://', '').split('?')[0], 'base_url': erg['href'].replace('https://', '').replace('http://', '').split('?')[0],
'time': datetime.fromtimestamp(time.mktime(time.strptime(erg['time'].split(',')[0],'%Y-%m-%dT%H:%M:%SZ'))), 'time': datetime.fromtimestamp(time.mktime(time.strptime(erg['time'].split(',')[0],'%Y-%m-%dT%H:%M:%SZ'))),
'timestamp': time.mktime(time.strptime(erg['time'].split(',')[0],'%Y-%m-%dT%H:%M:%SZ')), 'timestamp': time.mktime(time.strptime(erg['time'].split(',')[0],'%Y-%m-%dT%H:%M:%SZ')),
'tags': erg['tags'], 'tags': erg['tags'],
'title': erg['description'].replace(' — Readability', '').replace('http://www.readability.com/read?url=', ''), 'title': erg['description'].replace(' — Readability', ''),
} }
def dump_index(links, service): def dump_index(links, service):