1
0
Fork 0
mirror, last synced 2024-06-28 19:10:33 +12:00

Remove derivable link info (domain, base_url) from parsed links; compute it in derived_link_info instead

This commit is contained in:
Nick Sweeting 2019-02-22 12:56:36 -05:00
parent eb003f6a26
commit 09d79e55a0
2 changed files with 22 additions and 27 deletions

View file

@ -8,27 +8,22 @@ For examples of supported files see examples/.
Parsed link schema: { Parsed link schema: {
'url': 'https://example.com/example/?abc=123&xyc=345#lmnop', 'url': 'https://example.com/example/?abc=123&xyc=345#lmnop',
'domain': 'example.com',
'base_url': 'example.com/example/',
'timestamp': '15442123124234', 'timestamp': '15442123124234',
'tags': 'abc,def',
'title': 'Example.com Page Title', 'title': 'Example.com Page Title',
'tags': 'abc,def',
'sources': ['ril_export.html', 'downloads/getpocket.com.txt'], 'sources': ['ril_export.html', 'downloads/getpocket.com.txt'],
} }
""" """
import re import re
import sys
import json import json
from datetime import datetime
from collections import OrderedDict from collections import OrderedDict
import xml.etree.ElementTree as etree import xml.etree.ElementTree as etree
from datetime import datetime from config import ANSI
from config import ANSI, SHOW_PROGRESS
from util import ( from util import (
domain,
base_url,
str_between, str_between,
get_link_type, get_link_type,
URL_REGEX, URL_REGEX,
@ -90,8 +85,6 @@ def parse_pocket_html_export(html_file):
time = datetime.fromtimestamp(float(match.group(2))) time = datetime.fromtimestamp(float(match.group(2)))
info = { info = {
'url': fixed_url, 'url': fixed_url,
'domain': domain(fixed_url),
'base_url': base_url(fixed_url),
'timestamp': str(time.timestamp()), 'timestamp': str(time.timestamp()),
'tags': match.group(3), 'tags': match.group(3),
'title': match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '') or None, 'title': match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '') or None,
@ -127,8 +120,6 @@ def parse_pinboard_json_export(json_file):
title = erg['title'].strip() title = erg['title'].strip()
info = { info = {
'url': url, 'url': url,
'domain': domain(url),
'base_url': base_url(url),
'timestamp': timestamp, 'timestamp': timestamp,
'tags': erg.get('tags') or '', 'tags': erg.get('tags') or '',
'title': title or None, 'title': title or None,
@ -137,6 +128,7 @@ def parse_pinboard_json_export(json_file):
info['type'] = get_link_type(info) info['type'] = get_link_type(info)
yield info yield info
def parse_rss_export(rss_file): def parse_rss_export(rss_file):
"""Parse RSS XML-format files into links""" """Parse RSS XML-format files into links"""
@ -166,8 +158,6 @@ def parse_rss_export(rss_file):
info = { info = {
'url': url, 'url': url,
'domain': domain(url),
'base_url': base_url(url),
'timestamp': str(time.timestamp()), 'timestamp': str(time.timestamp()),
'tags': '', 'tags': '',
'title': title or None, 'title': title or None,
@ -177,6 +167,7 @@ def parse_rss_export(rss_file):
yield info yield info
def parse_shaarli_rss_export(rss_file): def parse_shaarli_rss_export(rss_file):
"""Parse Shaarli-specific RSS XML-format files into links""" """Parse Shaarli-specific RSS XML-format files into links"""
@ -207,8 +198,6 @@ def parse_shaarli_rss_export(rss_file):
info = { info = {
'url': url, 'url': url,
'domain': domain(url),
'base_url': base_url(url),
'timestamp': str(time.timestamp()), 'timestamp': str(time.timestamp()),
'tags': '', 'tags': '',
'title': title or None, 'title': title or None,
@ -218,6 +207,7 @@ def parse_shaarli_rss_export(rss_file):
yield info yield info
def parse_netscape_html_export(html_file): def parse_netscape_html_export(html_file):
"""Parse netscape-format bookmarks export files (produced by all browsers)""" """Parse netscape-format bookmarks export files (produced by all browsers)"""
@ -234,8 +224,6 @@ def parse_netscape_html_export(html_file):
info = { info = {
'url': url, 'url': url,
'domain': domain(url),
'base_url': base_url(url),
'timestamp': str(time.timestamp()), 'timestamp': str(time.timestamp()),
'tags': "", 'tags': "",
'title': match.group(3).strip() or None, 'title': match.group(3).strip() or None,
@ -267,8 +255,6 @@ def parse_pinboard_rss_export(rss_file):
time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z") time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
info = { info = {
'url': url, 'url': url,
'domain': domain(url),
'base_url': base_url(url),
'timestamp': str(time.timestamp()), 'timestamp': str(time.timestamp()),
'tags': tags, 'tags': tags,
'title': title or None, 'title': title or None,
@ -292,8 +278,6 @@ def parse_medium_rss_export(rss_file):
time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %Z") time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %Z")
info = { info = {
'url': url, 'url': url,
'domain': domain(url),
'base_url': base_url(url),
'timestamp': str(time.timestamp()), 'timestamp': str(time.timestamp()),
'tags': '', 'tags': '',
'title': title or None, 'title': title or None,
@ -316,8 +300,6 @@ def parse_plain_text_export(text_file):
url = url.strip() url = url.strip()
info = { info = {
'url': url, 'url': url,
'domain': domain(url),
'base_url': base_url(url),
'timestamp': str(datetime.now().timestamp()), 'timestamp': str(datetime.now().timestamp()),
'tags': '', 'tags': '',
'title': None, 'title': None,

View file

@ -559,18 +559,30 @@ def wget_output_path(link, look_in=None):
def derived_link_info(link): def derived_link_info(link):
"""extend link info with the archive urls and other derived data""" """extend link info with the archive urls and other derived data"""
url = link['url']
link_info = { link_info = {
**link, **link,
'title': link['title'] or url,
'date': datetime.fromtimestamp(Decimal(link['timestamp'])).strftime('%Y-%m-%d %H:%M'), 'date': datetime.fromtimestamp(Decimal(link['timestamp'])).strftime('%Y-%m-%d %H:%M'),
'google_favicon_url': 'https://www.google.com/s2/favicons?domain={domain}'.format(**link), 'base_url': base_url(url),
'domain': domain(url),
'basename': basename(url),
'path': path(url),
# Archive Method Output URLs
'favicon_url': 'archive/{timestamp}/favicon.ico'.format(**link), 'favicon_url': 'archive/{timestamp}/favicon.ico'.format(**link),
'google_favicon_url': 'https://www.google.com/s2/favicons?domain={domain}'.format(**link),
'files_url': 'archive/{timestamp}/index.html'.format(**link), 'files_url': 'archive/{timestamp}/index.html'.format(**link),
'archive_url': 'archive/{}/{}'.format(link['timestamp'], wget_output_path(link) or 'index.html'), 'archive_url': 'archive/{}/{}'.format(link['timestamp'], wget_output_path(link) or 'index.html'),
'warc_url': 'archive/{timestamp}/warc'.format(**link),
'pdf_link': 'archive/{timestamp}/output.pdf'.format(**link), 'pdf_link': 'archive/{timestamp}/output.pdf'.format(**link),
'screenshot_link': 'archive/{timestamp}/screenshot.png'.format(**link), 'screenshot_link': 'archive/{timestamp}/screenshot.png'.format(**link),
'dom_link': 'archive/{timestamp}/output.html'.format(**link), 'dom_link': 'archive/{timestamp}/output.html'.format(**link),
'archive_org_url': 'https://web.archive.org/web/{base_url}'.format(**link), 'archive_org_url': 'https://web.archive.org/web/{base_url}'.format(**link),
'title': link['title'] or link['url'], 'git_url': 'archive/{timestamp}/git'.format(**link),
'media_url': 'archive/{timestamp}/media'.format(**link),
} }
# PDF and images are handled slightly differently # PDF and images are handled slightly differently
@ -583,6 +595,7 @@ def derived_link_info(link):
'dom_link': 'archive/{timestamp}/{base_url}'.format(**link), 'dom_link': 'archive/{timestamp}/{base_url}'.format(**link),
'title': link['title'] or basename(link['url']), 'title': link['title'] or basename(link['url']),
}) })
return link_info return link_info