From 09d79e55a06b2a78866c7cd04a26cbd46b0dc8e5 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 22 Feb 2019 12:56:36 -0500 Subject: [PATCH] remove derivable link info from links --- archivebox/parse.py | 32 +++++++------------------------- archivebox/util.py | 17 +++++++++++++++-- 2 files changed, 22 insertions(+), 27 deletions(-) diff --git a/archivebox/parse.py b/archivebox/parse.py index ee8865f0..1ec6f0cc 100644 --- a/archivebox/parse.py +++ b/archivebox/parse.py @@ -8,27 +8,22 @@ For examples of supported files see examples/. Parsed link schema: { 'url': 'https://example.com/example/?abc=123&xyc=345#lmnop', - 'domain': 'example.com', - 'base_url': 'example.com/example/', 'timestamp': '15442123124234', - 'tags': 'abc,def', 'title': 'Example.com Page Title', + 'tags': 'abc,def', 'sources': ['ril_export.html', 'downloads/getpocket.com.txt'], } """ import re -import sys import json + +from datetime import datetime from collections import OrderedDict import xml.etree.ElementTree as etree -from datetime import datetime - -from config import ANSI, SHOW_PROGRESS +from config import ANSI from util import ( - domain, - base_url, str_between, get_link_type, URL_REGEX, @@ -90,8 +85,6 @@ def parse_pocket_html_export(html_file): time = datetime.fromtimestamp(float(match.group(2))) info = { 'url': fixed_url, - 'domain': domain(fixed_url), - 'base_url': base_url(fixed_url), 'timestamp': str(time.timestamp()), 'tags': match.group(3), 'title': match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '') or None, @@ -127,8 +120,6 @@ def parse_pinboard_json_export(json_file): title = erg['title'].strip() info = { 'url': url, - 'domain': domain(url), - 'base_url': base_url(url), 'timestamp': timestamp, 'tags': erg.get('tags') or '', 'title': title or None, @@ -137,6 +128,7 @@ def parse_pinboard_json_export(json_file): info['type'] = get_link_type(info) yield info + def parse_rss_export(rss_file): """Parse RSS XML-format files into links""" @@ -166,8 +158,6 @@ def parse_rss_export(rss_file): info = { 'url': url, - 'domain': domain(url), - 'base_url': base_url(url), 'timestamp': str(time.timestamp()), 'tags': '', 'title': title or None, @@ -177,6 +167,7 @@ def parse_rss_export(rss_file): yield info + def parse_shaarli_rss_export(rss_file): """Parse Shaarli-specific RSS XML-format files into links""" @@ -207,8 +198,6 @@ def parse_shaarli_rss_export(rss_file): info = { 'url': url, - 'domain': domain(url), - 'base_url': base_url(url), 'timestamp': str(time.timestamp()), 'tags': '', 'title': title or None, @@ -218,6 +207,7 @@ def parse_shaarli_rss_export(rss_file): yield info + def parse_netscape_html_export(html_file): """Parse netscape-format bookmarks export files (produced by all browsers)""" @@ -234,8 +224,6 @@ def parse_netscape_html_export(html_file): info = { 'url': url, - 'domain': domain(url), - 'base_url': base_url(url), 'timestamp': str(time.timestamp()), 'tags': "", 'title': match.group(3).strip() or None, @@ -267,8 +255,6 @@ def parse_pinboard_rss_export(rss_file): time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z") info = { 'url': url, - 'domain': domain(url), - 'base_url': base_url(url), 'timestamp': str(time.timestamp()), 'tags': tags, 'title': title or None, @@ -292,8 +278,6 @@ def parse_medium_rss_export(rss_file): time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %Z") info = { 'url': url, - 'domain': domain(url), - 'base_url': base_url(url), 'timestamp': str(time.timestamp()), 'tags': '', 'title': title or None, @@ -316,8 +300,6 @@ def parse_plain_text_export(text_file): url = url.strip() info = { 'url': url, - 'domain': domain(url), - 'base_url': base_url(url), 'timestamp': str(datetime.now().timestamp()), 'tags': '', 'title': None, diff --git a/archivebox/util.py b/archivebox/util.py index fcaa13b0..9c93a9fd 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -559,18 +559,30 @@ def wget_output_path(link, look_in=None): def derived_link_info(link): """extend link info with the archive urls and other derived data""" + url = link['url'] + link_info = { **link, + 'title': link['title'] or url, 'date': datetime.fromtimestamp(Decimal(link['timestamp'])).strftime('%Y-%m-%d %H:%M'), - 'google_favicon_url': 'https://www.google.com/s2/favicons?domain={domain}'.format(**link), + 'base_url': base_url(url), + 'domain': domain(url), + 'basename': basename(url), + 'path': path(url), + + # Archive Method Output URLs 'favicon_url': 'archive/{timestamp}/favicon.ico'.format(**link), + 'google_favicon_url': 'https://www.google.com/s2/favicons?domain={domain}'.format(**link), 'files_url': 'archive/{timestamp}/index.html'.format(**link), 'archive_url': 'archive/{}/{}'.format(link['timestamp'], wget_output_path(link) or 'index.html'), + 'warc_url': 'archive/{timestamp}/warc'.format(**link), 'pdf_link': 'archive/{timestamp}/output.pdf'.format(**link), 'screenshot_link': 'archive/{timestamp}/screenshot.png'.format(**link), 'dom_link': 'archive/{timestamp}/output.html'.format(**link), 'archive_org_url': 'https://web.archive.org/web/{base_url}'.format(**link), - 'title': link['title'] or link['url'], + 'git_url': 'archive/{timestamp}/git'.format(**link), + 'media_url': 'archive/{timestamp}/media'.format(**link), + } # PDF and images are handled slightly differently @@ -583,6 +595,7 @@ def derived_link_info(link): 'dom_link': 'archive/{timestamp}/{base_url}'.format(**link), 'title': link['title'] or basename(link['url']), }) + return link_info