
remove flawed link_type concept in favor of simpler staticfile detection

This commit is contained in:
Nick Sweeting 2019-03-20 21:11:29 -04:00
parent c79e1df8b2
commit 5ee1c39720
4 changed files with 107 additions and 79 deletions
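In practice the change replaces the enumerated link['type'] values with a single predicate on the URL. A rough sketch of what a call site looks like after this commit (the wrapper function is made up for illustration; is_static_file is the new helper added in util.py below):

    from util import is_static_file

    def needs_full_page_archive(link):   # hypothetical caller, for illustration only
        # before this commit: if link['type'] in ('PDF', 'image'): return False
        return not is_static_file(link['url'])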


@@ -224,6 +224,7 @@ def write_html_link_index(out_dir, link):
wget_output_path(link)
or (link['domain'] if link['is_archived'] else 'about:blank')
),
'extension': link['extension'] or 'HTML',
}))
chmod_file(path)


@@ -10,8 +10,8 @@ Parsed link schema: {
'url': 'https://example.com/example/?abc=123&xyc=345#lmnop',
'timestamp': '15442123124234',
'title': 'Example.com Page Title',
'tags': 'abc,def',
'sources': ['ril_export.html', 'downloads/getpocket.com.txt'],
'tags': 'abc,def',
}
"""
@@ -25,7 +25,6 @@ import xml.etree.ElementTree as etree
from config import ANSI
from util import (
str_between,
get_link_type,
URL_REGEX,
check_url_parsing,
)
@@ -69,17 +68,18 @@ def parse_pocket_html_export(html_file):
# <li><a href="http://example.com/ time_added="1478739709" tags="tag1,tag2">example title</a></li>
match = pattern.search(line)
if match:
fixed_url = match.group(1).replace('http://www.readability.com/read?url=', '') # remove old readability prefixes to get original url
url = match.group(1).replace('http://www.readability.com/read?url=', '') # remove old readability prefixes to get original url
time = datetime.fromtimestamp(float(match.group(2)))
info = {
'url': fixed_url,
tags = match.group(3)
title = match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '')
yield {
'url': url,
'timestamp': str(time.timestamp()),
'tags': match.group(3),
'title': match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '') or None,
'title': title or None,
'tags': tags or '',
'sources': [html_file.name],
}
info['type'] = get_link_type(info)
yield info
def parse_pinboard_json_export(json_file):
"""Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""
@@ -106,14 +106,14 @@ def parse_pinboard_json_export(json_file):
title = (erg.get('description') or '').replace(' — Readability', '')
else:
title = erg['title'].strip()
info = {
'url': url,
'timestamp': timestamp,
'tags': erg.get('tags') or '',
'title': title or None,
'tags': erg.get('tags') or '',
'sources': [json_file.name],
}
info['type'] = get_link_type(info)
yield info
@@ -144,16 +144,13 @@ def parse_rss_export(rss_file):
ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")
info = {
yield {
'url': url,
'timestamp': str(time.timestamp()),
'tags': '',
'title': title or None,
'tags': '',
'sources': [rss_file.name],
}
info['type'] = get_link_type(info)
yield info
def parse_shaarli_rss_export(rss_file):
@@ -184,16 +181,14 @@ def parse_shaarli_rss_export(rss_file):
ts_str = str_between(get_row('published'), '<published>', '</published>')
time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
info = {
yield {
'url': url,
'timestamp': str(time.timestamp()),
'tags': '',
'title': title or None,
'tags': '',
'sources': [rss_file.name],
}
info['type'] = get_link_type(info)
yield info
def parse_netscape_html_export(html_file):
"""Parse netscape-format bookmarks export files (produced by all browsers)"""
@@ -209,16 +204,14 @@ def parse_netscape_html_export(html_file):
url = match.group(1)
time = datetime.fromtimestamp(float(match.group(2)))
info = {
yield {
'url': url,
'timestamp': str(time.timestamp()),
'tags': "",
'title': match.group(3).strip() or None,
'tags': '',
'sources': [html_file.name],
}
info['type'] = get_link_type(info)
yield info
def parse_pinboard_rss_export(rss_file):
"""Parse Pinboard RSS feed files into links"""
@@ -237,18 +230,22 @@ def parse_pinboard_rss_export(rss_file):
# Pinboard includes a colon in its date stamp timezone offsets, which
# Python can't parse. Remove it:
if ":" == ts_str[-3:-2]:
if ts_str and ts_str[-3:-2] == ":":
ts_str = ts_str[:-3]+ts_str[-2:]
time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
info = {
if ts_str:
time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
else:
time = datetime.now()
yield {
'url': url,
'timestamp': str(time.timestamp()),
'tags': tags or '',
'title': title or None,
'tags': tags or '',
'sources': [rss_file.name],
}
info['type'] = get_link_type(info)
yield info
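The timezone tweak above exists because datetime.strptime's %z directive (prior to Python 3.7) rejects offsets written with a colon, which is exactly how Pinboard formats them. A standalone sketch of the same normalization, with an example timestamp:

    from datetime import datetime

    ts_str = '2019-03-20T21:11:29-04:00'        # example value in Pinboard's RSS format
    if ts_str and ts_str[-3:-2] == ':':
        ts_str = ts_str[:-3] + ts_str[-2:]      # '-04:00' -> '-0400'
    time = datetime.strptime(ts_str, '%Y-%m-%dT%H:%M:%S%z')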
def parse_medium_rss_export(rss_file):
"""Parse Medium RSS feed files into links"""
@@ -263,15 +260,14 @@
title = item.find("title").text.strip()
ts_str = item.find("pubDate").text
time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %Z")
info = {
yield {
'url': url,
'timestamp': str(time.timestamp()),
'tags': '',
'title': title or None,
'tags': '',
'sources': [rss_file.name],
}
info['type'] = get_link_type(info)
yield info
def parse_plain_text_export(text_file):
@@ -285,15 +281,15 @@
for url in urls:
url = url.strip()
info = {
time = datetime.now()
yield {
'url': url,
'timestamp': str(datetime.now().timestamp()),
'tags': '',
'timestamp': str(time.timestamp()),
'title': None,
'tags': '',
'sources': [text_file.name],
}
info['type'] = get_link_type(info)
yield info
PARSERS = OrderedDict([
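With get_link_type gone, a parser's whole contract is simply to yield dicts matching the schema at the top of this file. A minimal sketch of a conforming parser (the input format is invented purely for illustration):

    from datetime import datetime

    def parse_tsv_export(tsv_file):
        """Illustrative only: lines of 'url<TAB>title<TAB>tags'."""
        tsv_file.seek(0)
        for line in tsv_file:
            url, title, tags = (line.rstrip('\n').split('\t') + ['', ''])[:3]
            if not url.strip():
                continue
            yield {
                'url': url.strip(),
                'timestamp': str(datetime.now().timestamp()),
                'title': title.strip() or None,
                'tags': tags.strip(),
                'sources': [tsv_file.name],
            }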


@@ -194,8 +194,8 @@
Last updated: <small title="Timestamp: $updated">$updated_date</small>
</div>
<div class="col-lg-4 alert well">
Metadata:
<span class="badge badge-default">$type</span>
Type:
<span class="badge badge-default">$extension</span>
&nbsp; | &nbsp;
Tags:
<span class="badge badge-success">$tags</span>


@@ -70,6 +70,26 @@ HTML_TITLE_REGEX = re.compile(
r'(.[^<>]+)', # get everything up to these symbols
re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE,
)
STATICFILE_EXTENSIONS = {
# 99.999% of the time, URLs ending in these extensions are static files
# that can be downloaded as-is, not html pages that need to be rendered
'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp',
'svg', 'svgz', 'webp', 'ps', 'eps', 'ai',
'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v', 'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8',
'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx',
'atom', 'rss', 'css', 'js', 'json',
'dmg', 'iso', 'img',
'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z',
# Less common extensions to consider adding later
# jar, swf, bin, com, exe, dll, deb
# ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm,
# pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
# ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml
# These are always treated as pages, not as static files; never add them:
# html, htm, shtml, xhtml, xml, aspx, php, cgi
}
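Membership in this set is checked against the URL's lowercased path extension. The extension() helper used for that lives elsewhere in util.py and is not part of this hunk, so the version below only approximates its behaviour:

    from urllib.parse import urlparse

    def extension(url):
        """Lowercased extension of the URL's path, or '' if it has none (approximation)."""
        filename = urlparse(url).path.rsplit('/', 1)[-1]
        return filename.rsplit('.', 1)[-1].lower() if '.' in filename else ''

    extension('https://example.com/files/Report.PDF?download=1')  # -> 'pdf'
    extension('https://example.com/blog/some-post/')              # -> ''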
### Checks & Tests
@@ -225,6 +245,7 @@ def save_remote_source(url, timeout=TIMEOUT):
def fetch_page_title(url, timeout=10, progress=SHOW_PROGRESS):
"""Attempt to guess a page's title by downloading the html"""
if not FETCH_TITLE:
return None
@@ -257,8 +278,8 @@ def wget_output_path(link):
urlencode = lambda s: quote(s, encoding='utf-8', errors='replace')
if link['type'] in ('PDF', 'image'):
return urlencode(base_url(link['url']))
if is_static_file(link['url']):
return urlencode(without_scheme(without_fragment(link['url'])))
# Since the wget algorithm for -E (appending .html) is incredibly complex
# instead of trying to emulate it here, we just look in the output folder
@@ -271,6 +292,18 @@ def wget_output_path(link):
full_path,
)
# Wget downloads can save in a number of different ways depending on the url
# https://example.com
# > output/archive/<timestamp>/example.com/index.html
# https://example.com/abc
# > output/archive/<timestamp>/example.com/abc.html
# https://example.com/abc/
# > output/archive/<timestamp>/example.com/abc/index.html
# https://example.com/abc/test.html
# > output/archive/<timestamp>/example.com/abc/test.html
# There's also lots of complexity around how the urlencoding and renaming
# is done for pages with query and hash fragments or extensions like shtml / htm
for _ in range(4):
if os.path.exists(search_dir):
if os.path.isdir(search_dir):
@@ -279,8 +312,8 @@ def wget_output_path(link):
if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", f, re.I | re.M)
]
if html_files:
relative_path = search_dir.split(link_dir)[-1].strip('/')
return urlencode(os.path.join(relative_path, html_files[0]))
path_from_link_dir = search_dir.split(link_dir)[-1].strip('/')
return urlencode(os.path.join(path_from_link_dir, html_files[0]))
# Move up one directory level
search_dir = search_dir.rsplit('/', 1)[0]
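For static files none of this searching happens: the early return near the top of wget_output_path hands back the URL itself, minus scheme and fragment. A small sketch of what that produces, with the two util helpers approximated here since their bodies aren't in the diff:

    def without_fragment(url):
        return url.split('#', 1)[0]        # approximation of the util helper

    def without_scheme(url):
        return url.split('://', 1)[-1]     # approximation of the util helper

    without_scheme(without_fragment('https://example.com/docs/paper.pdf#page=2'))
    # -> 'example.com/docs/paper.pdf', i.e. the path wget is expected to have saved the file under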
@@ -327,19 +360,32 @@ def pretty_path(path):
"""convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
return path.replace(REPO_DIR + '/', '')
def print_error_hints(cmd, pwd, err=None, hints=None, prefix=' '):
"""quote the argument with whitespace in a command so the user can
copy-paste the outputted string directly to run the cmd
"""
# Prettify CMD string and make it safe to copy-paste by quoting arguments
quoted_cmd = ' '.join(
'"{}"'.format(arg) if ' ' in arg else arg
for arg in cmd
)
# Prettify error output hints string and limit to five lines
hints = hints or getattr(err, 'hints', None)
if hints:
hints = hints if isinstance(hints, (list, tuple)) else hints.split('\n')
hints = (
' {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset'])
for line in hints[:5] if line.strip()
)
else:
hints = ()
output_lines = [
'{}Failed: {} {}{}'.format(ANSI['red'], err.__class__.__name__, err, ANSI['reset']),
' {}{}{}'.format(ANSI['lightyellow'], hints, ANSI['reset']) if hints else None,
*hints,
'Run to see full output:',
' cd {};'.format(pwd),
' {}'.format(quoted_cmd),
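A quick illustration of the quoting rule implemented above (command and filename invented):

    cmd = ['wget', '-O', 'Page Title.html', 'https://example.com/page']
    quoted_cmd = ' '.join('"{}"'.format(arg) if ' ' in arg else arg for arg in cmd)
    # -> wget -O "Page Title.html" https://example.com/page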
@@ -364,36 +410,21 @@ def merge_links(a, b):
url = longer('url')
longest_title = longer('title')
cleanest_title = a['title'] if '://' not in (a['title'] or '') else b['title']
link = {
'timestamp': earlier('timestamp'),
return {
'url': url,
'domain': domain(url),
'base_url': base_url(url),
'tags': longer('tags'),
'timestamp': earlier('timestamp'),
'title': longest_title if '://' not in (longest_title or '') else cleanest_title,
'tags': longer('tags'),
'sources': list(set(a.get('sources', []) + b.get('sources', []))),
}
link['type'] = get_link_type(link)
return link
def get_link_type(link):
"""Certain types of links need to be handled specially, this figures out when that's the case"""
def is_static_file(url):
"""Certain URLs just point to a single static file, and
don't need to be re-archived in many formats
"""
if extension(link['url']) == 'pdf':
return 'PDF'
elif extension(link['url']) in ('pdf', 'png', 'jpg', 'jpeg', 'svg', 'bmp', 'gif', 'tiff', 'webp'):
return 'image'
elif 'wikipedia.org' in domain(link['url']).lower():
return 'wiki'
elif 'youtube.com' in domain(link['url']).lower():
return 'youtube'
elif 'soundcloud.com' in domain(link['url']).lower():
return 'soundcloud'
elif 'youku.com' in domain(link['url']).lower():
return 'youku'
elif 'vimeo.com' in domain(link['url']).lower():
return 'vimeo'
return None
# TODO: the proper way is with MIME type detection, not using extension
return extension(url) in STATICFILE_EXTENSIONS
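Given the extension set above, the new predicate behaves roughly like this (URLs invented for illustration):

    from util import is_static_file

    is_static_file('https://example.com/files/report.pdf')      # True  -> downloaded once, as-is
    is_static_file('https://example.com/podcast/episode.mp3')   # True
    is_static_file('https://example.com/blog/some-post/')       # False -> gets the full wget/screenshot/pdf treatment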
def derived_link_info(link):
"""extend link info with the archive urls and other derived data"""
@@ -410,7 +441,9 @@ def derived_link_info(link):
'domain': domain(url),
'path': path(url),
'basename': basename(url),
'extension': extension(url),
'base_url': base_url(url),
'is_static': is_static_file(url),
'is_archived': os.path.exists(os.path.join(
ARCHIVE_DIR,
link['timestamp'],
@@ -420,8 +453,7 @@ def derived_link_info(link):
}
# Archive Method Output URLs
extended_info = {
**extended_info,
extended_info.update({
'index_url': 'index.html',
'favicon_url': 'favicon.ico',
'google_favicon_url': 'https://www.google.com/s2/favicons?domain={domain}'.format(**extended_info),
@@ -433,14 +465,13 @@ def derived_link_info(link):
'archive_org_url': 'https://web.archive.org/web/{base_url}'.format(**extended_info),
'git_url': 'git',
'media_url': 'media',
}
# PDF and images are handled slightly differently
# wget, screenshot, & pdf urls all point to the same file
if link['type'] in ('PDF', 'image'):
})
# static binary files like PDF and images are handled slightly differently.
# they're just downloaded once and aren't archived separately multiple times,
# so the wget, screenshot, & pdf urls should all point to the same file
if is_static_file(url):
extended_info.update({
'title': basename(link['url']),
'title': basename(url),
'archive_url': base_url(url),
'pdf_url': base_url(url),
'screenshot_url': base_url(url),
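The practical effect of that branch, sketched as a usage example (the link dict is minimal and invented; the field names match the extended_info keys above):

    from util import derived_link_info

    info = derived_link_info({
        'url': 'https://example.com/docs/paper.pdf',
        'timestamp': '1553130689',
        'title': None,
        'tags': '',
        'sources': [],
    })
    assert info['is_static']
    # for a static file the preview links all collapse onto the one downloaded file
    assert info['archive_url'] == info['pdf_url'] == info['screenshot_url']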