
remove flawed link_type concept in favor of simpler staticfile detection

commit 5ee1c39720 (parent c79e1df8b2)
Author: Nick Sweeting
Date:   2019-03-20 21:11:29 -04:00

4 changed files with 107 additions and 79 deletions
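In short: instead of labelling each link with a type like 'PDF', 'image', or 'youtube' via get_link_type(), the code now asks one question, does the URL end in a known static-file extension? A rough standalone sketch of that idea follows; the trimmed extension set and the extension() helper here are illustrative stand-ins, not the actual util.py implementations added below.

    # Rough sketch only: a stand-in extension() and a shortened extension set,
    # not the real helpers added to util.py in this commit.
    from urllib.parse import urlparse

    STATICFILE_EXTENSIONS = {'pdf', 'png', 'jpg', 'jpeg', 'gif', 'svg', 'mp3', 'mp4', 'zip'}

    def extension(url):
        """Lower-cased extension of the URL path, or '' if the last segment has none."""
        last_segment = urlparse(url).path.rsplit('/', 1)[-1]
        return last_segment.rsplit('.', 1)[-1].lower() if '.' in last_segment else ''

    def is_static_file(url):
        # mirrors the new check: extension membership instead of per-type branches
        return extension(url) in STATICFILE_EXTENSIONS

    assert is_static_file('https://example.com/files/report.pdf')
    assert not is_static_file('https://example.com/article?id=123')
    assert not is_static_file('https://en.wikipedia.org/wiki/Darwin')

The real helper introduced in util.py below defers to the module's own extension() function and the full STATICFILE_EXTENSIONS set.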

View file

@@ -224,6 +224,7 @@ def write_html_link_index(out_dir, link):
                 wget_output_path(link)
                 or (link['domain'] if link['is_archived'] else 'about:blank')
             ),
+            'extension': link['extension'] or 'HTML',
         }))
 
     chmod_file(path)

View file

@@ -10,8 +10,8 @@ Parsed link schema: {
     'url': 'https://example.com/example/?abc=123&xyc=345#lmnop',
     'timestamp': '15442123124234',
     'title': 'Example.com Page Title',
-    'tags': 'abc,def',
     'sources': ['ril_export.html', 'downloads/getpocket.com.txt'],
+    'tags': 'abc,def',
 }
 """
@@ -25,7 +25,6 @@ import xml.etree.ElementTree as etree
 from config import ANSI
 from util import (
     str_between,
-    get_link_type,
     URL_REGEX,
     check_url_parsing,
 )
@@ -69,17 +68,18 @@ def parse_pocket_html_export(html_file):
         # <li><a href="http://example.com/ time_added="1478739709" tags="tag1,tag2">example title</a></li>
         match = pattern.search(line)
         if match:
-            fixed_url = match.group(1).replace('http://www.readability.com/read?url=', '')  # remove old readability prefixes to get original url
+            url = match.group(1).replace('http://www.readability.com/read?url=', '')  # remove old readability prefixes to get original url
             time = datetime.fromtimestamp(float(match.group(2)))
-            info = {
-                'url': fixed_url,
+            tags = match.group(3)
+            title = match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '')
+
+            yield {
+                'url': url,
                 'timestamp': str(time.timestamp()),
-                'tags': match.group(3),
-                'title': match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '') or None,
+                'title': title or None,
+                'tags': tags or '',
                 'sources': [html_file.name],
             }
-            info['type'] = get_link_type(info)
-            yield info
 
 
 def parse_pinboard_json_export(json_file):
     """Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""
@@ -106,14 +106,14 @@ def parse_pinboard_json_export(json_file):
             title = (erg.get('description') or '').replace(' — Readability', '')
         else:
             title = erg['title'].strip()
 
         info = {
             'url': url,
             'timestamp': timestamp,
-            'tags': erg.get('tags') or '',
             'title': title or None,
+            'tags': erg.get('tags') or '',
             'sources': [json_file.name],
         }
-        info['type'] = get_link_type(info)
+
         yield info
@@ -144,16 +144,13 @@ def parse_rss_export(rss_file):
         ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
         time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")
 
-        info = {
+        yield {
             'url': url,
             'timestamp': str(time.timestamp()),
-            'tags': '',
             'title': title or None,
+            'tags': '',
             'sources': [rss_file.name],
         }
-        info['type'] = get_link_type(info)
-        yield info
 
 
 def parse_shaarli_rss_export(rss_file):
@@ -184,16 +181,14 @@ def parse_shaarli_rss_export(rss_file):
         ts_str = str_between(get_row('published'), '<published>', '</published>')
         time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
 
-        info = {
+        yield {
             'url': url,
             'timestamp': str(time.timestamp()),
-            'tags': '',
             'title': title or None,
+            'tags': '',
             'sources': [rss_file.name],
         }
-        info['type'] = get_link_type(info)
-        yield info
 
 
 def parse_netscape_html_export(html_file):
     """Parse netscape-format bookmarks export files (produced by all browsers)"""
@@ -209,16 +204,14 @@ def parse_netscape_html_export(html_file):
             url = match.group(1)
             time = datetime.fromtimestamp(float(match.group(2)))
 
-            info = {
+            yield {
                 'url': url,
                 'timestamp': str(time.timestamp()),
-                'tags': "",
                 'title': match.group(3).strip() or None,
+                'tags': '',
                 'sources': [html_file.name],
             }
-            info['type'] = get_link_type(info)
-            yield info
 
 
 def parse_pinboard_rss_export(rss_file):
     """Parse Pinboard RSS feed files into links"""
@@ -237,18 +230,22 @@ def parse_pinboard_rss_export(rss_file):
         # Pinboard includes a colon in its date stamp timezone offsets, which
         # Python can't parse. Remove it:
-        if ":" == ts_str[-3:-2]:
+        if ts_str and ts_str[-3:-2] == ":":
             ts_str = ts_str[:-3]+ts_str[-2:]
-        time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
-        info = {
+
+        if ts_str:
+            time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
+        else:
+            time = datetime.now()
+
+        yield {
             'url': url,
             'timestamp': str(time.timestamp()),
-            'tags': tags or '',
             'title': title or None,
+            'tags': tags or '',
             'sources': [rss_file.name],
         }
-        info['type'] = get_link_type(info)
-        yield info
 
 
 def parse_medium_rss_export(rss_file):
     """Parse Medium RSS feed files into links"""
@@ -263,15 +260,14 @@ def parse_medium_rss_export(rss_file):
         title = item.find("title").text.strip()
         ts_str = item.find("pubDate").text
         time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %Z")
 
-        info = {
+        yield {
             'url': url,
             'timestamp': str(time.timestamp()),
-            'tags': '',
             'title': title or None,
+            'tags': '',
             'sources': [rss_file.name],
         }
-        info['type'] = get_link_type(info)
-        yield info
 
 
 def parse_plain_text_export(text_file):
@@ -285,15 +281,15 @@ def parse_plain_text_export(text_file):
         for url in urls:
             url = url.strip()
 
-            info = {
+            time = datetime.now()
+
+            yield {
                 'url': url,
-                'timestamp': str(datetime.now().timestamp()),
-                'tags': '',
+                'timestamp': str(time.timestamp()),
                 'title': None,
+                'tags': '',
                 'sources': [text_file.name],
            }
-            info['type'] = get_link_type(info)
-            yield info
 
 
 PARSERS = OrderedDict([

View file

@@ -194,8 +194,8 @@
                 Last updated: <small title="Timestamp: $updated">$updated_date</small>
             </div>
             <div class="col-lg-4 alert well">
-                Metadata:
-                <span class="badge badge-default">$type</span>
+                Type:
+                <span class="badge badge-default">$extension</span>
                 &nbsp; | &nbsp;
                 Tags:
                 <span class="badge badge-success">$tags</span>
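The template change pairs with the new 'extension' key written by the link index writer above: the badge that used to show $type now shows the file extension, falling back to 'HTML'. A minimal illustration using string.Template with a made-up link dict, not the actual template rendering code:

    # Illustration only: how the renamed placeholder gets filled, mirroring the
    # 'extension': link['extension'] or 'HTML' line added to the index writer.
    from string import Template

    badge = Template('Type: <span class="badge badge-default">$extension</span>')

    link = {'extension': ''}          # hypothetical link dict for a plain web page
    print(badge.substitute(extension=link['extension'] or 'HTML'))
    # Type: <span class="badge badge-default">HTML</span>

    link = {'extension': 'pdf'}       # and for a static file
    print(badge.substitute(extension=link['extension'] or 'HTML'))
    # Type: <span class="badge badge-default">pdf</span>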

View file

@@ -70,6 +70,26 @@ HTML_TITLE_REGEX = re.compile(
     r'(.[^<>]+)', # get everything up to these symbols
     re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE,
 )
+STATICFILE_EXTENSIONS = {
+    # 99.999% of the time, URLs ending in these extentions are static files
+    # that can be downloaded as-is, not html pages that need to be rendered
+    'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp',
+    'svg', 'svgz', 'webp', 'ps', 'eps', 'ai',
+    'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v', 'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8'
+    'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx',
+    'atom', 'rss', 'css', 'js', 'json',
+    'dmg', 'iso', 'img',
+    'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z',
+
+    # Less common extensions to consider adding later
+    # jar, swf, bin, com, exe, dll, deb
+    # ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm,
+    # pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
+    # ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml
+
+    # Thse are always treated as pages, not as static files, never add them:
+    # html, htm, shtml, xhtml, xml, aspx, php, cgi
+}
 
 
 ### Checks & Tests
@@ -225,6 +245,7 @@ def save_remote_source(url, timeout=TIMEOUT):
 
 def fetch_page_title(url, timeout=10, progress=SHOW_PROGRESS):
     """Attempt to guess a page's title by downloading the html"""
     if not FETCH_TITLE:
         return None
@@ -257,8 +278,8 @@ def wget_output_path(link):
     urlencode = lambda s: quote(s, encoding='utf-8', errors='replace')
 
-    if link['type'] in ('PDF', 'image'):
-        return urlencode(base_url(link['url']))
+    if is_static_file(link['url']):
+        return urlencode(without_scheme(without_fragment(link['url'])))
 
     # Since the wget algorithm to for -E (appending .html) is incredibly complex
     # instead of trying to emulate it here, we just look in the output folder
@@ -271,6 +292,18 @@ def wget_output_path(link):
             full_path,
         )
 
+    # Wget downloads can save in a number of different ways depending on the url
+    #    https://example.com
+    #       > output/archive/<timestamp>/example.com/index.html
+    #    https://example.com/abc
+    #       > output/archive/<timestamp>/example.com/abc.html
+    #    https://example.com/abc/
+    #       > output/archive/<timestamp>/example.com/abc/index.html
+    #    https://example.com/abc/test.html
+    #       > output/archive/<timestamp>/example.com/abc/test.html
+    # There's also lots of complexity around how the urlencoding and renaming
+    # is done for pages with query and hash fragments or extensions like shtml / htm
+
     for _ in range(4):
         if os.path.exists(search_dir):
             if os.path.isdir(search_dir):
@@ -279,8 +312,8 @@ def wget_output_path(link):
                     if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", f, re.I | re.M)
                 ]
                 if html_files:
-                    relative_path = search_dir.split(link_dir)[-1].strip('/')
-                    return urlencode(os.path.join(relative_path, html_files[0]))
+                    path_from_link_dir = search_dir.split(link_dir)[-1].strip('/')
+                    return urlencode(os.path.join(path_from_link_dir, html_files[0]))
 
         # Move up one directory level
         search_dir = search_dir.rsplit('/', 1)[0]
@@ -327,19 +360,32 @@ def pretty_path(path):
     """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
     return path.replace(REPO_DIR + '/', '')
 
 
 def print_error_hints(cmd, pwd, err=None, hints=None, prefix=' '):
     """quote the argument with whitespace in a command so the user can
        copy-paste the outputted string directly to run the cmd
     """
+    # Prettify CMD string and make it save to copy-paste by quoting arguments
     quoted_cmd = ' '.join(
         '"{}"'.format(arg) if ' ' in arg else arg
         for arg in cmd
     )
+
+    # Prettify error output hints string and limit to five lines
+    hints = hints or getattr(err, 'hints', None)
+    if hints:
+        hints = hints if isinstance(hints, (list, tuple)) else hints.split('\n')
+        hints = (
+            ' {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset'])
+            for line in hints[:5] if line.strip()
+        )
+    else:
+        hints = ()
+
     output_lines = [
         '{}Failed: {} {}{}'.format(ANSI['red'], err.__class__.__name__, err, ANSI['reset']),
-        ' {}{}{}'.format(ANSI['lightyellow'], hints, ANSI['reset']) if hints else None,
+        *hints,
         'Run to see full output:'
         ' cd {};'.format(pwd),
         ' {}'.format(quoted_cmd),
@@ -364,36 +410,21 @@ def merge_links(a, b):
     url = longer('url')
     longest_title = longer('title')
     cleanest_title = a['title'] if '://' not in (a['title'] or '') else b['title']
 
-    link = {
-        'timestamp': earlier('timestamp'),
+    return {
         'url': url,
-        'domain': domain(url),
-        'base_url': base_url(url),
-        'tags': longer('tags'),
+        'timestamp': earlier('timestamp'),
         'title': longest_title if '://' not in (longest_title or '') else cleanest_title,
+        'tags': longer('tags'),
         'sources': list(set(a.get('sources', []) + b.get('sources', []))),
     }
-    link['type'] = get_link_type(link)
-    return link
 
-def get_link_type(link):
-    """Certain types of links need to be handled specially, this figures out when that's the case"""
 
-    if extension(link['url']) == 'pdf':
-        return 'PDF'
-    elif extension(link['url']) in ('pdf', 'png', 'jpg', 'jpeg', 'svg', 'bmp', 'gif', 'tiff', 'webp'):
-        return 'image'
-    elif 'wikipedia.org' in domain(link['url']).lower():
-        return 'wiki'
-    elif 'youtube.com' in domain(link['url']).lower():
-        return 'youtube'
-    elif 'soundcloud.com' in domain(link['url']).lower():
-        return 'soundcloud'
-    elif 'youku.com' in domain(link['url']).lower():
-        return 'youku'
-    elif 'vimeo.com' in domain(link['url']).lower():
-        return 'vimeo'
-    return None
+def is_static_file(url):
+    """Certain URLs just point to a single static file, and
+       don't need to be re-archived in many formats
+    """
+    # TODO: the proper way is with MIME type detection, not using extension
+    return extension(url) in STATICFILE_EXTENSIONS
 
 
 def derived_link_info(link):
     """extend link info with the archive urls and other derived data"""
@@ -410,7 +441,9 @@ def derived_link_info(link):
         'domain': domain(url),
         'path': path(url),
         'basename': basename(url),
+        'extension': extension(url),
         'base_url': base_url(url),
+        'is_static': is_static_file(url),
         'is_archived': os.path.exists(os.path.join(
             ARCHIVE_DIR,
             link['timestamp'],
@@ -420,8 +453,7 @@ def derived_link_info(link):
     }
 
     # Archive Method Output URLs
-    extended_info = {
-        **extended_info,
+    extended_info.update({
         'index_url': 'index.html',
         'favicon_url': 'favicon.ico',
         'google_favicon_url': 'https://www.google.com/s2/favicons?domain={domain}'.format(**extended_info),
@@ -433,14 +465,13 @@ def derived_link_info(link):
         'archive_org_url': 'https://web.archive.org/web/{base_url}'.format(**extended_info),
         'git_url': 'git',
         'media_url': 'media',
-    }
+    })
 
-    # PDF and images are handled slightly differently
-    # wget, screenshot, & pdf urls all point to the same file
-    if link['type'] in ('PDF', 'image'):
+    # static binary files like PDF and images are handled slightly differently.
+    # they're just downloaded once and aren't archived separately multiple times,
+    # so the wget, screenshot, & pdf urls should all point to the same file
+    if is_static_file(url):
         extended_info.update({
-            'title': basename(link['url']),
+            'title': basename(url),
             'archive_url': base_url(url),
             'pdf_url': base_url(url),
             'screenshot_url': base_url(url),
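For static files, derived_link_info() now short-circuits all of the per-method output URLs to the single downloaded file. A loose sketch of that branch with hypothetical stand-in helpers, not the real base_url/basename from util.py:

    # Loose sketch with stand-in helpers showing how every output URL collapses
    # to the one downloaded copy when is_static_file(url) is true.
    from urllib.parse import urlparse

    def base_url(url):
        parts = urlparse(url)
        return parts.netloc + parts.path      # scheme-less path used as the output location

    def basename(url):
        return urlparse(url).path.rsplit('/', 1)[-1]

    def static_file_overrides(url):
        return {
            'title': basename(url),
            'archive_url': base_url(url),
            'pdf_url': base_url(url),
            'screenshot_url': base_url(url),
        }

    print(static_file_overrides('https://example.com/docs/report.pdf'))
    # {'title': 'report.pdf', 'archive_url': 'example.com/docs/report.pdf', ...}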