From 5ee1c3972095af9c2eed38087387df082923202b Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Wed, 20 Mar 2019 21:11:29 -0400 Subject: [PATCH] remove flawed link_type concept in favor of simpler staticfile detection --- archivebox/index.py | 1 + archivebox/parse.py | 74 +++++++++--------- archivebox/templates/link_index.html | 4 +- archivebox/util.py | 107 +++++++++++++++++---------- 4 files changed, 107 insertions(+), 79 deletions(-) diff --git a/archivebox/index.py b/archivebox/index.py index 6351c31a..694ea1dc 100644 --- a/archivebox/index.py +++ b/archivebox/index.py @@ -224,6 +224,7 @@ def write_html_link_index(out_dir, link): wget_output_path(link) or (link['domain'] if link['is_archived'] else 'about:blank') ), + 'extension': link['extension'] or 'HTML', })) chmod_file(path) diff --git a/archivebox/parse.py b/archivebox/parse.py index 69a37014..5549bea1 100644 --- a/archivebox/parse.py +++ b/archivebox/parse.py @@ -10,8 +10,8 @@ Parsed link schema: { 'url': 'https://example.com/example/?abc=123&xyc=345#lmnop', 'timestamp': '15442123124234', 'title': 'Example.com Page Title', - 'tags': 'abc,def', 'sources': ['ril_export.html', 'downloads/getpocket.com.txt'], + 'tags': 'abc,def', } """ @@ -25,7 +25,6 @@ import xml.etree.ElementTree as etree from config import ANSI from util import ( str_between, - get_link_type, URL_REGEX, check_url_parsing, ) @@ -69,17 +68,18 @@ def parse_pocket_html_export(html_file): #
         # <li><a href="..." time_added="..." tags="...">example title</a></li>
  • match = pattern.search(line) if match: - fixed_url = match.group(1).replace('http://www.readability.com/read?url=', '') # remove old readability prefixes to get original url + url = match.group(1).replace('http://www.readability.com/read?url=', '') # remove old readability prefixes to get original url time = datetime.fromtimestamp(float(match.group(2))) - info = { - 'url': fixed_url, + tags = match.group(3) + title = match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '') + + yield { + 'url': url, 'timestamp': str(time.timestamp()), - 'tags': match.group(3), - 'title': match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '') or None, + 'title': title or None, + 'tags': tags or '', 'sources': [html_file.name], } - info['type'] = get_link_type(info) - yield info def parse_pinboard_json_export(json_file): """Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)""" @@ -106,14 +106,14 @@ def parse_pinboard_json_export(json_file): title = (erg.get('description') or '').replace(' — Readability', '') else: title = erg['title'].strip() + info = { 'url': url, 'timestamp': timestamp, - 'tags': erg.get('tags') or '', 'title': title or None, + 'tags': erg.get('tags') or '', 'sources': [json_file.name], } - info['type'] = get_link_type(info) yield info @@ -144,16 +144,13 @@ def parse_rss_export(rss_file): ts_str = str_between(get_row('pubDate'), '', '') time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z") - info = { + yield { 'url': url, 'timestamp': str(time.timestamp()), - 'tags': '', 'title': title or None, + 'tags': '', 'sources': [rss_file.name], } - info['type'] = get_link_type(info) - - yield info def parse_shaarli_rss_export(rss_file): @@ -184,16 +181,14 @@ def parse_shaarli_rss_export(rss_file): ts_str = str_between(get_row('published'), '', '') time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z") - info = { + yield { 'url': url, 'timestamp': str(time.timestamp()), - 'tags': '', 'title': title or None, + 'tags': '', 'sources': [rss_file.name], } - info['type'] = get_link_type(info) - yield info def parse_netscape_html_export(html_file): """Parse netscape-format bookmarks export files (produced by all browsers)""" @@ -209,16 +204,14 @@ def parse_netscape_html_export(html_file): url = match.group(1) time = datetime.fromtimestamp(float(match.group(2))) - info = { + yield { 'url': url, 'timestamp': str(time.timestamp()), - 'tags': "", 'title': match.group(3).strip() or None, + 'tags': '', 'sources': [html_file.name], } - info['type'] = get_link_type(info) - yield info def parse_pinboard_rss_export(rss_file): """Parse Pinboard RSS feed files into links""" @@ -237,18 +230,22 @@ def parse_pinboard_rss_export(rss_file): # Pinboard includes a colon in its date stamp timezone offsets, which # Python can't parse. 
Remove it: - if ":" == ts_str[-3:-2]: + if ts_str and ts_str[-3:-2] == ":": ts_str = ts_str[:-3]+ts_str[-2:] - time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z") - info = { + + if ts_str: + time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z") + else: + time = datetime.now() + + yield { 'url': url, 'timestamp': str(time.timestamp()), - 'tags': tags or '', 'title': title or None, + 'tags': tags or '', 'sources': [rss_file.name], } - info['type'] = get_link_type(info) - yield info + def parse_medium_rss_export(rss_file): """Parse Medium RSS feed files into links""" @@ -263,15 +260,14 @@ def parse_medium_rss_export(rss_file): title = item.find("title").text.strip() ts_str = item.find("pubDate").text time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %Z") - info = { + + yield { 'url': url, 'timestamp': str(time.timestamp()), - 'tags': '', 'title': title or None, + 'tags': '', 'sources': [rss_file.name], } - info['type'] = get_link_type(info) - yield info def parse_plain_text_export(text_file): @@ -285,15 +281,15 @@ def parse_plain_text_export(text_file): for url in urls: url = url.strip() - info = { + time = datetime.now() + + yield { 'url': url, - 'timestamp': str(datetime.now().timestamp()), - 'tags': '', + 'timestamp': str(time.timestamp()), 'title': None, + 'tags': '', 'sources': [text_file.name], } - info['type'] = get_link_type(info) - yield info PARSERS = OrderedDict([ diff --git a/archivebox/templates/link_index.html b/archivebox/templates/link_index.html index 29927ea3..9d286ae0 100644 --- a/archivebox/templates/link_index.html +++ b/archivebox/templates/link_index.html @@ -194,8 +194,8 @@ Last updated: $updated_date
-                    Metadata:
-                    $type
+                    Type:
+                    $extension
                     &nbsp; | &nbsp;
                     Tags:
                     $tags
diff --git a/archivebox/util.py b/archivebox/util.py
index 85648ee2..189de476 100644
--- a/archivebox/util.py
+++ b/archivebox/util.py
@@ -70,6 +70,26 @@ HTML_TITLE_REGEX = re.compile(
     r'(.[^<>]+)', # get everything up to these symbols
     re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE,
 )
+STATICFILE_EXTENSIONS = {
+    # 99.999% of the time, URLs ending in these extensions are static files
+    # that can be downloaded as-is, not html pages that need to be rendered
+    'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp',
+    'svg', 'svgz', 'webp', 'ps', 'eps', 'ai',
+    'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v', 'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8',
+    'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx',
+    'atom', 'rss', 'css', 'js', 'json',
+    'dmg', 'iso', 'img',
+    'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z',
+
+    # Less common extensions to consider adding later
+    # jar, swf, bin, com, exe, dll, deb
+    # ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm,
+    # pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
+    # ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml
+
+    # These are always treated as pages, not as static files, never add them:
+    # html, htm, shtml, xhtml, xml, aspx, php, cgi
+}
 
 ### Checks & Tests
 
@@ -225,6 +245,7 @@ def save_remote_source(url, timeout=TIMEOUT):
 
 def fetch_page_title(url, timeout=10, progress=SHOW_PROGRESS):
     """Attempt to guess a page's title by downloading the html"""
+
     if not FETCH_TITLE:
         return None
 
@@ -257,8 +278,8 @@ def wget_output_path(link):
 
     urlencode = lambda s: quote(s, encoding='utf-8', errors='replace')
 
-    if link['type'] in ('PDF', 'image'):
-        return urlencode(base_url(link['url']))
+    if is_static_file(link['url']):
+        return urlencode(without_scheme(without_fragment(link['url'])))
 
     # Since the wget algorithm for -E (appending .html) is incredibly complex
     # instead of trying to emulate it here, we just look in the output folder
@@ -271,6 +292,18 @@ def wget_output_path(link):
         full_path,
     )
 
+    # Wget downloads can save in a number of different ways depending on the url
+    # https://example.com
+    #    > output/archive/<timestamp>/example.com/index.html
+    # https://example.com/abc
+    #    > output/archive/<timestamp>/example.com/abc.html
+    # https://example.com/abc/
+    #    > output/archive/<timestamp>/example.com/abc/index.html
+    # https://example.com/abc/test.html
+    #    > output/archive/<timestamp>/example.com/abc/test.html
+
+    # There's also lots of complexity around how the urlencoding and renaming
+    # is done for pages with query and hash fragments or extensions like shtml / htm
     for _ in range(4):
         if os.path.exists(search_dir):
             if os.path.isdir(search_dir):
                 html_files = [
                     f for f in os.listdir(search_dir)
                     if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", f, re.I | re.M)
                 ]
                 if html_files:
-                    relative_path = search_dir.split(link_dir)[-1].strip('/')
-                    return urlencode(os.path.join(relative_path, html_files[0]))
+                    path_from_link_dir = search_dir.split(link_dir)[-1].strip('/')
+                    return urlencode(os.path.join(path_from_link_dir, html_files[0]))
 
         # Move up one directory level
         search_dir = search_dir.rsplit('/', 1)[0]
@@ -327,19 +360,32 @@ def pretty_path(path):
     """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
     return path.replace(REPO_DIR + '/', '')
 
+
 def print_error_hints(cmd, pwd, err=None, hints=None, prefix='    '):
     """quote the argument with whitespace in a command so the user can copy-paste the outputted string directly to
run the cmd """ + # Prettify CMD string and make it save to copy-paste by quoting arguments quoted_cmd = ' '.join( '"{}"'.format(arg) if ' ' in arg else arg for arg in cmd ) + # Prettify error output hints string and limit to five lines + hints = hints or getattr(err, 'hints', None) + if hints: + hints = hints if isinstance(hints, (list, tuple)) else hints.split('\n') + hints = ( + ' {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset']) + for line in hints[:5] if line.strip() + ) + else: + hints = () + output_lines = [ '{}Failed: {} {}{}'.format(ANSI['red'], err.__class__.__name__, err, ANSI['reset']), - ' {}{}{}'.format(ANSI['lightyellow'], hints, ANSI['reset']) if hints else None, + *hints, 'Run to see full output:' ' cd {};'.format(pwd), ' {}'.format(quoted_cmd), @@ -364,36 +410,21 @@ def merge_links(a, b): url = longer('url') longest_title = longer('title') cleanest_title = a['title'] if '://' not in (a['title'] or '') else b['title'] - link = { - 'timestamp': earlier('timestamp'), + return { 'url': url, - 'domain': domain(url), - 'base_url': base_url(url), - 'tags': longer('tags'), + 'timestamp': earlier('timestamp'), 'title': longest_title if '://' not in (longest_title or '') else cleanest_title, + 'tags': longer('tags'), 'sources': list(set(a.get('sources', []) + b.get('sources', []))), } - link['type'] = get_link_type(link) - return link -def get_link_type(link): - """Certain types of links need to be handled specially, this figures out when that's the case""" +def is_static_file(url): + """Certain URLs just point to a single static file, and + don't need to be re-archived in many formats + """ - if extension(link['url']) == 'pdf': - return 'PDF' - elif extension(link['url']) in ('pdf', 'png', 'jpg', 'jpeg', 'svg', 'bmp', 'gif', 'tiff', 'webp'): - return 'image' - elif 'wikipedia.org' in domain(link['url']).lower(): - return 'wiki' - elif 'youtube.com' in domain(link['url']).lower(): - return 'youtube' - elif 'soundcloud.com' in domain(link['url']).lower(): - return 'soundcloud' - elif 'youku.com' in domain(link['url']).lower(): - return 'youku' - elif 'vimeo.com' in domain(link['url']).lower(): - return 'vimeo' - return None + # TODO: the proper way is with MIME type detection, not using extension + return extension(url) in STATICFILE_EXTENSIONS def derived_link_info(link): """extend link info with the archive urls and other derived data""" @@ -410,7 +441,9 @@ def derived_link_info(link): 'domain': domain(url), 'path': path(url), 'basename': basename(url), + 'extension': extension(url), 'base_url': base_url(url), + 'is_static': is_static_file(url), 'is_archived': os.path.exists(os.path.join( ARCHIVE_DIR, link['timestamp'], @@ -420,8 +453,7 @@ def derived_link_info(link): } # Archive Method Output URLs - extended_info = { - **extended_info, + extended_info.update({ 'index_url': 'index.html', 'favicon_url': 'favicon.ico', 'google_favicon_url': 'https://www.google.com/s2/favicons?domain={domain}'.format(**extended_info), @@ -433,14 +465,13 @@ def derived_link_info(link): 'archive_org_url': 'https://web.archive.org/web/{base_url}'.format(**extended_info), 'git_url': 'git', 'media_url': 'media', - - } - - # PDF and images are handled slightly differently - # wget, screenshot, & pdf urls all point to the same file - if link['type'] in ('PDF', 'image'): + }) + # static binary files like PDF and images are handled slightly differently. 
+ # they're just downloaded once and aren't archived separately multiple times, + # so the wget, screenshot, & pdf urls should all point to the same file + if is_static_file(url): extended_info.update({ - 'title': basename(link['url']), + 'title': basename(url), 'archive_url': base_url(url), 'pdf_url': base_url(url), 'screenshot_url': base_url(url),
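
Note on the approach above: the whole change hangs on treating "has a known static-file extension" as "can be downloaded as-is". A minimal, self-contained sketch of that idea follows; the extension() and is_static_file() helpers here are simplified stand-ins for the ones in archivebox/util.py (assumed behavior: last dot-suffix of the URL path, lowercased), and the extension set is deliberately trimmed, so treat it as an illustration rather than the exact implementation.

from urllib.parse import urlparse

# trimmed-down extension set, for illustration only (the full set lives in util.py)
STATICFILE_EXTENSIONS = {
    'gif', 'jpg', 'jpeg', 'png', 'svg', 'webp',
    'mp3', 'mp4', 'webm', 'mov', 'avi',
    'pdf', 'txt', 'css', 'js', 'json',
    'zip', 'gz', 'bz2', '7z', 'dmg', 'iso',
}

def extension(url):
    """simplified stand-in: lowercased extension of the URL path, '' if it has none"""
    filename = urlparse(url).path.rsplit('/', 1)[-1]
    return filename.rsplit('.', 1)[-1].lower() if '.' in filename else ''

def is_static_file(url):
    """True if the URL points at a single downloadable file rather than a page to render"""
    # as the TODO above notes, MIME-type detection would be the proper way to do this
    return extension(url) in STATICFILE_EXTENSIONS

assert is_static_file('https://example.com/report.pdf')
assert is_static_file('https://example.com/pic.JPG?size=large')
assert not is_static_file('https://example.com/article.html')      # html is never a static file
assert not is_static_file('https://example.com/download?id=123')   # no extension -> normal page

Extension sniffing stays a heuristic either way: a URL like the last one can still serve a PDF, and it simply falls through to the normal page-archiving path.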
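
On the wget_output_path() comments: the four documented cases can be summed up in a tiny function. expected_wget_path() below is hypothetical (nothing by that name exists in ArchiveBox); it only encodes the naming rules spelled out in the comment, relative to the link's archive folder, and ignores the query/fragment and shtml/htm complications the comment warns about.

from urllib.parse import urlparse

def expected_wget_path(url):
    """hypothetical helper: wget's output name for the simple documented cases only"""
    parsed = urlparse(url)
    path = parsed.path or '/'
    if path.endswith('/'):
        return parsed.netloc + path + 'index.html'   # directory-style URLs get index.html
    if '.' in path.rsplit('/', 1)[-1]:
        return parsed.netloc + path                  # already has an extension, kept as-is
    return parsed.netloc + path + '.html'            # bare paths get .html appended by -E

assert expected_wget_path('https://example.com') == 'example.com/index.html'
assert expected_wget_path('https://example.com/abc') == 'example.com/abc.html'
assert expected_wget_path('https://example.com/abc/') == 'example.com/abc/index.html'
assert expected_wget_path('https://example.com/abc/test.html') == 'example.com/abc/test.html'

The real wget_output_path() deliberately avoids emulating these rules and instead searches the output folder for *.html files, which is why its fallback loop walks up the directory tree one level at a time.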