From 5ee1c3972095af9c2eed38087387df082923202b Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Wed, 20 Mar 2019 21:11:29 -0400 Subject: [PATCH] remove flawed link_type concept in favor of simpler staticfile detection --- archivebox/index.py | 1 + archivebox/parse.py | 74 +++++++++--------- archivebox/templates/link_index.html | 4 +- archivebox/util.py | 107 +++++++++++++++++---------- 4 files changed, 107 insertions(+), 79 deletions(-) diff --git a/archivebox/index.py b/archivebox/index.py index 6351c31a..694ea1dc 100644 --- a/archivebox/index.py +++ b/archivebox/index.py @@ -224,6 +224,7 @@ def write_html_link_index(out_dir, link): wget_output_path(link) or (link['domain'] if link['is_archived'] else 'about:blank') ), + 'extension': link['extension'] or 'HTML', })) chmod_file(path) diff --git a/archivebox/parse.py b/archivebox/parse.py index 69a37014..5549bea1 100644 --- a/archivebox/parse.py +++ b/archivebox/parse.py @@ -10,8 +10,8 @@ Parsed link schema: { 'url': 'https://example.com/example/?abc=123&xyc=345#lmnop', 'timestamp': '15442123124234', 'title': 'Example.com Page Title', - 'tags': 'abc,def', 'sources': ['ril_export.html', 'downloads/getpocket.com.txt'], + 'tags': 'abc,def', } """ @@ -25,7 +25,6 @@ import xml.etree.ElementTree as etree from config import ANSI from util import ( str_between, - get_link_type, URL_REGEX, check_url_parsing, ) @@ -69,17 +68,18 @@ def parse_pocket_html_export(html_file): #
         # <li><a href="..." time_added="..." tags="...">example title</a></li>
  • match = pattern.search(line) if match: - fixed_url = match.group(1).replace('http://www.readability.com/read?url=', '') # remove old readability prefixes to get original url + url = match.group(1).replace('http://www.readability.com/read?url=', '') # remove old readability prefixes to get original url time = datetime.fromtimestamp(float(match.group(2))) - info = { - 'url': fixed_url, + tags = match.group(3) + title = match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '') + + yield { + 'url': url, 'timestamp': str(time.timestamp()), - 'tags': match.group(3), - 'title': match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '') or None, + 'title': title or None, + 'tags': tags or '', 'sources': [html_file.name], } - info['type'] = get_link_type(info) - yield info def parse_pinboard_json_export(json_file): """Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)""" @@ -106,14 +106,14 @@ def parse_pinboard_json_export(json_file): title = (erg.get('description') or '').replace(' — Readability', '') else: title = erg['title'].strip() + info = { 'url': url, 'timestamp': timestamp, - 'tags': erg.get('tags') or '', 'title': title or None, + 'tags': erg.get('tags') or '', 'sources': [json_file.name], } - info['type'] = get_link_type(info) yield info @@ -144,16 +144,13 @@ def parse_rss_export(rss_file): ts_str = str_between(get_row('pubDate'), '', '') time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z") - info = { + yield { 'url': url, 'timestamp': str(time.timestamp()), - 'tags': '', 'title': title or None, + 'tags': '', 'sources': [rss_file.name], } - info['type'] = get_link_type(info) - - yield info def parse_shaarli_rss_export(rss_file): @@ -184,16 +181,14 @@ def parse_shaarli_rss_export(rss_file): ts_str = str_between(get_row('published'), '', '') time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z") - info = { + yield { 'url': url, 'timestamp': str(time.timestamp()), - 'tags': '', 'title': title or None, + 'tags': '', 'sources': [rss_file.name], } - info['type'] = get_link_type(info) - yield info def parse_netscape_html_export(html_file): """Parse netscape-format bookmarks export files (produced by all browsers)""" @@ -209,16 +204,14 @@ def parse_netscape_html_export(html_file): url = match.group(1) time = datetime.fromtimestamp(float(match.group(2))) - info = { + yield { 'url': url, 'timestamp': str(time.timestamp()), - 'tags': "", 'title': match.group(3).strip() or None, + 'tags': '', 'sources': [html_file.name], } - info['type'] = get_link_type(info) - yield info def parse_pinboard_rss_export(rss_file): """Parse Pinboard RSS feed files into links""" @@ -237,18 +230,22 @@ def parse_pinboard_rss_export(rss_file): # Pinboard includes a colon in its date stamp timezone offsets, which # Python can't parse. 
Remove it: - if ":" == ts_str[-3:-2]: + if ts_str and ts_str[-3:-2] == ":": ts_str = ts_str[:-3]+ts_str[-2:] - time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z") - info = { + + if ts_str: + time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z") + else: + time = datetime.now() + + yield { 'url': url, 'timestamp': str(time.timestamp()), - 'tags': tags or '', 'title': title or None, + 'tags': tags or '', 'sources': [rss_file.name], } - info['type'] = get_link_type(info) - yield info + def parse_medium_rss_export(rss_file): """Parse Medium RSS feed files into links""" @@ -263,15 +260,14 @@ def parse_medium_rss_export(rss_file): title = item.find("title").text.strip() ts_str = item.find("pubDate").text time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %Z") - info = { + + yield { 'url': url, 'timestamp': str(time.timestamp()), - 'tags': '', 'title': title or None, + 'tags': '', 'sources': [rss_file.name], } - info['type'] = get_link_type(info) - yield info def parse_plain_text_export(text_file): @@ -285,15 +281,15 @@ def parse_plain_text_export(text_file): for url in urls: url = url.strip() - info = { + time = datetime.now() + + yield { 'url': url, - 'timestamp': str(datetime.now().timestamp()), - 'tags': '', + 'timestamp': str(time.timestamp()), 'title': None, + 'tags': '', 'sources': [text_file.name], } - info['type'] = get_link_type(info) - yield info PARSERS = OrderedDict([ diff --git a/archivebox/templates/link_index.html b/archivebox/templates/link_index.html index 29927ea3..9d286ae0 100644 --- a/archivebox/templates/link_index.html +++ b/archivebox/templates/link_index.html @@ -194,8 +194,8 @@ Last updated: $updated_date
-                    Metadata:
-                    $type
+                    Type:
+                    $extension
                     &nbsp; | &nbsp;
                     Tags:
                     $tags
diff --git a/archivebox/util.py b/archivebox/util.py
index 85648ee2..189de476 100644
--- a/archivebox/util.py
+++ b/archivebox/util.py
@@ -70,6 +70,26 @@ HTML_TITLE_REGEX = re.compile(
     r'(.[^<>]+)', # get everything up to these symbols
     re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE,
 )
+STATICFILE_EXTENSIONS = {
+    # 99.999% of the time, URLs ending in these extensions are static files
+    # that can be downloaded as-is, not html pages that need to be rendered
+    'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp',
+    'svg', 'svgz', 'webp', 'ps', 'eps', 'ai',
+    'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v', 'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8',
+    'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx',
+    'atom', 'rss', 'css', 'js', 'json',
+    'dmg', 'iso', 'img',
+    'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z',
+
+    # Less common extensions to consider adding later
+    # jar, swf, bin, com, exe, dll, deb
+    # ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm,
+    # pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
+    # ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml
+
+    # These are always treated as pages, not as static files, never add them:
+    # html, htm, shtml, xhtml, xml, aspx, php, cgi
+}
 
 ### Checks & Tests
 
@@ -225,6 +245,7 @@ def save_remote_source(url, timeout=TIMEOUT):
 
 def fetch_page_title(url, timeout=10, progress=SHOW_PROGRESS):
     """Attempt to guess a page's title by downloading the html"""
+
     if not FETCH_TITLE:
         return None
 
@@ -257,8 +278,8 @@ def wget_output_path(link):
 
     urlencode = lambda s: quote(s, encoding='utf-8', errors='replace')
 
-    if link['type'] in ('PDF', 'image'):
-        return urlencode(base_url(link['url']))
+    if is_static_file(link['url']):
+        return urlencode(without_scheme(without_fragment(link['url'])))
 
     # Since the wget algorithm for -E (appending .html) is incredibly complex
     # instead of trying to emulate it here, we just look in the output folder
@@ -271,6 +292,18 @@ def wget_output_path(link):
         full_path,
     )
 
+    # Wget downloads can save in a number of different ways depending on the url
+    # https://example.com
+    #    > output/archive/<timestamp>/example.com/index.html
+    # https://example.com/abc
+    #    > output/archive/<timestamp>/example.com/abc.html
+    # https://example.com/abc/
+    #    > output/archive/<timestamp>/example.com/abc/index.html
+    # https://example.com/abc/test.html
+    #    > output/archive/<timestamp>/example.com/abc/test.html
+
+    # There's also lots of complexity around how the urlencoding and renaming
+    # is done for pages with query and hash fragments or extensions like shtml / htm
     for _ in range(4):
         if os.path.exists(search_dir):
             if os.path.isdir(search_dir):
                 html_files = [
                     f for f in os.listdir(search_dir)
                     if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", f, re.I | re.M)
                 ]
                 if html_files:
-                    relative_path = search_dir.split(link_dir)[-1].strip('/')
-                    return urlencode(os.path.join(relative_path, html_files[0]))
+                    path_from_link_dir = search_dir.split(link_dir)[-1].strip('/')
+                    return urlencode(os.path.join(path_from_link_dir, html_files[0]))
 
         # Move up one directory level
         search_dir = search_dir.rsplit('/', 1)[0]
@@ -327,19 +360,32 @@ def pretty_path(path):
     """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
     return path.replace(REPO_DIR + '/', '')
 
+
 def print_error_hints(cmd, pwd, err=None, hints=None, prefix='    '):
     """quote the argument with whitespace in a command so the user can copy-paste the outputted string directly to
run the cmd """ + # Prettify CMD string and make it save to copy-paste by quoting arguments quoted_cmd = ' '.join( '"{}"'.format(arg) if ' ' in arg else arg for arg in cmd ) + # Prettify error output hints string and limit to five lines + hints = hints or getattr(err, 'hints', None) + if hints: + hints = hints if isinstance(hints, (list, tuple)) else hints.split('\n') + hints = ( + ' {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset']) + for line in hints[:5] if line.strip() + ) + else: + hints = () + output_lines = [ '{}Failed: {} {}{}'.format(ANSI['red'], err.__class__.__name__, err, ANSI['reset']), - ' {}{}{}'.format(ANSI['lightyellow'], hints, ANSI['reset']) if hints else None, + *hints, 'Run to see full output:' ' cd {};'.format(pwd), ' {}'.format(quoted_cmd), @@ -364,36 +410,21 @@ def merge_links(a, b): url = longer('url') longest_title = longer('title') cleanest_title = a['title'] if '://' not in (a['title'] or '') else b['title'] - link = { - 'timestamp': earlier('timestamp'), + return { 'url': url, - 'domain': domain(url), - 'base_url': base_url(url), - 'tags': longer('tags'), + 'timestamp': earlier('timestamp'), 'title': longest_title if '://' not in (longest_title or '') else cleanest_title, + 'tags': longer('tags'), 'sources': list(set(a.get('sources', []) + b.get('sources', []))), } - link['type'] = get_link_type(link) - return link -def get_link_type(link): - """Certain types of links need to be handled specially, this figures out when that's the case""" +def is_static_file(url): + """Certain URLs just point to a single static file, and + don't need to be re-archived in many formats + """ - if extension(link['url']) == 'pdf': - return 'PDF' - elif extension(link['url']) in ('pdf', 'png', 'jpg', 'jpeg', 'svg', 'bmp', 'gif', 'tiff', 'webp'): - return 'image' - elif 'wikipedia.org' in domain(link['url']).lower(): - return 'wiki' - elif 'youtube.com' in domain(link['url']).lower(): - return 'youtube' - elif 'soundcloud.com' in domain(link['url']).lower(): - return 'soundcloud' - elif 'youku.com' in domain(link['url']).lower(): - return 'youku' - elif 'vimeo.com' in domain(link['url']).lower(): - return 'vimeo' - return None + # TODO: the proper way is with MIME type detection, not using extension + return extension(url) in STATICFILE_EXTENSIONS def derived_link_info(link): """extend link info with the archive urls and other derived data""" @@ -410,7 +441,9 @@ def derived_link_info(link): 'domain': domain(url), 'path': path(url), 'basename': basename(url), + 'extension': extension(url), 'base_url': base_url(url), + 'is_static': is_static_file(url), 'is_archived': os.path.exists(os.path.join( ARCHIVE_DIR, link['timestamp'], @@ -420,8 +453,7 @@ def derived_link_info(link): } # Archive Method Output URLs - extended_info = { - **extended_info, + extended_info.update({ 'index_url': 'index.html', 'favicon_url': 'favicon.ico', 'google_favicon_url': 'https://www.google.com/s2/favicons?domain={domain}'.format(**extended_info), @@ -433,14 +465,13 @@ def derived_link_info(link): 'archive_org_url': 'https://web.archive.org/web/{base_url}'.format(**extended_info), 'git_url': 'git', 'media_url': 'media', - - } - - # PDF and images are handled slightly differently - # wget, screenshot, & pdf urls all point to the same file - if link['type'] in ('PDF', 'image'): + }) + # static binary files like PDF and images are handled slightly differently. 
+ # they're just downloaded once and aren't archived separately multiple times, + # so the wget, screenshot, & pdf urls should all point to the same file + if is_static_file(url): extended_info.update({ - 'title': basename(link['url']), + 'title': basename(url), 'archive_url': base_url(url), 'pdf_url': base_url(url), 'screenshot_url': base_url(url),
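
Note on the approach above: the whole change hangs on treating "has a known static-file extension" as "can be downloaded as-is". A minimal, self-contained sketch of that idea follows; the extension() and is_static_file() helpers here are simplified stand-ins for the ones in archivebox/util.py (assumed behavior: last dot-suffix of the URL path, lowercased), and the extension set is deliberately trimmed, so treat it as an illustration rather than the exact implementation.

from urllib.parse import urlparse

# trimmed-down extension set, for illustration only (the full set lives in util.py)
STATICFILE_EXTENSIONS = {
    'gif', 'jpg', 'jpeg', 'png', 'svg', 'webp',
    'mp3', 'mp4', 'webm', 'mov', 'avi',
    'pdf', 'txt', 'css', 'js', 'json',
    'zip', 'gz', 'bz2', '7z', 'dmg', 'iso',
}

def extension(url):
    """simplified stand-in: lowercased extension of the URL path, '' if it has none"""
    filename = urlparse(url).path.rsplit('/', 1)[-1]
    return filename.rsplit('.', 1)[-1].lower() if '.' in filename else ''

def is_static_file(url):
    """True if the URL points at a single downloadable file rather than a page to render"""
    # as the TODO above notes, MIME-type detection would be the proper way to do this
    return extension(url) in STATICFILE_EXTENSIONS

assert is_static_file('https://example.com/report.pdf')
assert is_static_file('https://example.com/pic.JPG?size=large')
assert not is_static_file('https://example.com/article.html')      # html is never a static file
assert not is_static_file('https://example.com/download?id=123')   # no extension -> normal page

Extension sniffing stays a heuristic either way: a URL like the last one can still serve a PDF, and it simply falls through to the normal page-archiving path.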
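
On the wget_output_path() comments: the four documented cases can be summed up in a tiny function. expected_wget_path() below is hypothetical (nothing by that name exists in ArchiveBox); it only encodes the naming rules spelled out in the comment, relative to the link's archive folder, and ignores the query/fragment and shtml/htm complications the comment warns about.

from urllib.parse import urlparse

def expected_wget_path(url):
    """hypothetical helper: wget's output name for the simple documented cases only"""
    parsed = urlparse(url)
    path = parsed.path or '/'
    if path.endswith('/'):
        return parsed.netloc + path + 'index.html'   # directory-style URLs get index.html
    if '.' in path.rsplit('/', 1)[-1]:
        return parsed.netloc + path                  # already has an extension, kept as-is
    return parsed.netloc + path + '.html'            # bare paths get .html appended by -E

assert expected_wget_path('https://example.com') == 'example.com/index.html'
assert expected_wget_path('https://example.com/abc') == 'example.com/abc.html'
assert expected_wget_path('https://example.com/abc/') == 'example.com/abc/index.html'
assert expected_wget_path('https://example.com/abc/test.html') == 'example.com/abc/test.html'

The real wget_output_path() deliberately avoids emulating these rules and instead searches the output folder for *.html files, which is why its fallback loop walks up the directory tree one level at a time.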