
remove flawed link_type concept in favor of simpler staticfile detection

commit 5ee1c39720 (parent c79e1df8b2)
Author: Nick Sweeting
Date:   2019-03-20 21:11:29 -04:00

4 changed files with 107 additions and 79 deletions
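In short: instead of labelling each link with a type like 'PDF', 'image', or 'youtube' via get_link_type(), the code now asks one question, does the URL end in a known static-file extension? A rough standalone sketch of that idea follows; the trimmed extension set and the extension() helper here are illustrative stand-ins, not the actual util.py implementations added below.

    # Rough sketch only: a stand-in extension() and a shortened extension set,
    # not the real helpers added to util.py in this commit.
    from urllib.parse import urlparse

    STATICFILE_EXTENSIONS = {'pdf', 'png', 'jpg', 'jpeg', 'gif', 'svg', 'mp3', 'mp4', 'zip'}

    def extension(url):
        """Lower-cased extension of the URL path, or '' if the last segment has none."""
        last_segment = urlparse(url).path.rsplit('/', 1)[-1]
        return last_segment.rsplit('.', 1)[-1].lower() if '.' in last_segment else ''

    def is_static_file(url):
        # mirrors the new check: extension membership instead of per-type branches
        return extension(url) in STATICFILE_EXTENSIONS

    assert is_static_file('https://example.com/files/report.pdf')
    assert not is_static_file('https://example.com/article?id=123')
    assert not is_static_file('https://en.wikipedia.org/wiki/Darwin')

The real helper introduced in util.py below defers to the module's own extension() function and the full STATICFILE_EXTENSIONS set.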

View file

@@ -224,6 +224,7 @@ def write_html_link_index(out_dir, link):
                 wget_output_path(link)
                 or (link['domain'] if link['is_archived'] else 'about:blank')
             ),
+            'extension': link['extension'] or 'HTML',
         }))
 
     chmod_file(path)

View file

@@ -10,8 +10,8 @@ Parsed link schema: {
     'url': 'https://example.com/example/?abc=123&xyc=345#lmnop',
     'timestamp': '15442123124234',
     'title': 'Example.com Page Title',
-    'tags': 'abc,def',
     'sources': ['ril_export.html', 'downloads/getpocket.com.txt'],
+    'tags': 'abc,def',
 }
 """
@@ -25,7 +25,6 @@ import xml.etree.ElementTree as etree
 from config import ANSI
 from util import (
     str_between,
-    get_link_type,
     URL_REGEX,
     check_url_parsing,
 )
@@ -69,17 +68,18 @@ def parse_pocket_html_export(html_file):
         # <li><a href="http://example.com/ time_added="1478739709" tags="tag1,tag2">example title</a></li>
         match = pattern.search(line)
         if match:
-            fixed_url = match.group(1).replace('http://www.readability.com/read?url=', '')  # remove old readability prefixes to get original url
+            url = match.group(1).replace('http://www.readability.com/read?url=', '')  # remove old readability prefixes to get original url
             time = datetime.fromtimestamp(float(match.group(2)))
-            info = {
-                'url': fixed_url,
+            tags = match.group(3)
+            title = match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '')
+
+            yield {
+                'url': url,
                 'timestamp': str(time.timestamp()),
-                'tags': match.group(3),
-                'title': match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '') or None,
+                'title': title or None,
+                'tags': tags or '',
                 'sources': [html_file.name],
             }
-            info['type'] = get_link_type(info)
-            yield info
 
 
 def parse_pinboard_json_export(json_file):
     """Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""
@@ -106,14 +106,14 @@ def parse_pinboard_json_export(json_file):
             title = (erg.get('description') or '').replace(' — Readability', '')
         else:
             title = erg['title'].strip()
 
         info = {
             'url': url,
             'timestamp': timestamp,
-            'tags': erg.get('tags') or '',
             'title': title or None,
+            'tags': erg.get('tags') or '',
             'sources': [json_file.name],
         }
-        info['type'] = get_link_type(info)
+
         yield info
@@ -144,16 +144,13 @@ def parse_rss_export(rss_file):
         ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
         time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")
 
-        info = {
+        yield {
             'url': url,
             'timestamp': str(time.timestamp()),
-            'tags': '',
             'title': title or None,
+            'tags': '',
             'sources': [rss_file.name],
         }
-        info['type'] = get_link_type(info)
-        yield info
 
 
 def parse_shaarli_rss_export(rss_file):
@@ -184,16 +181,14 @@ def parse_shaarli_rss_export(rss_file):
         ts_str = str_between(get_row('published'), '<published>', '</published>')
         time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
 
-        info = {
+        yield {
             'url': url,
             'timestamp': str(time.timestamp()),
-            'tags': '',
             'title': title or None,
+            'tags': '',
             'sources': [rss_file.name],
         }
-        info['type'] = get_link_type(info)
-        yield info
 
 
 def parse_netscape_html_export(html_file):
     """Parse netscape-format bookmarks export files (produced by all browsers)"""
@@ -209,16 +204,14 @@ def parse_netscape_html_export(html_file):
             url = match.group(1)
             time = datetime.fromtimestamp(float(match.group(2)))
 
-            info = {
+            yield {
                 'url': url,
                 'timestamp': str(time.timestamp()),
-                'tags': "",
                 'title': match.group(3).strip() or None,
+                'tags': '',
                 'sources': [html_file.name],
             }
-            info['type'] = get_link_type(info)
-            yield info
 
 
 def parse_pinboard_rss_export(rss_file):
     """Parse Pinboard RSS feed files into links"""
@@ -237,18 +230,22 @@ def parse_pinboard_rss_export(rss_file):
         # Pinboard includes a colon in its date stamp timezone offsets, which
         # Python can't parse. Remove it:
-        if ":" == ts_str[-3:-2]:
+        if ts_str and ts_str[-3:-2] == ":":
             ts_str = ts_str[:-3]+ts_str[-2:]
-        time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
-        info = {
+
+        if ts_str:
+            time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
+        else:
+            time = datetime.now()
+
+        yield {
             'url': url,
             'timestamp': str(time.timestamp()),
-            'tags': tags or '',
             'title': title or None,
+            'tags': tags or '',
             'sources': [rss_file.name],
         }
-        info['type'] = get_link_type(info)
-        yield info
 
 
 def parse_medium_rss_export(rss_file):
     """Parse Medium RSS feed files into links"""
@@ -263,15 +260,14 @@ def parse_medium_rss_export(rss_file):
         title = item.find("title").text.strip()
         ts_str = item.find("pubDate").text
         time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %Z")
 
-        info = {
+        yield {
             'url': url,
             'timestamp': str(time.timestamp()),
-            'tags': '',
             'title': title or None,
+            'tags': '',
             'sources': [rss_file.name],
         }
-        info['type'] = get_link_type(info)
-        yield info
 
 
 def parse_plain_text_export(text_file):
@@ -285,15 +281,15 @@ def parse_plain_text_export(text_file):
         for url in urls:
             url = url.strip()
 
-            info = {
+            time = datetime.now()
+
+            yield {
                 'url': url,
-                'timestamp': str(datetime.now().timestamp()),
-                'tags': '',
+                'timestamp': str(time.timestamp()),
                 'title': None,
+                'tags': '',
                 'sources': [text_file.name],
            }
-            info['type'] = get_link_type(info)
-            yield info
 
 
 PARSERS = OrderedDict([

View file

@@ -194,8 +194,8 @@
                 Last updated: <small title="Timestamp: $updated">$updated_date</small>
             </div>
             <div class="col-lg-4 alert well">
-                Metadata:
-                <span class="badge badge-default">$type</span>
+                Type:
+                <span class="badge badge-default">$extension</span>
                 &nbsp; | &nbsp;
                 Tags:
                 <span class="badge badge-success">$tags</span>
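The template change pairs with the new 'extension' key written by the link index writer above: the badge that used to show $type now shows the file extension, falling back to 'HTML'. A minimal illustration using string.Template with a made-up link dict, not the actual template rendering code:

    # Illustration only: how the renamed placeholder gets filled, mirroring the
    # 'extension': link['extension'] or 'HTML' line added to the index writer.
    from string import Template

    badge = Template('Type: <span class="badge badge-default">$extension</span>')

    link = {'extension': ''}          # hypothetical link dict for a plain web page
    print(badge.substitute(extension=link['extension'] or 'HTML'))
    # Type: <span class="badge badge-default">HTML</span>

    link = {'extension': 'pdf'}       # and for a static file
    print(badge.substitute(extension=link['extension'] or 'HTML'))
    # Type: <span class="badge badge-default">pdf</span>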

View file

@@ -70,6 +70,26 @@ HTML_TITLE_REGEX = re.compile(
     r'(.[^<>]+)', # get everything up to these symbols
     re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE,
 )
+STATICFILE_EXTENSIONS = {
+    # 99.999% of the time, URLs ending in these extentions are static files
+    # that can be downloaded as-is, not html pages that need to be rendered
+    'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp',
+    'svg', 'svgz', 'webp', 'ps', 'eps', 'ai',
+    'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v', 'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8'
+    'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx',
+    'atom', 'rss', 'css', 'js', 'json',
+    'dmg', 'iso', 'img',
+    'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z',
+
+    # Less common extensions to consider adding later
+    # jar, swf, bin, com, exe, dll, deb
+    # ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm,
+    # pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
+    # ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml
+
+    # Thse are always treated as pages, not as static files, never add them:
+    # html, htm, shtml, xhtml, xml, aspx, php, cgi
+}
 
 
 ### Checks & Tests
@@ -225,6 +245,7 @@ def save_remote_source(url, timeout=TIMEOUT):
 
 def fetch_page_title(url, timeout=10, progress=SHOW_PROGRESS):
     """Attempt to guess a page's title by downloading the html"""
     if not FETCH_TITLE:
         return None
@@ -257,8 +278,8 @@ def wget_output_path(link):
     urlencode = lambda s: quote(s, encoding='utf-8', errors='replace')
 
-    if link['type'] in ('PDF', 'image'):
-        return urlencode(base_url(link['url']))
+    if is_static_file(link['url']):
+        return urlencode(without_scheme(without_fragment(link['url'])))
 
     # Since the wget algorithm to for -E (appending .html) is incredibly complex
     # instead of trying to emulate it here, we just look in the output folder
@@ -271,6 +292,18 @@ def wget_output_path(link):
             full_path,
         )
 
+    # Wget downloads can save in a number of different ways depending on the url
+    #    https://example.com
+    #       > output/archive/<timestamp>/example.com/index.html
+    #    https://example.com/abc
+    #       > output/archive/<timestamp>/example.com/abc.html
+    #    https://example.com/abc/
+    #       > output/archive/<timestamp>/example.com/abc/index.html
+    #    https://example.com/abc/test.html
+    #       > output/archive/<timestamp>/example.com/abc/test.html
+    # There's also lots of complexity around how the urlencoding and renaming
+    # is done for pages with query and hash fragments or extensions like shtml / htm
+
     for _ in range(4):
         if os.path.exists(search_dir):
             if os.path.isdir(search_dir):
@@ -279,8 +312,8 @@ def wget_output_path(link):
                     if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", f, re.I | re.M)
                 ]
                 if html_files:
-                    relative_path = search_dir.split(link_dir)[-1].strip('/')
-                    return urlencode(os.path.join(relative_path, html_files[0]))
+                    path_from_link_dir = search_dir.split(link_dir)[-1].strip('/')
+                    return urlencode(os.path.join(path_from_link_dir, html_files[0]))
 
         # Move up one directory level
         search_dir = search_dir.rsplit('/', 1)[0]
@@ -327,19 +360,32 @@ def pretty_path(path):
     """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
     return path.replace(REPO_DIR + '/', '')
 
 
 def print_error_hints(cmd, pwd, err=None, hints=None, prefix=' '):
     """quote the argument with whitespace in a command so the user can
        copy-paste the outputted string directly to run the cmd
     """
+    # Prettify CMD string and make it save to copy-paste by quoting arguments
     quoted_cmd = ' '.join(
         '"{}"'.format(arg) if ' ' in arg else arg
         for arg in cmd
     )
+
+    # Prettify error output hints string and limit to five lines
+    hints = hints or getattr(err, 'hints', None)
+    if hints:
+        hints = hints if isinstance(hints, (list, tuple)) else hints.split('\n')
+        hints = (
+            ' {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset'])
+            for line in hints[:5] if line.strip()
+        )
+    else:
+        hints = ()
+
     output_lines = [
         '{}Failed: {} {}{}'.format(ANSI['red'], err.__class__.__name__, err, ANSI['reset']),
-        ' {}{}{}'.format(ANSI['lightyellow'], hints, ANSI['reset']) if hints else None,
+        *hints,
         'Run to see full output:'
         ' cd {};'.format(pwd),
         ' {}'.format(quoted_cmd),
@@ -364,36 +410,21 @@ def merge_links(a, b):
     url = longer('url')
     longest_title = longer('title')
     cleanest_title = a['title'] if '://' not in (a['title'] or '') else b['title']
 
-    link = {
-        'timestamp': earlier('timestamp'),
+    return {
         'url': url,
-        'domain': domain(url),
-        'base_url': base_url(url),
-        'tags': longer('tags'),
+        'timestamp': earlier('timestamp'),
         'title': longest_title if '://' not in (longest_title or '') else cleanest_title,
+        'tags': longer('tags'),
         'sources': list(set(a.get('sources', []) + b.get('sources', []))),
     }
-    link['type'] = get_link_type(link)
-    return link
 
-def get_link_type(link):
-    """Certain types of links need to be handled specially, this figures out when that's the case"""
 
-    if extension(link['url']) == 'pdf':
-        return 'PDF'
-    elif extension(link['url']) in ('pdf', 'png', 'jpg', 'jpeg', 'svg', 'bmp', 'gif', 'tiff', 'webp'):
-        return 'image'
-    elif 'wikipedia.org' in domain(link['url']).lower():
-        return 'wiki'
-    elif 'youtube.com' in domain(link['url']).lower():
-        return 'youtube'
-    elif 'soundcloud.com' in domain(link['url']).lower():
-        return 'soundcloud'
-    elif 'youku.com' in domain(link['url']).lower():
-        return 'youku'
-    elif 'vimeo.com' in domain(link['url']).lower():
-        return 'vimeo'
-    return None
+def is_static_file(url):
+    """Certain URLs just point to a single static file, and
+       don't need to be re-archived in many formats
+    """
+    # TODO: the proper way is with MIME type detection, not using extension
+    return extension(url) in STATICFILE_EXTENSIONS
 
 
 def derived_link_info(link):
     """extend link info with the archive urls and other derived data"""
@@ -410,7 +441,9 @@ def derived_link_info(link):
         'domain': domain(url),
         'path': path(url),
         'basename': basename(url),
+        'extension': extension(url),
         'base_url': base_url(url),
+        'is_static': is_static_file(url),
         'is_archived': os.path.exists(os.path.join(
             ARCHIVE_DIR,
             link['timestamp'],
@@ -420,8 +453,7 @@ def derived_link_info(link):
     }
 
     # Archive Method Output URLs
-    extended_info = {
-        **extended_info,
+    extended_info.update({
         'index_url': 'index.html',
         'favicon_url': 'favicon.ico',
         'google_favicon_url': 'https://www.google.com/s2/favicons?domain={domain}'.format(**extended_info),
@@ -433,14 +465,13 @@ def derived_link_info(link):
         'archive_org_url': 'https://web.archive.org/web/{base_url}'.format(**extended_info),
         'git_url': 'git',
         'media_url': 'media',
-    }
+    })
 
-    # PDF and images are handled slightly differently
-    # wget, screenshot, & pdf urls all point to the same file
-    if link['type'] in ('PDF', 'image'):
+    # static binary files like PDF and images are handled slightly differently.
+    # they're just downloaded once and aren't archived separately multiple times,
+    # so the wget, screenshot, & pdf urls should all point to the same file
+    if is_static_file(url):
         extended_info.update({
-            'title': basename(link['url']),
+            'title': basename(url),
             'archive_url': base_url(url),
             'pdf_url': base_url(url),
             'screenshot_url': base_url(url),
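For static files, derived_link_info() now short-circuits all of the per-method output URLs to the single downloaded file. A loose sketch of that branch with hypothetical stand-in helpers, not the real base_url/basename from util.py:

    # Loose sketch with stand-in helpers showing how every output URL collapses
    # to the one downloaded copy when is_static_file(url) is true.
    from urllib.parse import urlparse

    def base_url(url):
        parts = urlparse(url)
        return parts.netloc + parts.path      # scheme-less path used as the output location

    def basename(url):
        return urlparse(url).path.rsplit('/', 1)[-1]

    def static_file_overrides(url):
        return {
            'title': basename(url),
            'archive_url': base_url(url),
            'pdf_url': base_url(url),
            'screenshot_url': base_url(url),
        }

    print(static_file_overrides('https://example.com/docs/report.pdf'))
    # {'title': 'report.pdf', 'archive_url': 'example.com/docs/report.pdf', ...}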