
better link corruption guards, remove title prefetching, save index after run

Nick Sweeting 2019-02-21 17:45:28 -05:00
parent c95632883e
commit b03e9fade8
6 changed files with 165 additions and 93 deletions

archive.py

@@ -7,34 +7,31 @@ import os
 import sys

 from datetime import datetime
-from subprocess import run
+from peekable import Peekable

 from parse import parse_links
-from links import validate_links
-from archive_methods import archive_links, _RESULTS_TOTALS
+from links import validate_links, links_after_timestamp
+from archive_methods import archive_link, _RESULTS_TOTALS
 from index import (
     write_links_index,
-    write_link_index,
     parse_json_links_index,
-    parse_json_link_index,
 )
 from config import (
+    ARCHIVE_DIR,
     ONLY_NEW,
-    OUTPUT_PERMISSIONS,
     OUTPUT_DIR,
     REPO_DIR,
     ANSI,
-    TIMEOUT,
-    SHOW_PROGRESS,
     GIT_SHA,
 )
 from util import (
+    check_dependencies,
     download_url,
     save_source,
-    progress,
-    cleanup_archive,
     pretty_path,
     migrate_data,
+    check_links_structure,
 )

 __AUTHOR__ = 'Nick Sweeting <git@nicksweeting.com>'
@@ -42,6 +39,7 @@ __VERSION__ = GIT_SHA
 __DESCRIPTION__ = 'ArchiveBox Usage: Create a browsable html archive of a list of links.'
 __DOCUMENTATION__ = 'https://github.com/pirate/ArchiveBox/wiki'

+
 def print_help():
     print(__DESCRIPTION__)
     print("Documentation: {}\n".format(__DOCUMENTATION__))
@@ -55,21 +53,22 @@ def print_help():
 def load_links(archive_path=OUTPUT_DIR, import_path=None):
     """get new links from file and optionally append them to links in existing archive"""

     existing_links = []
     if archive_path:
         existing_links = parse_json_links_index(archive_path)
+        check_links_structure(existing_links)

     new_links = []
     if import_path:
         # parse and validate the import file
         raw_links, parser_name = parse_links(import_path)
         new_links = validate_links(raw_links)
-        if SHOW_PROGRESS:
-            print()
+        check_links_structure(new_links)

     # merge existing links in archive_path and new links
     all_links = validate_links(existing_links + new_links)
+    check_links_structure(all_links)
     num_new_links = len(all_links) - len(existing_links)

     if import_path and parser_name:
@@ -81,6 +80,7 @@ def load_links(archive_path=OUTPUT_DIR, import_path=None):
     return all_links, new_links

+
 def update_archive(archive_path, links, source=None, resume=None, append=True):
     """update or create index.html+json given a path to an export file containing new links"""
@@ -99,8 +99,38 @@ def update_archive(archive_path, links, source=None, resume=None, append=True):
         **ANSI,
     ))

+    check_links_structure(links)
+
+    # prefetch the first link off the generator so that if we pause or fail
+    # immediately we can show that we paused on the first link and not just None
+    to_archive = Peekable(links_after_timestamp(links, resume))
+    idx, link = 0, to_archive.peek(0)
+
     # loop over links and archive them
-    archive_links(archive_path, links, source=source, resume=resume)
+    try:
+        check_dependencies()
+        for idx, link in enumerate(to_archive):
+            link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
+            archive_link(link_dir, link)
+
+    except (KeyboardInterrupt, SystemExit, Exception) as e:
+        print('\n{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format(
+            **ANSI,
+            now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+            idx=idx+1,
+            timestamp=link['timestamp'],
+            total=len(links),
+        ))
+        print('    To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))
+        print('    Continue where you left off by running:')
+        print('        {} {}'.format(
+            pretty_path(sys.argv[0]),
+            link['timestamp'],
+        ))
+        if not isinstance(e, KeyboardInterrupt):
+            print()
+            raise e
+        raise SystemExit(1)

     # print timing information & summary
     end_ts = datetime.now().timestamp()
@@ -135,7 +165,7 @@ if __name__ == '__main__':
     source = sys.argv[1] if argc > 1 else None    # path of links file to import
     resume = sys.argv[2] if argc > 2 else None    # timestamp to resume dowloading from

-    stdin_raw_text = []
+    stdin_raw_text = ''

     if not sys.stdin.isatty():
         stdin_raw_text = sys.stdin.read()
@@ -192,3 +222,7 @@ if __name__ == '__main__':
         update_archive(out_dir, new_links, source=source, resume=resume, append=True)
     else:
         update_archive(out_dir, all_links, source=source, resume=resume, append=True)
+
+    # Step 5: Re-write links index with updated titles, icons, and resources
+    all_links, _ = load_links(archive_path=out_dir)
+    write_links_index(out_dir=out_dir, links=all_links)

archive_methods.py

@@ -1,16 +1,17 @@
 import os
-import re
-import sys

 from functools import wraps
 from collections import defaultdict
 from datetime import datetime
-from peekable import Peekable

-from index import wget_output_path, parse_json_link_index, write_link_index
-from links import links_after_timestamp
+from index import (
+    wget_output_path,
+    parse_json_link_index,
+    write_link_index,
+    patch_index_title_hack,
+)
 from config import (
+    OUTPUT_DIR,
     CURL_BINARY,
     GIT_BINARY,
     WGET_BINARY,
@@ -42,12 +43,12 @@ from config import (
 )
 from util import (
     without_fragment,
-    check_dependencies,
     fetch_page_title,
     progress,
     chmod_file,
     pretty_path,
-    run, PIPE, DEVNULL
+    check_link_structure,
+    run, PIPE, DEVNULL,
 )
@@ -57,38 +58,12 @@ _RESULTS_TOTALS = {  # globals are bad, mmkay
     'failed': 0,
 }

-def archive_links(archive_path, links, source=None, resume=None):
-    check_dependencies()
-
-    to_archive = Peekable(links_after_timestamp(links, resume))
-    idx, link = 0, to_archive.peek(0)
-
-    try:
-        for idx, link in enumerate(to_archive):
-            link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
-            archive_link(link_dir, link)
-
-    except (KeyboardInterrupt, SystemExit, Exception) as e:
-        print('{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format(
-            **ANSI,
-            now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
-            idx=idx+1,
-            timestamp=link['timestamp'],
-            total=len(links),
-        ))
-        print('    Continue where you left off by running:')
-        print('        {} {}'.format(
-            pretty_path(sys.argv[0]),
-            link['timestamp'],
-        ))
-        if not isinstance(e, KeyboardInterrupt):
-            raise e
-        raise SystemExit(1)
-
-
 def archive_link(link_dir, link, overwrite=True):
     """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""

+    check_link_structure(link)
     try:
         update_existing = os.path.exists(link_dir)
         if update_existing:
@@ -99,7 +74,7 @@ def archive_link(link_dir, link, overwrite=True):
         else:
             os.makedirs(link_dir)

-        log_link_archive(link_dir, link, update_existing)
+        print_link_status_line(link_dir, link, update_existing)

         if FETCH_FAVICON:
             link = fetch_favicon(link_dir, link, overwrite=overwrite)
@@ -135,7 +110,7 @@ def archive_link(link_dir, link, overwrite=True):

     return link

-def log_link_archive(link_dir, link, update_existing):
+def print_link_status_line(link_dir, link, update_existing):
     print('[{symbol_color}{symbol}{reset}] [{now}] "{title}"\n {blue}{url}{reset}'.format(
         symbol='*' if update_existing else '+',
         symbol_color=ANSI['black' if update_existing else 'green'],
@@ -518,7 +493,7 @@ def fetch_title(link_dir, link, timeout=TIMEOUT):

     # if link already has valid title, skip it
     if link['title'] and not link['title'].lower().startswith('http'):
-        return {'output': link['title'], 'cmd': 'fetch_page_title("{}")'.format(link['url'])}
+        return {'output': link['title'], 'status': 'skipped'}

     end = progress(timeout, prefix=' ')
     try:
@@ -530,6 +505,13 @@ def fetch_title(link_dir, link, timeout=TIMEOUT):
         print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
         output = e

+    # titles should show up in the global index immediatley for better UX,
+    # do a hacky immediate replacement to add them in as we're archiving
+    # TODO: figure out how to do this without gnarly string replacement
+    if title:
+        link['title'] = title
+        patch_index_title_hack(link['url'], title)
+
     return {
         'cmd': 'fetch_page_title("{}")'.format(link['url']),
         'output': output,
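The skip guard in fetch_title() above treats any pre-existing title that starts with "http" as a placeholder left over from parsing rather than a real title. A quick illustration of the condition (hypothetical link values, not part of this commit):

    link = {'url': 'https://example.com/post', 'title': 'https://example.com/post'}
    link['title'] and not link['title'].lower().startswith('http')   # False -> title gets re-fetched

    link = {'url': 'https://example.com/post', 'title': 'Example Post'}
    link['title'] and not link['title'].lower().startswith('http')   # True  -> returns {'output': ..., 'status': 'skipped'}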

index.py

@@ -6,6 +6,7 @@ from string import Template
 from distutils.dir_util import copy_tree

 from config import (
+    OUTPUT_DIR,
     TEMPLATES_DIR,
     OUTPUT_PERMISSIONS,
     ANSI,
@@ -17,6 +18,8 @@ from util import (
     wget_output_path,
     derived_link_info,
     pretty_path,
+    check_link_structure,
+    check_links_structure,
 )
@@ -25,6 +28,8 @@ from util import (
 def write_links_index(out_dir, links):
     """create index.html file for a given list of links"""

+    check_links_structure(links)
+
     if not os.path.exists(out_dir):
         os.makedirs(out_dir)
@@ -42,6 +47,8 @@ def write_links_index(out_dir, links):
 def write_json_links_index(out_dir, links):
     """write the json link index to a given path"""

+    check_links_structure(links)
+
     path = os.path.join(out_dir, 'index.json')

     index_json = {
@@ -63,13 +70,17 @@ def parse_json_links_index(out_dir):
     index_path = os.path.join(out_dir, 'index.json')
     if os.path.exists(index_path):
         with open(index_path, 'r', encoding='utf-8') as f:
-            return json.load(f)['links']
+            links = json.load(f)['links']
+            check_links_structure(links)
+            return links

     return []

 def write_html_links_index(out_dir, links):
     """write the html link index to a given path"""

+    check_links_structure(links)
+
     path = os.path.join(out_dir, 'index.html')

     copy_tree(os.path.join(TEMPLATES_DIR, 'static'), os.path.join(out_dir, 'static'))
@@ -104,6 +115,25 @@ def write_html_links_index(out_dir, links):
     chmod_file(path)

+
+def patch_index_title_hack(link_url, new_title):
+    """hack to update just one link's title in the link index json"""
+
+    json_path = os.path.join(OUTPUT_DIR, 'index.json')
+
+    links = parse_json_links_index(OUTPUT_DIR)
+
+    changed = False
+    for link in links:
+        if link['url'] == link_url:
+            link['title'] = new_title
+            changed = True
+            break
+
+    if changed:
+        write_json_links_index(OUTPUT_DIR, links)
+
+
 ### Individual link index

 def write_link_index(out_dir, link):
@@ -114,6 +144,7 @@ def write_link_index(out_dir, link):
 def write_json_link_index(out_dir, link):
     """write a json file with some info about the link"""

+    check_link_structure(link)
     path = os.path.join(out_dir, 'index.json')

     print(' √ index.json')
@@ -128,10 +159,13 @@ def parse_json_link_index(out_dir):
     existing_index = os.path.join(out_dir, 'index.json')
     if os.path.exists(existing_index):
         with open(existing_index, 'r', encoding='utf-8') as f:
-            return json.load(f)
+            link_json = json.load(f)
+            check_link_structure(link_json)
+            return link_json

     return {}

 def write_html_link_index(out_dir, link):
+    check_link_structure(link)
     with open(os.path.join(TEMPLATES_DIR, 'link_index_fancy.html'), 'r', encoding='utf-8') as f:
         link_html = f.read()
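patch_index_title_hack() above is called from fetch_title() in archive_methods.py; a minimal usage sketch (hypothetical URL and title):

    # after a title is fetched mid-run:
    patch_index_title_hack('https://example.com/post', 'Example Post Title')
    # re-reads the index.json under OUTPUT_DIR, swaps the title on the first link whose
    # 'url' matches, and only re-writes the JSON index if something actually changed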

links.py

@@ -32,34 +32,33 @@ Link {
 """

-import datetime
 from html import unescape
 from collections import OrderedDict

 from util import (
-    domain,
-    base_url,
-    str_between,
-    get_link_type,
     merge_links,
     wget_output_path,
+    check_link_structure,
+    check_links_structure,
 )
-from config import ANSI


 def validate_links(links):
+    check_links_structure(links)
+
     links = archivable_links(links)      # remove chrome://, about:, mailto: etc.
     links = uniquefied_links(links)      # merge/dedupe duplicate timestamps & urls
     links = sorted_links(links)          # deterministically sort the links based on timstamp, url

     if not links:
         print('[X] No links found :(')
         raise SystemExit(1)

     for link in links:
+        check_link_structure(link)
         link['title'] = unescape(link['title']) if link['title'] else None
         link['latest'] = link.get('latest') or {}

         latest = link['latest']
         if not link['latest'].get('wget'):
             link['latest']['wget'] = wget_output_path(link)
@@ -81,14 +80,16 @@ def validate_links(links):
     return list(links)

+
 def archivable_links(links):
     """remove chrome://, about:// or other schemed links that cant be archived"""
     return (
         link
         for link in links
-        if any(link['url'].startswith(s) for s in ('http://', 'https://', 'ftp://'))
+        if any(link['url'].lower().startswith(s) for s in ('http://', 'https://', 'ftp://'))
     )

+
 def uniquefied_links(sorted_links):
     """
     ensures that all non-duplicate links have monotonically increasing timestamps
@@ -114,10 +115,12 @@ def uniquefied_links(sorted_links):

     return unique_timestamps.values()

+
 def sorted_links(links):
     sort_func = lambda link: (link['timestamp'].split('.', 1)[0], link['url'])
     return sorted(links, key=sort_func, reverse=True)

+
 def links_after_timestamp(links, timestamp=None):
     if not timestamp:
         yield from links
@@ -130,6 +133,7 @@ def links_after_timestamp(links, timestamp=None):
         except (ValueError, TypeError):
             print('Resume value and all timestamp values must be valid numbers.')

+
 def lowest_uniq_timestamp(used_timestamps, timestamp):
     """resolve duplicate timestamps by appending a decimal 1234, 1234 -> 1234.1, 1234.2"""

parse.py

@@ -20,7 +20,6 @@ Parsed link schema: {
 import re
 import sys
 import json
-import urllib

 from collections import OrderedDict
 import xml.etree.ElementTree as etree
@@ -32,7 +31,6 @@ from util import (
     base_url,
     str_between,
     get_link_type,
-    fetch_page_title,
     URL_REGEX,
 )
@@ -56,13 +54,11 @@ def parse_links(path):
     links = []
     with open(path, 'r', encoding='utf-8') as file:
-        print('{green}[*] [{}] Parsing new links from output/sources/{} and fetching titles...{reset}'.format(
+        print('{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'.format(
             datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
             path.rsplit('/', 1)[-1],
             **ANSI,
         ))
-        if SHOW_PROGRESS:
-            sys.stdout.write(' ')

         for parser_name, parser_func in get_parsers(file).items():
             # otherwise try all parsers until one works
@@ -98,7 +94,7 @@ def parse_pocket_html_export(html_file):
             'base_url': base_url(fixed_url),
             'timestamp': str(time.timestamp()),
             'tags': match.group(3),
-            'title': match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '') or fetch_page_title(fixed_url),
+            'title': match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '') or None,
             'sources': [html_file.name],
         }
         info['type'] = get_link_type(info)
@@ -135,7 +131,7 @@ def parse_pinboard_json_export(json_file):
             'base_url': base_url(url),
             'timestamp': timestamp,
             'tags': erg.get('tags') or '',
-            'title': title or fetch_page_title(url),
+            'title': title or None,
             'sources': [json_file.name],
         }
         info['type'] = get_link_type(info)
@@ -174,7 +170,7 @@ def parse_rss_export(rss_file):
             'base_url': base_url(url),
             'timestamp': str(time.timestamp()),
             'tags': '',
-            'title': title or fetch_page_title(url),
+            'title': title or None,
             'sources': [rss_file.name],
         }
         info['type'] = get_link_type(info)
@@ -215,7 +211,7 @@ def parse_shaarli_rss_export(rss_file):
             'base_url': base_url(url),
             'timestamp': str(time.timestamp()),
             'tags': '',
-            'title': title or fetch_page_title(url),
+            'title': title or None,
             'sources': [rss_file.name],
         }
         info['type'] = get_link_type(info)
@@ -242,7 +238,7 @@ def parse_netscape_html_export(html_file):
             'base_url': base_url(url),
             'timestamp': str(time.timestamp()),
             'tags': "",
-            'title': match.group(3).strip() or fetch_page_title(url),
+            'title': match.group(3).strip() or None,
             'sources': [html_file.name],
         }
         info['type'] = get_link_type(info)
@@ -275,7 +271,7 @@ def parse_pinboard_rss_export(rss_file):
             'base_url': base_url(url),
             'timestamp': str(time.timestamp()),
             'tags': tags,
-            'title': title or fetch_page_title(url),
+            'title': title or None,
             'sources': [rss_file.name],
         }
         info['type'] = get_link_type(info)
@@ -300,7 +296,7 @@ def parse_medium_rss_export(rss_file):
             'base_url': base_url(url),
             'timestamp': str(time.timestamp()),
             'tags': '',
-            'title': title or fetch_page_title(url),
+            'title': title or None,
             'sources': [rss_file.name],
         }
         info['type'] = get_link_type(info)
@@ -324,7 +320,7 @@ def parse_plain_text_export(text_file):
             'base_url': base_url(url),
             'timestamp': str(datetime.now().timestamp()),
             'tags': '',
-            'title': fetch_page_title(url),
+            'title': None,
             'sources': [text_file.name],
         }
         info['type'] = get_link_type(info)
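With title prefetching removed, every parser above now leaves 'title' empty unless the export file itself contains one; a representative parsed entry (hypothetical values; fields such as 'url' and 'type' are set outside the hunks shown) looks like:

    {
        'url': 'https://example.com/post',
        'base_url': 'example.com/post',
        'timestamp': '1550793928.0',
        'tags': '',
        'title': None,    # now filled in later by fetch_title() while archiving
        'sources': ['output/sources/bookmarks_export.html'],
    }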

util.py

@@ -3,8 +3,7 @@ import re
 import sys
 import time
 import json
-import signal
-from urllib.request import urlopen
+from urllib.request import Request, urlopen
 from urllib.parse import urlparse
 from decimal import Decimal
@@ -25,6 +24,7 @@ from config import (
     TIMEOUT,
     SHOW_PROGRESS,
     CHECK_SSL_VALIDITY,
+    WGET_USER_AGENT,
     CURL_BINARY,
     WGET_BINARY,
     CHROME_BINARY,
@@ -219,7 +219,21 @@ def save_source(raw_text):

     return source_path


-def download_url(url):
+def fetch_page_content(url, timeout=TIMEOUT):
+    req = Request(url, headers={'User-Agent': WGET_USER_AGENT})
+
+    if CHECK_SSL_VALIDITY:
+        resp = urlopen(req, timeout=timeout)
+    else:
+        import ssl
+        insecure = ssl._create_unverified_context()
+        resp = urlopen(req, timeout=timeout, context=insecure)
+
+    encoding = resp.headers.get_content_charset() or 'utf-8'
+    return resp.read().decode(encoding)
+
+
+def download_url(url, timeout=TIMEOUT):
     """download a given url's content into downloads/domain.txt"""

     if not os.path.exists(SOURCES_DIR):
@@ -236,7 +250,7 @@ def download_url(url):
     ))
     end = progress(TIMEOUT, prefix=' ')
     try:
-        downloaded_xml = urlopen(url).read().decode('utf-8')
+        downloaded_xml = fetch_page_content(url, timeout=timeout)
         end()
     except Exception as e:
         end()
@@ -260,19 +274,15 @@ def fetch_page_title(url, timeout=10, progress=SHOW_PROGRESS):
             sys.stdout.write('.')
             sys.stdout.flush()

-        if CHECK_SSL_VALIDITY:
-            html_content = urlopen(url, timeout=timeout)
-        else:
-            try:
-                import ssl
-                insecure = ssl._create_unverified_context()
-                html_content = urlopen(url, timeout=timeout, context=insecure)
-            except ImportError:
-                html_content = urlopen(url, timeout=timeout)
+        html = fetch_page_content(url, timeout=timeout)

-        match = re.search(HTML_TITLE_REGEX, html_content.read().decode('utf-8'))
+        match = re.search(HTML_TITLE_REGEX, html)
         return match.group(1).strip() if match else None
-    except Exception:
+    except Exception as err:
+        # print('[!] Failed to fetch title because of {}: {}'.format(
+        #     err.__class__.__name__,
+        #     err,
+        # ))
         return None
@@ -603,3 +613,15 @@ def run(*popenargs, input=None, capture_output=False, timeout=None, check=False,
             raise CalledProcessError(retcode, process.args,
                                      output=stdout, stderr=stderr)
     return CompletedProcess(process.args, retcode, stdout, stderr)
+
+
+def check_link_structure(link):
+    assert isinstance(link, dict)
+    assert isinstance(link.get('url'), str)
+    assert len(link['url']) > 2
+
+
+def check_links_structure(links):
+    assert isinstance(links, list)
+    if links:
+        check_link_structure(links[0])
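The two guards that close out util.py are intentionally cheap assertions rather than full schema validation: check_links_structure() only spot-checks the first item. A minimal sketch of how the rest of this commit uses them alongside fetch_page_content() (hypothetical directory and URL):

    links = parse_json_links_index('output')    # now runs check_links_structure() internally
    check_links_structure(links)                # update_archive() and validate_links() also guard explicitly

    html = fetch_page_content('https://example.com', timeout=60)   # sends WGET_USER_AGENT, honors CHECK_SSL_VALIDITY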