diff --git a/archivebox/archive.py b/archivebox/archive.py
index e5056cf9..efe23a9f 100755
--- a/archivebox/archive.py
+++ b/archivebox/archive.py
@@ -7,34 +7,31 @@
 import os
 import sys
 
 from datetime import datetime
-from subprocess import run
+from peekable import Peekable
+
 from parse import parse_links
-from links import validate_links
-from archive_methods import archive_links, _RESULTS_TOTALS
+from links import validate_links, links_after_timestamp
+from archive_methods import archive_link, _RESULTS_TOTALS
 from index import (
     write_links_index,
-    write_link_index,
     parse_json_links_index,
-    parse_json_link_index,
 )
 from config import (
+    ARCHIVE_DIR,
     ONLY_NEW,
-    OUTPUT_PERMISSIONS,
     OUTPUT_DIR,
     REPO_DIR,
     ANSI,
-    TIMEOUT,
-    SHOW_PROGRESS,
     GIT_SHA,
 )
 from util import (
+    check_dependencies,
     download_url,
     save_source,
-    progress,
-    cleanup_archive,
     pretty_path,
     migrate_data,
+    check_links_structure,
 )
 
 __AUTHOR__ = 'Nick Sweeting '
@@ -42,6 +39,7 @@ __VERSION__ = GIT_SHA
 __DESCRIPTION__ = 'ArchiveBox Usage: Create a browsable html archive of a list of links.'
 __DOCUMENTATION__ = 'https://github.com/pirate/ArchiveBox/wiki'
 
+
 def print_help():
     print(__DESCRIPTION__)
     print("Documentation: {}\n".format(__DOCUMENTATION__))
@@ -55,21 +53,22 @@ def print_help():
 
 def load_links(archive_path=OUTPUT_DIR, import_path=None):
     """get new links from file and optionally append them to links in existing archive"""
-    
+
     existing_links = []
     if archive_path:
         existing_links = parse_json_links_index(archive_path)
+        check_links_structure(existing_links)
 
     new_links = []
     if import_path:
         # parse and validate the import file
         raw_links, parser_name = parse_links(import_path)
         new_links = validate_links(raw_links)
-        if SHOW_PROGRESS:
-            print()
+        check_links_structure(new_links)
 
     # merge existing links in archive_path and new links
     all_links = validate_links(existing_links + new_links)
+    check_links_structure(all_links)
     num_new_links = len(all_links) - len(existing_links)
 
     if import_path and parser_name:
@@ -81,6 +80,7 @@ def load_links(archive_path=OUTPUT_DIR, import_path=None):
 
     return all_links, new_links
 
+
 def update_archive(archive_path, links, source=None, resume=None, append=True):
     """update or create index.html+json given a path to an export file containing new links"""
 
@@ -99,8 +99,38 @@ def update_archive(archive_path, links, source=None, resume=None, append=True):
         **ANSI,
     ))
 
+    check_links_structure(links)
+
+    # prefetch the first link off the generator so that if we pause or fail
+    # immediately we can show that we paused on the first link and not just None
+    to_archive = Peekable(links_after_timestamp(links, resume))
+    idx, link = 0, to_archive.peek(0)
+
     # loop over links and archive them
-    archive_links(archive_path, links, source=source, resume=resume)
+    try:
+        check_dependencies()
+        for idx, link in enumerate(to_archive):
+            link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
+            archive_link(link_dir, link)
+
+    except (KeyboardInterrupt, SystemExit, Exception) as e:
+        print('\n{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format(
+            **ANSI,
+            now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+            idx=idx+1,
+            timestamp=link['timestamp'],
+            total=len(links),
+        ))
+        print(' To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))
+        print(' Continue where you left off by running:')
+        print(' {} {}'.format(
+            pretty_path(sys.argv[0]),
+            link['timestamp'],
+        ))
+        if not isinstance(e, KeyboardInterrupt):
+            print()
+            raise e
+        raise SystemExit(1)
 
     # print timing information & summary
     end_ts = datetime.now().timestamp()
@@ -135,7 +165,7 @@ if __name__ == '__main__':
     source = sys.argv[1] if argc > 1 else None # path of links file to import
     resume = sys.argv[2] if argc > 2 else None # timestamp to resume dowloading from
 
-    stdin_raw_text = []
+    stdin_raw_text = ''
 
     if not sys.stdin.isatty():
         stdin_raw_text = sys.stdin.read()
@@ -192,3 +222,7 @@ if __name__ == '__main__':
         update_archive(out_dir, new_links, source=source, resume=resume, append=True)
     else:
         update_archive(out_dir, all_links, source=source, resume=resume, append=True)
+
+    # Step 5: Re-write links index with updated titles, icons, and resources
+    all_links, _ = load_links(archive_path=out_dir)
+    write_links_index(out_dir=out_dir, links=all_links)
diff --git a/archivebox/archive_methods.py b/archivebox/archive_methods.py
index d085871d..47927937 100644
--- a/archivebox/archive_methods.py
+++ b/archivebox/archive_methods.py
@@ -1,16 +1,17 @@
 import os
-import re
-import sys
 
 from functools import wraps
 from collections import defaultdict
 from datetime import datetime
 
-from peekable import Peekable
-
-from index import wget_output_path, parse_json_link_index, write_link_index
-from links import links_after_timestamp
+from index import (
+    wget_output_path,
+    parse_json_link_index,
+    write_link_index,
+    patch_index_title_hack,
+)
 from config import (
+    OUTPUT_DIR,
     CURL_BINARY,
     GIT_BINARY,
     WGET_BINARY,
@@ -42,12 +43,12 @@ from config import (
 )
 from util import (
     without_fragment,
-    check_dependencies,
     fetch_page_title,
     progress,
     chmod_file,
     pretty_path,
-    run, PIPE, DEVNULL
+    check_link_structure,
+    run, PIPE, DEVNULL,
 )
 
 
@@ -57,38 +58,12 @@ _RESULTS_TOTALS = { # globals are bad, mmkay
     'failed': 0,
 }
 
-def archive_links(archive_path, links, source=None, resume=None):
-    check_dependencies()
-
-    to_archive = Peekable(links_after_timestamp(links, resume))
-    idx, link = 0, to_archive.peek(0)
-
-    try:
-        for idx, link in enumerate(to_archive):
-            link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
-            archive_link(link_dir, link)
-
-    except (KeyboardInterrupt, SystemExit, Exception) as e:
-        print('{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format(
-            **ANSI,
-            now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
-            idx=idx+1,
-            timestamp=link['timestamp'],
-            total=len(links),
-        ))
-        print(' Continue where you left off by running:')
-        print(' {} {}'.format(
-            pretty_path(sys.argv[0]),
-            link['timestamp'],
-        ))
-        if not isinstance(e, KeyboardInterrupt):
-            raise e
-        raise SystemExit(1)
-
 
 def archive_link(link_dir, link, overwrite=True):
     """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
 
+    check_link_structure(link)
+
     try:
         update_existing = os.path.exists(link_dir)
         if update_existing:
@@ -99,7 +74,7 @@ def archive_link(link_dir, link, overwrite=True):
         else:
             os.makedirs(link_dir)
 
-        log_link_archive(link_dir, link, update_existing)
+        print_link_status_line(link_dir, link, update_existing)
 
         if FETCH_FAVICON:
             link = fetch_favicon(link_dir, link, overwrite=overwrite)
@@ -135,7 +110,7 @@ def archive_link(link_dir, link, overwrite=True):
 
     return link
 
-def log_link_archive(link_dir, link, update_existing):
+def print_link_status_line(link_dir, link, update_existing):
     print('[{symbol_color}{symbol}{reset}] [{now}] "{title}"\n {blue}{url}{reset}'.format(
         symbol='*' if update_existing else '+',
         symbol_color=ANSI['black' if update_existing else 'green'],
@@ -518,7 +493,7 @@ def fetch_title(link_dir, link, timeout=TIMEOUT):
 
     # if link already has valid title, skip it
    if link['title'] and not link['title'].lower().startswith('http'):
-        return {'output': link['title'], 'cmd': 'fetch_page_title("{}")'.format(link['url'])}
+        return {'output': link['title'], 'status': 'skipped'}
 
     end = progress(timeout, prefix=' ')
     try:
@@ -530,6 +505,13 @@ def fetch_title(link_dir, link, timeout=TIMEOUT):
         print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
         output = e
 
+    # titles should show up in the global index immediatley for better UX,
+    # do a hacky immediate replacement to add them in as we're archiving
+    # TODO: figure out how to do this without gnarly string replacement
+    if title:
+        link['title'] = title
+        patch_index_title_hack(link['url'], title)
+
     return {
         'cmd': 'fetch_page_title("{}")'.format(link['url']),
         'output': output,
diff --git a/archivebox/index.py b/archivebox/index.py
index 26811995..1aa661e9 100644
--- a/archivebox/index.py
+++ b/archivebox/index.py
@@ -6,6 +6,7 @@ from string import Template
 from distutils.dir_util import copy_tree
 
 from config import (
+    OUTPUT_DIR,
     TEMPLATES_DIR,
     OUTPUT_PERMISSIONS,
     ANSI,
@@ -17,6 +18,8 @@ from util import (
     wget_output_path,
     derived_link_info,
     pretty_path,
+    check_link_structure,
+    check_links_structure,
 )
 
 
@@ -25,6 +28,8 @@
 def write_links_index(out_dir, links):
     """create index.html file for a given list of links"""
 
+    check_links_structure(links)
+
     if not os.path.exists(out_dir):
         os.makedirs(out_dir)
 
@@ -42,6 +47,8 @@ def write_links_index(out_dir, links):
 def write_json_links_index(out_dir, links):
     """write the json link index to a given path"""
 
+    check_links_structure(links)
+
     path = os.path.join(out_dir, 'index.json')
 
     index_json = {
@@ -63,13 +70,17 @@
     index_path = os.path.join(out_dir, 'index.json')
     if os.path.exists(index_path):
         with open(index_path, 'r', encoding='utf-8') as f:
-            return json.load(f)['links']
+            links = json.load(f)['links']
+            check_links_structure(links)
+            return links
     return []
 
 
 def write_html_links_index(out_dir, links):
     """write the html link index to a given path"""
 
+    check_links_structure(links)
+
     path = os.path.join(out_dir, 'index.html')
 
     copy_tree(os.path.join(TEMPLATES_DIR, 'static'), os.path.join(out_dir, 'static'))
@@ -104,6 +115,25 @@ def write_html_links_index(out_dir, links):
     chmod_file(path)
 
 
+def patch_index_title_hack(link_url, new_title):
+    """hack to update just one link's title in the link index json"""
+
+    json_path = os.path.join(OUTPUT_DIR, 'index.json')
+
+    links = parse_json_links_index(OUTPUT_DIR)
+
+    changed = False
+    for link in links:
+        if link['url'] == link_url:
+            link['title'] = new_title
+            changed = True
+            break
+
+    if changed:
+        write_json_links_index(OUTPUT_DIR, links)
+
+
+
 ### Individual link index
 
 def write_link_index(out_dir, link):
@@ -114,6 +144,7 @@
 
 def write_json_link_index(out_dir, link):
     """write a json file with some info about the link"""
+    check_link_structure(link)
     path = os.path.join(out_dir, 'index.json')
 
     print(' √ index.json')
@@ -128,10 +159,13 @@ def parse_json_link_index(out_dir):
     existing_index = os.path.join(out_dir, 'index.json')
     if os.path.exists(existing_index):
         with open(existing_index, 'r', encoding='utf-8') as f:
-            return json.load(f)
+            link_json = json.load(f)
+            check_link_structure(link_json)
+            return link_json
     return {}
 
 def write_html_link_index(out_dir, link):
+    check_link_structure(link)
     with open(os.path.join(TEMPLATES_DIR, 'link_index_fancy.html'), 'r', encoding='utf-8') as f:
         link_html = f.read()
 
diff --git a/archivebox/links.py b/archivebox/links.py
index d832a012..a83333f3 100644
--- a/archivebox/links.py
+++ b/archivebox/links.py
@@ -32,34 +32,33 @@ Link {
 """
 
 
-import datetime
 from html import unescape
 from collections import OrderedDict
 
 from util import (
-    domain,
-    base_url,
-    str_between,
-    get_link_type,
     merge_links,
     wget_output_path,
+    check_link_structure,
+    check_links_structure,
 )
-from config import ANSI
 
 
 def validate_links(links):
+    check_links_structure(links)
     links = archivable_links(links) # remove chrome://, about:, mailto: etc.
     links = uniquefied_links(links) # merge/dedupe duplicate timestamps & urls
     links = sorted_links(links) # deterministically sort the links based on timstamp, url
-    
+
     if not links:
         print('[X] No links found :(')
         raise SystemExit(1)
 
     for link in links:
+        check_link_structure(link)
+
         link['title'] = unescape(link['title']) if link['title'] else None
         link['latest'] = link.get('latest') or {}
-        
+
         latest = link['latest']
         if not link['latest'].get('wget'):
             link['latest']['wget'] = wget_output_path(link)
@@ -81,14 +80,16 @@ def validate_links(links):
 
     return list(links)
 
+
 def archivable_links(links):
     """remove chrome://, about:// or other schemed links that cant be archived"""
     return (
         link
         for link in links
-        if any(link['url'].startswith(s) for s in ('http://', 'https://', 'ftp://'))
+        if any(link['url'].lower().startswith(s) for s in ('http://', 'https://', 'ftp://'))
     )
 
+
 def uniquefied_links(sorted_links):
     """
     ensures that all non-duplicate links have monotonically increasing timestamps
@@ -114,10 +115,12 @@ def uniquefied_links(sorted_links):
 
     return unique_timestamps.values()
 
+
 def sorted_links(links):
     sort_func = lambda link: (link['timestamp'].split('.', 1)[0], link['url'])
     return sorted(links, key=sort_func, reverse=True)
 
+
 def links_after_timestamp(links, timestamp=None):
     if not timestamp:
         yield from links
@@ -130,6 +133,7 @@ def links_after_timestamp(links, timestamp=None):
         except (ValueError, TypeError):
             print('Resume value and all timestamp values must be valid numbers.')
 
+
 def lowest_uniq_timestamp(used_timestamps, timestamp):
     """resolve duplicate timestamps by appending a decimal 1234, 1234 -> 1234.1, 1234.2"""
 
diff --git a/archivebox/parse.py b/archivebox/parse.py
index 89ac2f94..ee8865f0 100644
--- a/archivebox/parse.py
+++ b/archivebox/parse.py
@@ -20,7 +20,6 @@ Parsed link schema: {
 import re
 import sys
 import json
-import urllib
 from collections import OrderedDict
 import xml.etree.ElementTree as etree
 
@@ -32,7 +31,6 @@ from util import (
     base_url,
     str_between,
     get_link_type,
-    fetch_page_title,
     URL_REGEX,
 )
 
@@ -56,13 +54,11 @@ def parse_links(path):
 
     links = []
     with open(path, 'r', encoding='utf-8') as file:
-        print('{green}[*] [{}] Parsing new links from output/sources/{} and fetching titles...{reset}'.format(
+        print('{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'.format(
             datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
             path.rsplit('/', 1)[-1],
             **ANSI,
         ))
-        if SHOW_PROGRESS:
-            sys.stdout.write(' ')
 
         for parser_name, parser_func in get_parsers(file).items():
             # otherwise try all parsers until one works
@@ -98,7 +94,7 @@ def parse_pocket_html_export(html_file):
                 'base_url': base_url(fixed_url),
                 'timestamp': str(time.timestamp()),
                 'tags': match.group(3),
-                'title': match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '') or fetch_page_title(fixed_url),
+                'title': match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '') or None,
                 'sources': [html_file.name],
             }
             info['type'] = get_link_type(info)
@@ -135,7 +131,7 @@ def parse_pinboard_json_export(json_file):
                 'base_url': base_url(url),
                 'timestamp': timestamp,
                 'tags': erg.get('tags') or '',
-                'title': title or fetch_page_title(url),
+                'title': title or None,
                 'sources': [json_file.name],
             }
             info['type'] = get_link_type(info)
@@ -174,7 +170,7 @@ def parse_rss_export(rss_file):
             'base_url': base_url(url),
             'timestamp': str(time.timestamp()),
             'tags': '',
-            'title': title or fetch_page_title(url),
+            'title': title or None,
             'sources': [rss_file.name],
         }
         info['type'] = get_link_type(info)
@@ -215,7 +211,7 @@ def parse_shaarli_rss_export(rss_file):
             'base_url': base_url(url),
             'timestamp': str(time.timestamp()),
             'tags': '',
-            'title': title or fetch_page_title(url),
+            'title': title or None,
             'sources': [rss_file.name],
         }
         info['type'] = get_link_type(info)
@@ -242,7 +238,7 @@ def parse_netscape_html_export(html_file):
                 'base_url': base_url(url),
                 'timestamp': str(time.timestamp()),
                 'tags': "",
-                'title': match.group(3).strip() or fetch_page_title(url),
+                'title': match.group(3).strip() or None,
                 'sources': [html_file.name],
             }
             info['type'] = get_link_type(info)
@@ -275,7 +271,7 @@ def parse_pinboard_rss_export(rss_file):
             'base_url': base_url(url),
             'timestamp': str(time.timestamp()),
             'tags': tags,
-            'title': title or fetch_page_title(url),
+            'title': title or None,
             'sources': [rss_file.name],
         }
         info['type'] = get_link_type(info)
@@ -300,7 +296,7 @@ def parse_medium_rss_export(rss_file):
             'base_url': base_url(url),
             'timestamp': str(time.timestamp()),
             'tags': '',
-            'title': title or fetch_page_title(url),
+            'title': title or None,
             'sources': [rss_file.name],
         }
         info['type'] = get_link_type(info)
@@ -324,7 +320,7 @@ def parse_plain_text_export(text_file):
                 'base_url': base_url(url),
                 'timestamp': str(datetime.now().timestamp()),
                 'tags': '',
-                'title': fetch_page_title(url),
+                'title': None,
                 'sources': [text_file.name],
             }
             info['type'] = get_link_type(info)
diff --git a/archivebox/util.py b/archivebox/util.py
index 6b40b572..e369d0de 100644
--- a/archivebox/util.py
+++ b/archivebox/util.py
@@ -3,8 +3,7 @@ import re
 import sys
 import time
 import json
-import signal
-from urllib.request import urlopen
+from urllib.request import Request, urlopen
 from urllib.parse import urlparse
 from decimal import Decimal
 
@@ -25,6 +24,7 @@ from config import (
     TIMEOUT,
     SHOW_PROGRESS,
     CHECK_SSL_VALIDITY,
+    WGET_USER_AGENT,
     CURL_BINARY,
     WGET_BINARY,
     CHROME_BINARY,
@@ -219,7 +219,21 @@ def save_source(raw_text):
 
     return source_path
 
-def download_url(url):
+def fetch_page_content(url, timeout=TIMEOUT):
+    req = Request(url, headers={'User-Agent': WGET_USER_AGENT})
+
+    if CHECK_SSL_VALIDITY:
+        resp = urlopen(req, timeout=timeout)
+    else:
+        import ssl
+        insecure = ssl._create_unverified_context()
+        resp = urlopen(req, timeout=timeout, context=insecure)
+
+    encoding = resp.headers.get_content_charset() or 'utf-8'
+    return resp.read().decode(encoding)
+
+
+def download_url(url, timeout=TIMEOUT):
     """download a given url's content into downloads/domain.txt"""
 
     if not os.path.exists(SOURCES_DIR):
@@ -236,7 +250,7 @@
     ))
     end = progress(TIMEOUT, prefix=' ')
     try:
-        downloaded_xml = urlopen(url).read().decode('utf-8')
+        downloaded_xml = fetch_page_content(url, timeout=timeout)
         end()
     except Exception as e:
         end()
@@ -260,19 +274,15 @@ def fetch_page_title(url, timeout=10, progress=SHOW_PROGRESS):
             sys.stdout.write('.')
             sys.stdout.flush()
 
-        if CHECK_SSL_VALIDITY:
-            html_content = urlopen(url, timeout=timeout)
-        else:
-            try:
-                import ssl
-                insecure = ssl._create_unverified_context()
-                html_content = urlopen(url, timeout=timeout, context=insecure)
-            except ImportError:
-                html_content = urlopen(url, timeout=timeout)
+        html = fetch_page_content(url, timeout=timeout)
 
-        match = re.search(HTML_TITLE_REGEX, html_content.read().decode('utf-8'))
+        match = re.search(HTML_TITLE_REGEX, html)
         return match.group(1).strip() if match else None
-    except Exception:
+    except Exception as err:
+        # print('[!] Failed to fetch title because of {}: {}'.format(
+        #     err.__class__.__name__,
+        #     err,
+        # ))
         return None
 
 
@@ -603,3 +613,15 @@ def run(*popenargs, input=None, capture_output=False, timeout=None, check=False,
         raise CalledProcessError(retcode, process.args, output=stdout, stderr=stderr)
 
     return CompletedProcess(process.args, retcode, stdout, stderr)
+
+
+def check_link_structure(link):
+    assert isinstance(link, dict)
+    assert isinstance(link.get('url'), str)
+    assert len(link['url']) > 2
+
+
+def check_links_structure(links):
+    assert isinstance(links, list)
+    if links:
+        check_link_structure(links[0])