diff --git a/archivebox/archive_methods.py b/archivebox/archive_methods.py index 20e35c28..57488e20 100644 --- a/archivebox/archive_methods.py +++ b/archivebox/archive_methods.py @@ -7,7 +7,7 @@ from datetime import datetime from index import ( parse_json_link_index, write_link_index, - patch_index_title_hack, + update_main_index, ) from config import ( CURL_BINARY, @@ -103,7 +103,9 @@ def archive_link(link_dir, link, overwrite=True): for archive_method in active_methods: archive_method(link_dir, link, overwrite=overwrite) + write_link_index(link_dir, link) + update_main_index(link) except Exception as err: print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err)) @@ -218,7 +220,7 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC try: result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout) end() - output = wget_output_path(link, look_in=domain_dir) + output = wget_output_path(link) output_tail = [' ' + line for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:] if line.strip()] @@ -391,11 +393,13 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT): output = 'archive.org.txt' archive_org_url = None + path = os.path.join(link_dir, output) if os.path.exists(path): archive_org_url = open(path, 'r').read().strip() return {'output': archive_org_url, 'status': 'skipped'} + submit_url = 'https://web.archive.org/save/{}'.format(link['url']) CMD = [ CURL_BINARY, @@ -412,7 +416,6 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT): end() content_location, errors = parse_archive_dot_org_response(result.stdout) - if content_location: archive_org_url = 'https://web.archive.org{}'.format(content_location[0]) elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]: @@ -427,6 +430,7 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT): output = e print_error_hints(cmd=CMD, pwd=link_dir, err=e) + if not isinstance(output, Exception): # instead of writing None when archive.org rejects the url write the # url to resubmit it to archive.org. This is so when the user visits @@ -499,7 +503,6 @@ def fetch_title(link_dir, link, timeout=TIMEOUT): # TODO: figure out how to do this without gnarly string replacement if title: link['title'] = title - patch_index_title_hack(link['url'], title) return { 'cmd': 'fetch_page_title("{}")'.format(link['url']), diff --git a/archivebox/index.py b/archivebox/index.py index 8b4e8a4e..dc6cf1bc 100644 --- a/archivebox/index.py +++ b/archivebox/index.py @@ -22,8 +22,11 @@ from util import ( pretty_path, check_link_structure, check_links_structure, + wget_output_path, ) +TITLE_LOADING_MSG = 'Not yet archived...' + ### Homepage index for all the links @@ -96,9 +99,20 @@ def write_html_links_index(out_dir, links, finished=False): with open(os.path.join(TEMPLATES_DIR, 'index_row.html'), 'r', encoding='utf-8') as f: link_row_html = f.read() + full_links_info = (derived_link_info(link) for link in links) + link_rows = '\n'.join( - Template(link_row_html).substitute(**derived_link_info(link)) - for link in links + Template(link_row_html).substitute(**{ + **link, + 'title': ( + link['title'] + or (link['base_url'] if link['is_archived'] else TITLE_LOADING_MSG) + ), + 'archive_url': ( + wget_output_path(link) or 'index.html' + ), + }) + for link in full_links_info ) template_vars = { @@ -118,24 +132,41 @@ def write_html_links_index(out_dir, links, finished=False): chmod_file(path) -def patch_index_title_hack(link_url, new_title): - """hack to update just one link's title in the link index json""" +def update_main_index(link): + """hack to in-place update one row's info in the generated index html""" + title = link['latest']['title'] + successful = len([entry for entry in link['latest'].values() if entry]) + + # Patch JSON index json_path = os.path.join(OUTPUT_DIR, 'index.json') links = parse_json_links_index(OUTPUT_DIR) changed = False - for link in links: - if link['url'] == link_url: - link['title'] = new_title + for json_link in links: + if json_link['url'] == link['url']: + json_link['title'] = title + json_link['latest'] = link['latest'] changed = True break if changed: write_json_links_index(OUTPUT_DIR, links) + # Patch HTML index + html_path = os.path.join(OUTPUT_DIR, 'index.html') + html = open(html_path, 'r').read().split('\n') + for idx, line in enumerate(html): + if title and ('{}'.format(title) + elif successful and ('{}'.format(successful) + break + + with open(html_path, 'w') as f: + f.write('\n'.join(html)) ### Individual link index @@ -176,10 +207,19 @@ def write_html_link_index(out_dir, link): print(' √ index.html') + link = derived_link_info(link) + with open(path, 'w', encoding='utf-8') as f: f.write(Template(link_html).substitute({ - **derived_link_info(link), - # **link['latest'], + **link, + 'title': ( + link['title'] + or (link['base_url'] if link['is_archived'] else TITLE_LOADING_MSG) + ), + 'archive_url': ( + wget_output_path(link) + or (link['domain'] if link['is_archived'] else 'about:blank') + ), })) chmod_file(path) diff --git a/archivebox/templates/index.html b/archivebox/templates/index.html index 13777ce3..db662ecc 100644 --- a/archivebox/templates/index.html +++ b/archivebox/templates/index.html @@ -98,6 +98,28 @@ overflow-y: scroll; table-layout: fixed; } + table tr a span[data-archived~=False] { + opacity: 0.2; + } + .files-spinner { + height: 15px; + width: auto; + opacity: 0.5; + vertical-align: -2px; + } + .link-favicon { + padding-right: 8px; + vertical-align: -4px; + } + .in-progress { + display: none; + } + body[data-status~=finished] .files-spinner { + display: none; + } + body[data-status~=running] .in-progress { + display: inline-block; + }
@@ -121,12 +143,8 @@