diff --git a/archivebox/archive_methods.py b/archivebox/archive_methods.py
index 20e35c28..57488e20 100644
--- a/archivebox/archive_methods.py
+++ b/archivebox/archive_methods.py
@@ -7,7 +7,7 @@ from datetime import datetime
 from index import (
     parse_json_link_index,
     write_link_index,
-    patch_index_title_hack,
+    update_main_index,
 )
 from config import (
     CURL_BINARY,
@@ -103,7 +103,9 @@ def archive_link(link_dir, link, overwrite=True):
 
         for archive_method in active_methods:
             archive_method(link_dir, link, overwrite=overwrite)
 
+        write_link_index(link_dir, link)
+        update_main_index(link)
 
     except Exception as err:
         print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
@@ -218,7 +220,7 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC
 
     try:
         result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
         end()
-        output = wget_output_path(link, look_in=domain_dir)
+        output = wget_output_path(link)
 
         output_tail = [' ' + line for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:] if line.strip()]
@@ -391,11 +393,13 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT):
 
     output = 'archive.org.txt'
     archive_org_url = None
+
     path = os.path.join(link_dir, output)
     if os.path.exists(path):
         archive_org_url = open(path, 'r').read().strip()
         return {'output': archive_org_url, 'status': 'skipped'}
+
     submit_url = 'https://web.archive.org/save/{}'.format(link['url'])
 
     CMD = [
         CURL_BINARY,
@@ -412,7 +416,6 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT):
         end()
 
         content_location, errors = parse_archive_dot_org_response(result.stdout)
-
         if content_location:
             archive_org_url = 'https://web.archive.org{}'.format(content_location[0])
         elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
@@ -427,6 +430,7 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT):
         output = e
         print_error_hints(cmd=CMD, pwd=link_dir, err=e)
 
+
     if not isinstance(output, Exception):
         # instead of writing None when archive.org rejects the url write the
         # url to resubmit it to archive.org. This is so when the user visits
@@ -499,7 +503,6 @@ def fetch_title(link_dir, link, timeout=TIMEOUT):
         # TODO: figure out how to do this without gnarly string replacement
         if title:
             link['title'] = title
-            patch_index_title_hack(link['url'], title)
 
         return {
             'cmd': 'fetch_page_title("{}")'.format(link['url']),
diff --git a/archivebox/index.py b/archivebox/index.py
index 8b4e8a4e..dc6cf1bc 100644
--- a/archivebox/index.py
+++ b/archivebox/index.py
@@ -22,8 +22,11 @@ from util import (
     pretty_path,
     check_link_structure,
     check_links_structure,
+    wget_output_path,
 )
 
 
+TITLE_LOADING_MSG = 'Not yet archived...'
+
 
 ### Homepage index for all the links
@@ -96,9 +99,20 @@ def write_html_links_index(out_dir, links, finished=False):
     with open(os.path.join(TEMPLATES_DIR, 'index_row.html'), 'r', encoding='utf-8') as f:
         link_row_html = f.read()
 
+    full_links_info = (derived_link_info(link) for link in links)
+
     link_rows = '\n'.join(
-        Template(link_row_html).substitute(**derived_link_info(link))
-        for link in links
+        Template(link_row_html).substitute(**{
+            **link,
+            'title': (
+                link['title']
+                or (link['base_url'] if link['is_archived'] else TITLE_LOADING_MSG)
+            ),
+            'archive_url': (
+                wget_output_path(link) or 'index.html'
+            ),
+        })
+        for link in full_links_info
     )
 
     template_vars = {
@@ -118,24 +132,41 @@ def write_html_links_index(out_dir, links, finished=False):
 
     chmod_file(path)
 
 
-def patch_index_title_hack(link_url, new_title):
-    """hack to update just one link's title in the link index json"""
+def update_main_index(link):
+    """hack to in-place update one row's info in the generated index html"""
+
+    title = link['latest']['title']
+    successful = len([entry for entry in link['latest'].values() if entry])
+
+    # Patch JSON index
     json_path = os.path.join(OUTPUT_DIR, 'index.json')
 
     links = parse_json_links_index(OUTPUT_DIR)
     changed = False
-    for link in links:
-        if link['url'] == link_url:
-            link['title'] = new_title
+    for json_link in links:
+        if json_link['url'] == link['url']:
+            json_link['title'] = title
+            json_link['latest'] = link['latest']
             changed = True
             break
 
     if changed:
         write_json_links_index(OUTPUT_DIR, links)
 
+    # Patch HTML index
+    html_path = os.path.join(OUTPUT_DIR, 'index.html')
+    html = open(html_path, 'r').read().split('\n')
+    for idx, line in enumerate(html):
+        if title and ('
diff --git a/archivebox/templates/index.html b/archivebox/templates/index.html
--- a/archivebox/templates/index.html
+++ b/archivebox/templates/index.html
@@ -121,12 +143,8 @@
     Bookmarked
-    Files
     Saved Link ($num_links)
-    PNG
-    PDF
-    HTML
-    A.org
+    Saved Files
     Original URL
diff --git a/archivebox/templates/index_row.html b/archivebox/templates/index_row.html
index a6e7da2b..da4e6775 100644
--- a/archivebox/templates/index_row.html
+++ b/archivebox/templates/index_row.html
@@ -1,16 +1,18 @@
     $bookmarked_date
+        $title
+        $tags
-        $title $tags
+        📄
+        $num_outputs
-    🖼
-    📜
-    📄
-    🏛
     $url
diff --git a/archivebox/util.py b/archivebox/util.py
index cd7e9651..b5cb475a 100644
--- a/archivebox/util.py
+++ b/archivebox/util.py
@@ -244,7 +244,7 @@ def fetch_page_title(url, timeout=10, progress=SHOW_PROGRESS):
         #     ))
         return None
 
 
-def wget_output_path(link, look_in=None):
+def wget_output_path(link):
     """calculate the path to the wgetted .html file, since wget may
     adjust some paths to be different than the base_url path.
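
For context on the `write_html_links_index` change above: each row is now rendered by substituting a per-link dict into the row template, with the title falling back to `TITLE_LOADING_MSG` (or the base URL once the link is archived) until `fetch_title` fills it in. Below is a minimal standalone sketch of that fallback, using a made-up stand-in template string and example link values rather than the real `templates/index_row.html`:

```python
# Sketch only: stand-in template and example values, not ArchiveBox's real template.
from string import Template

TITLE_LOADING_MSG = 'Not yet archived...'
row_template = Template('<td>$title</td><td>$num_outputs files</td>')

link = {
    'url': 'https://example.com',
    'base_url': 'example.com',
    'title': None,           # fetch_title hasn't run yet
    'is_archived': False,
    'num_outputs': 0,
}

row = row_template.substitute(**{
    **link,
    'title': (
        link['title']
        or (link['base_url'] if link['is_archived'] else TITLE_LOADING_MSG)
    ),
})
print(row)  # <td>Not yet archived...</td><td>0 files</td>
```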