
better UX before titles have been fetched during archiving progress

Author: Nick Sweeting
Date:   2019-03-19 18:09:46 -04:00
Parent: 914750c453
Commit: eb5cc8078a

5 changed files with 90 additions and 27 deletions


@@ -7,7 +7,7 @@ from datetime import datetime
 from index import (
     parse_json_link_index,
     write_link_index,
-    patch_index_title_hack,
+    update_main_index,
 )
 from config import (
     CURL_BINARY,
@@ -103,7 +103,9 @@ def archive_link(link_dir, link, overwrite=True):
         for archive_method in active_methods:
             archive_method(link_dir, link, overwrite=overwrite)
         write_link_index(link_dir, link)
+        update_main_index(link)
     except Exception as err:
         print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
@@ -218,7 +220,7 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC
     try:
         result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
         end()
-        output = wget_output_path(link, look_in=domain_dir)
+        output = wget_output_path(link)
         output_tail = [' ' + line for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:] if line.strip()]
@@ -391,11 +393,13 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT):
     output = 'archive.org.txt'
     archive_org_url = None
     path = os.path.join(link_dir, output)
     if os.path.exists(path):
         archive_org_url = open(path, 'r').read().strip()
         return {'output': archive_org_url, 'status': 'skipped'}
     submit_url = 'https://web.archive.org/save/{}'.format(link['url'])
     CMD = [
         CURL_BINARY,
@@ -412,7 +416,6 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT):
         end()
         content_location, errors = parse_archive_dot_org_response(result.stdout)
         if content_location:
             archive_org_url = 'https://web.archive.org{}'.format(content_location[0])
         elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
@@ -427,6 +430,7 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT):
         output = e
         print_error_hints(cmd=CMD, pwd=link_dir, err=e)
     if not isinstance(output, Exception):
         # instead of writing None when archive.org rejects the url write the
         # url to resubmit it to archive.org. This is so when the user visits
@@ -499,7 +503,6 @@ def fetch_title(link_dir, link, timeout=TIMEOUT):
     # TODO: figure out how to do this without gnarly string replacement
     if title:
         link['title'] = title
-        patch_index_title_hack(link['url'], title)
     return {
         'cmd': 'fetch_page_title("{}")'.format(link['url']),
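
The net effect of this file's changes: after each link's own index is written, update_main_index(link) refreshes the top-level index too, instead of fetch_title patching a single title in afterwards via patch_index_title_hack. Below is a runnable toy sketch of that flow; every helper body here is a stand-in for illustration, not the project's real implementation:

# Heavily simplified stand-ins for the real archiving helpers.
def fetch_title(link_dir, link):
    link['title'] = link.get('title') or 'Example Domain'   # pretend we scraped the <title>

def write_link_index(link_dir, link):
    print('  wrote {}/index.html'.format(link_dir))          # per-link index

def update_main_index(link):
    # previously only the title was patched in afterwards; now the whole
    # row (title + saved-files count) is refreshed after every link
    print('  refreshed main index row for {}'.format(link['url']))

def archive_link(link_dir, link):
    try:
        for archive_method in (fetch_title,):                 # real code also runs wget, screenshots, etc.
            archive_method(link_dir, link)
        write_link_index(link_dir, link)
        update_main_index(link)                               # new in this commit
    except Exception as err:
        print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))

archive_link('archive/1553033386', {'url': 'https://example.com', 'title': None})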


@@ -22,8 +22,11 @@ from util import (
     pretty_path,
     check_link_structure,
     check_links_structure,
+    wget_output_path,
 )
+TITLE_LOADING_MSG = 'Not yet archived...'
 ### Homepage index for all the links
@@ -96,9 +99,20 @@ def write_html_links_index(out_dir, links, finished=False):
     with open(os.path.join(TEMPLATES_DIR, 'index_row.html'), 'r', encoding='utf-8') as f:
         link_row_html = f.read()
+    full_links_info = (derived_link_info(link) for link in links)
     link_rows = '\n'.join(
-        Template(link_row_html).substitute(**derived_link_info(link))
-        for link in links
+        Template(link_row_html).substitute(**{
+            **link,
+            'title': (
+                link['title']
+                or (link['base_url'] if link['is_archived'] else TITLE_LOADING_MSG)
+            ),
+            'archive_url': (
+                wget_output_path(link) or 'index.html'
+            ),
+        })
+        for link in full_links_info
     )
     template_vars = {
@@ -118,24 +132,41 @@ def write_html_links_index(out_dir, links, finished=False):
     chmod_file(path)
-def patch_index_title_hack(link_url, new_title):
-    """hack to update just one link's title in the link index json"""
+def update_main_index(link):
+    """hack to in-place update one row's info in the generated index html"""
+    title = link['latest']['title']
+    successful = len([entry for entry in link['latest'].values() if entry])
+    # Patch JSON index
     json_path = os.path.join(OUTPUT_DIR, 'index.json')
     links = parse_json_links_index(OUTPUT_DIR)
     changed = False
-    for link in links:
-        if link['url'] == link_url:
-            link['title'] = new_title
+    for json_link in links:
+        if json_link['url'] == link['url']:
+            json_link['title'] = title
+            json_link['latest'] = link['latest']
             changed = True
             break
     if changed:
         write_json_links_index(OUTPUT_DIR, links)
+    # Patch HTML index
+    html_path = os.path.join(OUTPUT_DIR, 'index.html')
+    html = open(html_path, 'r').read().split('\n')
+    for idx, line in enumerate(html):
+        if title and ('<span data-title-for="{}"'.format(link['url']) in line):
+            html[idx] = '<span>{}</span>'.format(title)
+        elif successful and ('<span data-number-for="{}"'.format(link['url']) in line):
+            html[idx] = '<span>{}</span>'.format(successful)
+            break
+    with open(html_path, 'w') as f:
+        f.write('\n'.join(html))
 ### Individual link index
@@ -176,10 +207,19 @@ def write_html_link_index(out_dir, link):
     print(' √ index.html')
+    link = derived_link_info(link)
     with open(path, 'w', encoding='utf-8') as f:
         f.write(Template(link_html).substitute({
-            **derived_link_info(link),
-            # **link['latest'],
+            **link,
+            'title': (
+                link['title']
+                or (link['base_url'] if link['is_archived'] else TITLE_LOADING_MSG)
+            ),
+            'archive_url': (
+                wget_output_path(link)
+                or (link['domain'] if link['is_archived'] else 'about:blank')
+            ),
         }))
     chmod_file(path)
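
The recurring pattern in this file is the title fallback fed into string.Template: show the fetched title if there is one, the bare URL if the page is archived but still untitled, and the new TITLE_LOADING_MSG placeholder otherwise. A self-contained sketch of just that substitution follows; the row template string and link dicts below are made up for illustration, not the real templates/index_row.html contents:

from string import Template

TITLE_LOADING_MSG = 'Not yet archived...'

# Illustrative stand-in for the index row template.
row_template = Template('<span data-title-for="$url" data-archived="$is_archived">$title</span>')

links = [
    {'url': 'https://example.com', 'base_url': 'example.com', 'is_archived': True,  'title': 'Example Domain'},
    {'url': 'https://example.org', 'base_url': 'example.org', 'is_archived': True,  'title': None},
    {'url': 'https://example.net', 'base_url': 'example.net', 'is_archived': False, 'title': None},
]

for link in links:
    print(row_template.substitute(**{
        **link,
        # same fallback as the commit: real title > bare URL (if archived) > loading message
        'title': link['title'] or (link['base_url'] if link['is_archived'] else TITLE_LOADING_MSG),
    }))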


@@ -98,6 +98,28 @@
         overflow-y: scroll;
         table-layout: fixed;
     }
+    table tr a span[data-archived~=False] {
+        opacity: 0.2;
+    }
+    .files-spinner {
+        height: 15px;
+        width: auto;
+        opacity: 0.5;
+        vertical-align: -2px;
+    }
+    .link-favicon {
+        padding-right: 8px;
+        vertical-align: -4px;
+    }
+    .in-progress {
+        display: none;
+    }
+    body[data-status~=finished] .files-spinner {
+        display: none;
+    }
+    body[data-status~=running] .in-progress {
+        display: inline-block;
+    }
     </style>
 </head>
 <body data-status="$status">
@@ -121,12 +143,8 @@
     <thead>
         <tr>
             <th style="width: 80px;">Bookmarked</th>
-            <th style="width: 26px;">Files</th>
             <th style="width: 26vw;">Saved Link ($num_links)</th>
-            <th style="width: 30px;">PNG</th>
-            <th style="width: 30px">PDF</th>
-            <th style="width: 30px">HTML</th>
-            <th style="width: 30px">A.org</th>
+            <th style="width: 50px">Saved Files</th>
             <th style="width: 16vw;whitespace:nowrap;overflow-x:hidden;">Original URL</th>
         </tr>
     </thead>


@@ -1,16 +1,18 @@
-<tr>
+<tr data-url="$url">
     <td title="Bookmarked timestamp: $timestamp">$bookmarked_date</td>
-    <td>
+    <td style="text-align: left">
         <a href="$link_dir/$index_url" title="Link Index">
             <img src="$link_dir/$favicon_url" onerror="this.src='static/spinner.gif'" class="link-favicon">
         </a>
+        <a href="$link_dir/$archive_url" style="font-size:1.4em;text-decoration:none;color:black;" title="$title">
+            <span data-title-for="$url" data-archived="$is_archived">$title</span>
+            <small style="background-color: #eee;border-radius:4px; float:right">$tags</small>
+        </a>
     </td>
-    <td style="text-align: left"><a href="$link_dir/$archive_url" style="font-size:1.4em;text-decoration:none;color:black;" title="$title">
-        $title <small style="background-color: #eee;border-radius:4px; float:right">$tags</small>
+    <td>
+        <a href="$link_dir/$index_url">📄
+            <span data-number-for="$url" title="Fetching any missing files...">$num_outputs <img src="static/spinner.gif" class="files-spinner"/></span>
+        </a>
     </td>
-    <td><a href="$link_dir/$screenshot_url" title="Screenshot">🖼</a></td>
-    <td><a href="$link_dir/$pdf_url" title="PDF">📜</a></td>
-    <td><a href="$link_dir/$dom_url" title="DOM">📄</a></td>
-    <td><a href="$archive_org_url" title="Archive.org">🏛</a></td>
     <td style="text-align: left"><!--🔗 <img src="$google_favicon_url" height="16px">--> <a href="$url">$url</a></td>
 </tr>


@@ -244,7 +244,7 @@ def fetch_page_title(url, timeout=10, progress=SHOW_PROGRESS):
         # ))
         return None
-def wget_output_path(link, look_in=None):
+def wget_output_path(link):
     """calculate the path to the wgetted .html file, since wget may
     adjust some paths to be different than the base_url path.
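
For context, wget_output_path() (which loses its look_in parameter here) maps a link's URL to the file wget most likely wrote, since wget can save pages under paths that differ from the base URL. The toy version below only covers the trailing-slash case and is an assumption for illustration, not the real helper in util.py (which also handles --adjust-extension renames, query strings, and checking what actually exists on disk):

from urllib.parse import urlparse

def wget_output_path_sketch(link):
    """Rough approximation only: guess where wget saved the main HTML file."""
    url = urlparse(link['url'])
    path = url.path or '/'
    if path.endswith('/'):
        path += 'index.html'     # wget saves directory-style URLs as .../index.html
    return url.netloc + path

print(wget_output_path_sketch({'url': 'https://example.com/blog/'}))      # example.com/blog/index.html
print(wget_output_path_sketch({'url': 'https://example.com/page.html'}))  # example.com/page.html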