
fix pdf and screenshot links

Nick Sweeting 2017-07-04 04:28:26 -05:00
parent 7b72156afd
commit f33330ebbf


@@ -168,7 +168,7 @@ def fetch_wget(out_dir, link, overwrite=False):
     if not os.path.exists('{}/{}'.format(out_dir, domain)) or overwrite:
         print(' - Downloading Full Site')
         CMD = [
-            *'wget --timestamping --adjust-extension --no-parent'.split(' '),
+            *'wget --timestamping --adjust-extension --no-parent'.split(' '),  # Docs: https://www.gnu.org/software/wget/manual/wget.html
             *(('--page-requisites', '--convert-links') if FETCH_WGET_REQUISITES else ()),
             link['url'],
         ]
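
For context, the command assembled here ends up looking roughly like the sketch below when FETCH_WGET_REQUISITES is enabled. The flag names are wget's own; the link dict is only an illustrative stand-in for a real link entry.

# Rough sketch of the command fetch_wget() builds (illustrative only)
FETCH_WGET_REQUISITES = True  # config flag referenced in the diff above
link = {'url': 'https://example.com/some/page'}  # hypothetical link entry

CMD = [
    # --timestamping: skip re-downloading unless the remote file is newer
    # --adjust-extension: append .html to saved HTML pages that lack it
    # --no-parent: never ascend above the starting directory
    *'wget --timestamping --adjust-extension --no-parent'.split(' '),
    # --page-requisites: also fetch images/CSS needed to render the page
    # --convert-links: rewrite links in saved pages to point at local copies
    *(('--page-requisites', '--convert-links') if FETCH_WGET_REQUISITES else ()),
    link['url'],
]
# CMD -> ['wget', '--timestamping', '--adjust-extension', '--no-parent',
#         '--page-requisites', '--convert-links', 'https://example.com/some/page']
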
@@ -327,8 +327,9 @@ def valid_links(links):
     return (link for link in links if link['url'].startswith('http') or link['url'].startswith('ftp'))
 def calculate_archive_url(link):
-    """calculate the path to the wgetted html file, since wget may adjust some paths
-    to be different than the base_url path
+    """calculate the path to the wgetted html file, since wget may
+    adjust some paths to be different than the base_url path.
     See docs on wget --adjust-extension."""
     split_url = link['url'].split('#', 1)
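
The docstring refers to wget's --adjust-extension behavior: when a response is HTML but its URL path does not already end in .html, wget saves it with an extra .html suffix, so the on-disk path can differ from base_url. The helper below is a minimal, simplified illustration of that mapping, not the project's actual calculate_archive_url() logic.

# Simplified approximation of how --adjust-extension changes saved paths
# (hypothetical helper, for illustration only)
def adjusted_wget_path(url):
    without_scheme = url.split('://', 1)[-1].split('#', 1)[0]
    if without_scheme.endswith('/') or '/' not in without_scheme:
        # directory-style URLs are saved as an index.html inside the folder
        return without_scheme.rstrip('/') + '/index.html'
    if not without_scheme.endswith(('.html', '.htm')):
        # HTML responses without an .html suffix get one appended
        return without_scheme + '.html'
    return without_scheme

adjusted_wget_path('https://example.com/blog/post')   # 'example.com/blog/post.html'
adjusted_wget_path('https://example.com/docs/')       # 'example.com/docs/index.html'
adjusted_wget_path('https://example.com/page.html')   # 'example.com/page.html'
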
@@ -370,21 +371,18 @@ def dump_index(links, service):
         # since we dont screenshot or PDF links that are images or PDFs, change those links to point to the wget'ed file
         link_info = {**link}
-        # append .html to archive links that dont have it, since wget appends .html to everything
-        link_info['archive_url'] = calculate_archive_url(link)
-        # add link type to title
-        if link['type']:
-            link_info.update({'title': '{title} ({type})'.format(**link)})
-        # PDF and images link to wgetted version, since we dont re-screenshot/pdf them
+        # PDF and images are handled slightly differently
+        # wget, screenshot, & pdf urls all point to the same file
         if link['type'] in ('PDF', 'image'):
             link_info.update({
+                'archive_url': 'archive/{timestamp}/{base_url}'.format(**link),
                 'pdf_link': 'archive/{timestamp}/{base_url}'.format(**link),
                 'screenshot_link': 'archive/{timestamp}/{base_url}'.format(**link),
+                'title': '{title} ({type})'.format(**link),
             })
         else:
             link_info.update({
+                'archive_url': calculate_archive_url(link),
                 'pdf_link': 'archive/{timestamp}/output.pdf'.format(**link),
                 'screenshot_link': 'archive/{timestamp}/screenshot.png'.format(**link)
             })
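
With this change, a PDF or image link reuses the wget-saved original file for all three index links, while every other link gets its archive_url from calculate_archive_url() plus the separately generated PDF and screenshot outputs. The sample entries below are hypothetical values, shown only to make the two branches concrete.

# Hypothetical index entries showing what each branch produces
pdf_entry = {
    'type': 'PDF',
    'timestamp': '1499152106',        # sample value
    'base_url': 'example.com/paper.pdf',
    'title': 'Some Paper',
}
# PDF/image branch: all three links point at the wget-saved file
#   archive_url     -> 'archive/1499152106/example.com/paper.pdf'
#   pdf_link        -> 'archive/1499152106/example.com/paper.pdf'
#   screenshot_link -> 'archive/1499152106/example.com/paper.pdf'
#   title           -> 'Some Paper (PDF)'

html_entry = {
    'type': None,
    'timestamp': '1499152107',        # sample value
    'url': 'https://example.com/article',
    'base_url': 'example.com/article',
    'title': 'Some Article',
}
# default branch: archive_url comes from calculate_archive_url(),
# while the PDF and screenshot links are the generated outputs
#   archive_url     -> e.g. 'archive/1499152107/example.com/article.html'
#   pdf_link        -> 'archive/1499152107/output.pdf'
#   screenshot_link -> 'archive/1499152107/screenshot.png'
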