
fix wget_output_path urlencoding

Nick Sweeting 2018-04-17 09:13:38 -04:00
parent 64e6eb5f7b
commit b7cae4f72e
3 changed files with 36 additions and 16 deletions


@@ -7,7 +7,7 @@ from subprocess import run, PIPE, DEVNULL
 from peekable import Peekable
-from index import html_appended_url, parse_json_link_index, write_link_index
+from index import wget_output_path, parse_json_link_index, write_link_index
 from links import links_after_timestamp
 from config import (
     ARCHIVE_DIR,
@@ -182,8 +182,9 @@ def attach_result_to_link(method):
 def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, timeout=TIMEOUT):
     """download full site using wget"""
-    if os.path.exists(os.path.join(link_dir, link['domain'])):
-        return {'output': html_appended_url(link), 'status': 'skipped'}
+    domain_dir = os.path.join(link_dir, link['domain'])
+    if os.path.exists(domain_dir):
+        return {'output': wget_output_path(link, look_in=domain_dir), 'status': 'skipped'}

     CMD = [
         # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
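The skip branch above now defers to wget_output_path with look_in pointed at the existing domain folder, so the recorded output is whatever .html file wget actually wrote rather than a guessed name. A minimal standalone sketch of that lookup, using a hypothetical link dict and a temporary directory standing in for a prior wget run:

    import os
    import re
    import tempfile

    # Hypothetical link entry; the real dicts come from the parsed bookmark index.
    link = {'domain': 'example.com', 'base_url': 'example.com/page'}

    # Simulate a previous wget run that saved the page as page.html.
    link_dir = tempfile.mkdtemp()
    domain_dir = os.path.join(link_dir, link['domain'])
    os.makedirs(domain_dir)
    open(os.path.join(domain_dir, 'page.html'), 'w').close()

    # Same idea as wget_output_path(link, look_in=domain_dir): list the folder
    # wget wrote to and pick the .html file it actually produced.
    if os.path.exists(domain_dir):
        html_files = [f for f in os.listdir(domain_dir)
                      if re.search(r'\.[Hh][Tt][Mm][Ll]?$', f)]
        print(html_files)  # ['page.html']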
@@ -220,7 +221,7 @@ def fetch_pdf(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR
     """print PDF of site to file using chrome --headless"""
     if link['type'] in ('PDF', 'image'):
-        return {'output': html_appended_url(link)}
+        return {'output': wget_output_path(link)}

     if os.path.exists(os.path.join(link_dir, 'output.pdf')):
         return {'output': 'output.pdf', 'status': 'skipped'}
@@ -256,7 +257,7 @@ def fetch_screenshot(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR
     """take screenshot of site using chrome --headless"""
     if link['type'] in ('PDF', 'image'):
-        return {'output': html_appended_url(link)}
+        return {'output': wget_output_path(link)}

     if os.path.exists(os.path.join(link_dir, 'screenshot.png')):
         return {'output': 'screenshot.png', 'status': 'skipped'}


@@ -14,10 +14,11 @@ from config import (
     ARCHIVE_DIR,
     ANSI,
     GIT_SHA,
+    FOOTER_INFO,
 )
 from util import (
     chmod_file,
-    html_appended_url,
+    wget_output_path,
     derived_link_info,
 )

util.py

@@ -8,10 +8,12 @@ import requests
 from datetime import datetime
 from subprocess import run, PIPE, DEVNULL
 from multiprocessing import Process
+from urllib.parse import quote

 from config import (
     IS_TTY,
     ARCHIVE_PERMISSIONS,
+    HTML_FOLDER,
     ARCHIVE_DIR,
     TIMEOUT,
     TERM_WIDTH,
@@ -394,35 +396,51 @@ def cleanup_archive(archive_path, links):
         print(' '+ '\n '.join(unmatched))


-def html_appended_url(link):
+def wget_output_path(link, look_in=None):
     """calculate the path to the wgetted .html file, since wget may
     adjust some paths to be different than the base_url path.

-    See docs on wget --adjust-extension.
+    See docs on wget --adjust-extension (-E)
     """

     if link['type'] in ('PDF', 'image'):
-        return link['base_url']
+        return quote(link['base_url'])
+
+    # Since the wget algorithm for -E (appending .html) is incredibly complex,
+    # instead of trying to emulate it here, we just look in the output folder
+    # to see what html file wget actually created as the output
+    wget_folder = link['base_url'].rsplit('/', 1)[0]
+    look_in = look_in or os.path.join(HTML_FOLDER, 'archive', link['timestamp'], wget_folder)
+
+    if look_in and os.path.exists(look_in):
+        html_files = [
+            f for f in os.listdir(look_in)
+            if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", f, re.I | re.M)
+        ]
+        if html_files:
+            return quote(os.path.join(wget_folder, html_files[0]))
+
+    # If finding the actual output file didn't work, fall back to the buggy
+    # implementation of the wget .html appending algorithm
     split_url = link['url'].split('#', 1)
     query = ('%3F' + link['url'].split('?', 1)[-1]) if '?' in link['url'] else ''

     if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", split_url[0], re.I | re.M):
         # already ends in .html
-        return link['base_url']
+        return quote(link['base_url'])
     else:
         # .html needs to be appended
         without_scheme = split_url[0].split('://', 1)[-1].split('?', 1)[0]
         if without_scheme.endswith('/'):
             if query:
-                return '#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]])
-            return '#'.join([without_scheme + 'index.html', *split_url[1:]])
+                return quote('#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]]))
+            return quote('#'.join([without_scheme + 'index.html', *split_url[1:]]))
         else:
             if query:
-                return '#'.join([without_scheme + '/index.html' + query + '.html', *split_url[1:]])
+                return quote('#'.join([without_scheme + '/index.html' + query + '.html', *split_url[1:]]))
             elif '/' in without_scheme:
-                return '#'.join([without_scheme + '.html', *split_url[1:]])
-    return link['base_url'] + '/index.html'
+                return quote('#'.join([without_scheme + '.html', *split_url[1:]]))
+    return quote(link['base_url'] + '/index.html')


 def derived_link_info(link):
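The other half of the fix is the quote() wrapping: every path returned by wget_output_path is now percent-encoded before it is embedded in the index HTML, so characters like '?', '=', and spaces in wget's on-disk filenames no longer break the generated links. A rough illustration of urllib.parse.quote on hypothetical output paths (the filenames here are made up for the example):

    from urllib.parse import quote

    # Hypothetical filenames of the kind wget can leave on disk.
    print(quote('example.com/page?id=1.html'))
    # -> example.com/page%3Fid%3D1.html
    print(quote('example.com/some folder/index.html'))
    # -> example.com/some%20folder/index.html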
@@ -434,7 +452,7 @@ def derived_link_info(link):
         'google_favicon_url': 'https://www.google.com/s2/favicons?domain={domain}'.format(**link),
         'favicon_url': 'archive/{timestamp}/favicon.ico'.format(**link),
         'files_url': 'archive/{timestamp}/index.html'.format(**link),
-        'archive_url': 'archive/{}/{}'.format(link['timestamp'], html_appended_url(link)),
+        'archive_url': 'archive/{}/{}'.format(link['timestamp'], wget_output_path(link)),
         'pdf_link': 'archive/{timestamp}/output.pdf'.format(**link),
         'screenshot_link': 'archive/{timestamp}/screenshot.png'.format(**link),
         'archive_org_url': 'https://web.archive.org/web/{base_url}'.format(**link),
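Downstream, derived_link_info simply interpolates the already-encoded path into the archive link, so no further escaping happens at the template level. A small sketch with hypothetical values (the timestamp and output path are made up for illustration):

    # Hypothetical values; in the real code these come from the link dict
    # and from wget_output_path(link).
    timestamp = '1523958818'
    output_path = 'example.com/page%3Fid%3D1.html'

    archive_url = 'archive/{}/{}'.format(timestamp, output_path)
    print(archive_url)  # archive/1523958818/example.com/page%3Fid%3D1.html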