From b7cae4f72e01dfd5a9d96beb310fcc87b97fb618 Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Tue, 17 Apr 2018 09:13:38 -0400
Subject: [PATCH] fix wget_output_path urlencoding

---
 archive_methods.py | 11 ++++++-----
 index.py           |  3 ++-
 util.py            | 38 ++++++++++++++++++++++++++++----------
 3 files changed, 36 insertions(+), 16 deletions(-)

diff --git a/archive_methods.py b/archive_methods.py
index 45c56709..d2a567b3 100644
--- a/archive_methods.py
+++ b/archive_methods.py
@@ -7,7 +7,7 @@ from subprocess import run, PIPE, DEVNULL
 
 from peekable import Peekable
 
-from index import html_appended_url, parse_json_link_index, write_link_index
+from index import wget_output_path, parse_json_link_index, write_link_index
 from links import links_after_timestamp
 from config import (
     ARCHIVE_DIR,
@@ -182,8 +182,9 @@ def attach_result_to_link(method):
 def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, timeout=TIMEOUT):
     """download full site using wget"""
 
-    if os.path.exists(os.path.join(link_dir, link['domain'])):
-        return {'output': html_appended_url(link), 'status': 'skipped'}
+    domain_dir = os.path.join(link_dir, link['domain'])
+    if os.path.exists(domain_dir):
+        return {'output': wget_output_path(link, look_in=domain_dir), 'status': 'skipped'}
 
     CMD = [
         # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
@@ -220,7 +221,7 @@ def fetch_pdf(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DI
     """print PDF of site to file using chrome --headless"""
 
     if link['type'] in ('PDF', 'image'):
-        return {'output': html_appended_url(link)}
+        return {'output': wget_output_path(link)}
 
     if os.path.exists(os.path.join(link_dir, 'output.pdf')):
         return {'output': 'output.pdf', 'status': 'skipped'}
@@ -256,7 +257,7 @@ def fetch_screenshot(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_
     """take screenshot of site using chrome --headless"""
 
     if link['type'] in ('PDF', 'image'):
-        return {'output': html_appended_url(link)}
+        return {'output': wget_output_path(link)}
 
     if os.path.exists(os.path.join(link_dir, 'screenshot.png')):
         return {'output': 'screenshot.png', 'status': 'skipped'}
diff --git a/index.py b/index.py
index 196a0f22..b1c3108e 100644
--- a/index.py
+++ b/index.py
@@ -14,10 +14,11 @@ from config import (
     ARCHIVE_DIR,
     ANSI,
     GIT_SHA,
+    FOOTER_INFO,
 )
 
 from util import (
     chmod_file,
-    html_appended_url,
+    wget_output_path,
     derived_link_info,
 )
diff --git a/util.py b/util.py
index 2280db38..63bdcc51 100644
--- a/util.py
+++ b/util.py
@@ -8,10 +8,12 @@ import requests
 from datetime import datetime
 from subprocess import run, PIPE, DEVNULL
 from multiprocessing import Process
+from urllib.parse import quote
 
 from config import (
     IS_TTY,
     ARCHIVE_PERMISSIONS,
+    HTML_FOLDER,
     ARCHIVE_DIR,
     TIMEOUT,
     TERM_WIDTH,
@@ -394,35 +396,51 @@ def cleanup_archive(archive_path, links):
         print('    '+ '\n    '.join(unmatched))
 
 
-def html_appended_url(link):
+def wget_output_path(link, look_in=None):
     """calculate the path to the wgetted .html file, since wget may
     adjust some paths to be different than the base_url path.
 
-    See docs on wget --adjust-extension.
+    See docs on wget --adjust-extension (-E)
     """
 
     if link['type'] in ('PDF', 'image'):
-        return link['base_url']
+        return quote(link['base_url'])
+    # Since the wget algorithm for -E (appending .html) is incredibly complex,
+    # instead of trying to emulate it here, we just look in the output folder
+    # to see what html file wget actually created as the output
+    wget_folder = link['base_url'].rsplit('/', 1)[0]
+    look_in = look_in or os.path.join(HTML_FOLDER, 'archive', link['timestamp'], wget_folder)
+
+    if look_in and os.path.exists(look_in):
+        html_files = [
+            f for f in os.listdir(look_in)
+            if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", f, re.I | re.M)
+        ]
+        if html_files:
+            return quote(os.path.join(wget_folder, html_files[0]))
+
+    # If finding the actual output file didn't work, fall back to the buggy
+    # implementation of the wget .html appending algorithm
 
     split_url = link['url'].split('#', 1)
     query = ('%3F' + link['url'].split('?', 1)[-1]) if '?' in link['url'] else ''
 
     if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", split_url[0], re.I | re.M):
         # already ends in .html
-        return link['base_url']
+        return quote(link['base_url'])
     else:
         # .html needs to be appended
         without_scheme = split_url[0].split('://', 1)[-1].split('?', 1)[0]
         if without_scheme.endswith('/'):
             if query:
-                return '#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]])
-            return '#'.join([without_scheme + 'index.html', *split_url[1:]])
+                return quote('#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]]))
+            return quote('#'.join([without_scheme + 'index.html', *split_url[1:]]))
         else:
             if query:
-                return '#'.join([without_scheme + '/index.html' + query + '.html', *split_url[1:]])
+                return quote('#'.join([without_scheme + '/index.html' + query + '.html', *split_url[1:]]))
             elif '/' in without_scheme:
-                return '#'.join([without_scheme + '.html', *split_url[1:]])
-    return link['base_url'] + '/index.html'
+                return quote('#'.join([without_scheme + '.html', *split_url[1:]]))
+    return quote(link['base_url'] + '/index.html')
 
 
 def derived_link_info(link):
@@ -434,7 +452,7 @@ def derived_link_info(link):
         'google_favicon_url': 'https://www.google.com/s2/favicons?domain={domain}'.format(**link),
         'favicon_url': 'archive/{timestamp}/favicon.ico'.format(**link),
         'files_url': 'archive/{timestamp}/index.html'.format(**link),
-        'archive_url': 'archive/{}/{}'.format(link['timestamp'], html_appended_url(link)),
+        'archive_url': 'archive/{}/{}'.format(link['timestamp'], wget_output_path(link)),
         'pdf_link': 'archive/{timestamp}/output.pdf'.format(**link),
         'screenshot_link': 'archive/{timestamp}/screenshot.png'.format(**link),
         'archive_org_url': 'https://web.archive.org/web/{base_url}'.format(**link),
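
Note (editor's sketch, not part of the patch): the fix has two independent halves. First, every path wget_output_path returns now passes through urllib.parse.quote, so it can be embedded safely in the link index's hrefs. Second, rather than fully emulating wget's --adjust-extension renaming, the function now lists the real output folder and takes whatever .html file wget actually wrote, keeping the old emulation only as a fallback. The snippet below demonstrates both pieces in isolation; the temp folder and file names are hypothetical stand-ins for a real archive/<timestamp>/<domain>/ wget output folder.

# Standalone sketch of the two techniques above (stdlib only; not part
# of the patch, and the file names here are illustrative only).
import os
import re
import tempfile
from urllib.parse import quote

# 1) urlencoding: quote() percent-escapes href-unsafe characters while
#    leaving '/' intact (the default safe character)
print(quote('example.com/some folder/page.html'))
# -> example.com/some%20folder/page.html

# 2) look in the output folder: take whatever .html file wget actually
#    produced instead of emulating its --adjust-extension renaming
look_in = tempfile.mkdtemp()
open(os.path.join(look_in, 'page name.html'), 'w').close()

html_files = [f for f in os.listdir(look_in)
              if re.search(r'\.html?$', f, re.I)]
if html_files:
    print(quote(html_files[0]))
    # -> page%20name.html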