properly handle querystrings for wget .html appended links

2024-06-26 10:00:19 +12:00 · 2017-07-05 17:15:56 -05:00 · 2017-07-05 17:15:56 -05:00 · 2265f2aaf0
parent 6bb91fbb45
commit 2265f2aaf0
1 changed files with 8 additions and 2 deletions
--- a/parse.py
+++ b/parse.py
@@ -175,16 +175,22 @@ def html_appended_url(link):
    See docs on wget --adjust-extension."""

    split_url = link['url'].split('#', 1)
+    query = ('%3F' + link['url'].split('?', 1)[-1]) if '?' in link['url'] else ''

    if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", split_url[0], re.I | re.M):
        # already ends in .html
        return link['base_url']
    else:
        # .html needs to be appended
-        without_scheme = split_url[0].split('://', 1)[-1]
+        without_scheme = split_url[0].split('://', 1)[-1].split('?', 1)[0]
        if without_scheme.endswith('/'):
+            if query:
+                return '#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]])
            return '#'.join([without_scheme + 'index.html', *split_url[1:]])
-        return '#'.join([without_scheme + '.html', *split_url[1:]])
+        else:
+            if query:
+                return '#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]])
+            return '#'.join([without_scheme + '.html', *split_url[1:]])


 def derived_link_info(link):