From 2265f2aaf08ccea2a2938ba4e568c9c257ce5919 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Wed, 5 Jul 2017 17:15:56 -0500 Subject: [PATCH] properly handle querystrings for wget .html appended links --- parse.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/parse.py b/parse.py index 8f807346..7604418a 100644 --- a/parse.py +++ b/parse.py @@ -175,16 +175,22 @@ def html_appended_url(link): See docs on wget --adjust-extension.""" split_url = link['url'].split('#', 1) + query = ('%3F' + link['url'].split('?', 1)[-1]) if '?' in link['url'] else '' if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", split_url[0], re.I | re.M): # already ends in .html return link['base_url'] else: # .html needs to be appended - without_scheme = split_url[0].split('://', 1)[-1] + without_scheme = split_url[0].split('://', 1)[-1].split('?', 1)[0] if without_scheme.endswith('/'): + if query: + return '#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]]) return '#'.join([without_scheme + 'index.html', *split_url[1:]]) - return '#'.join([without_scheme + '.html', *split_url[1:]]) + else: + if query: + return '#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]]) + return '#'.join([without_scheme + '.html', *split_url[1:]]) def derived_link_info(link):