diff --git a/archivebox/parsers/generic_html.py b/archivebox/parsers/generic_html.py
index 20b844aa..67a3208d 100644
--- a/archivebox/parsers/generic_html.py
+++ b/archivebox/parsers/generic_html.py
@@ -40,9 +40,20 @@ def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None,
         parser.feed(line)
         for url in parser.urls:
             if root_url:
-                # resolve relative urls /home.html -> https://example.com/home.html
-                url = urljoin(root_url, url)
-                # TODO: fix double // getting stripped by urljoin bug https://github.com/python/cpython/issues/96015
+                url_is_absolute = (url.lower().startswith('http://') or url.lower().startswith('https://'))
+                # url = https://abc.com                        => True
+                # url = /page.php?next=https://example.com     => False
+
+                if not url_is_absolute:                        # resolve it by joining it with root_url
+                    relative_path = url
+
+                    url = urljoin(root_url, relative_path)     # https://example.com/somepage.html + /home.html
+                                                               #  => https://example.com/home.html
+
+                    # special case to handle bug around // handling, crucial for urls that contain sub-urls
+                    # e.g. https://web.archive.org/web/https://example.com
+                    if did_urljoin_misbehave(root_url, relative_path, url):
+                        url = fix_urljoin_bug(url)
 
             for archivable_url in find_all_urls(url):
                 yield Link(
@@ -57,3 +68,74 @@ def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None,
 KEY = 'html'
 NAME = 'Generic HTML'
 PARSER = parse_generic_html_export
+
+
+#### WORKAROUND CODE FOR https://github.com/python/cpython/issues/96015 ####
+
+def did_urljoin_misbehave(root_url: str, relative_path: str, final_url: str) -> bool:
+    """
+    Handle the urljoin edge case bug where multiple slashes get collapsed into a single slash:
+    - https://github.com/python/cpython/issues/96015
+    - https://github.com/ArchiveBox/ArchiveBox/issues/1411
+
+    This workaround only fixes the most common case of a sub-URL inside an outer URL, e.g.:
+        https://web.archive.org/web/https://example.com/some/inner/url
+
+    But there are other valid URLs containing // that are not fixed by this workaround, e.g.:
+        https://example.com/drives/C//some/file
+    """
+
+    # if the relative path is actually an absolute url, cut off its own scheme so we check the path component only
+    relative_path = relative_path.lower()
+    if relative_path.startswith('http://') or relative_path.startswith('https://'):
+        relative_path = relative_path.split('://', 1)[-1]
+
+    # TODO: properly fix all double // getting stripped by urljoin, not just ://
+    original_path_had_suburl = '://' in relative_path
+    original_root_had_suburl = '://' in root_url[8:]    # skip the first 8 chars so the root url's own http(s):// scheme doesn't count as a sub-url
+    final_joined_has_suburl = '://' in final_url[8:]    # skip the first 8 chars so the final url's own http(s):// scheme doesn't count as a sub-url
+
+    urljoin_broke_suburls = (
+        (original_root_had_suburl or original_path_had_suburl)
+        and not final_joined_has_suburl
+    )
+
+    return urljoin_broke_suburls
+
+
+def fix_urljoin_bug(url: str, nesting_limit=5):
+    """
+    recursively replace broken sub-urls .../http:/... with .../http://...
+
+    basically equivalent to this for 99.9% of cases:
+        url = url.replace('/http:/',  '/http://')
+        url = url.replace('/https:/', '/https://')
+    except this also handles:
+        other schemes besides http/https     (e.g. https://example.com/link/git+ssh://github.com/example)
+        other preceding separators besides / (e.g. https://example.com/login/?next=https://example.com/home)
+        fixing multiple sub-urls recursively
+    """
+    input_url = url
+    for _ in range(nesting_limit):
+        url = re.sub(
+            r'(?P<root>.+?)'                              # https://web.archive.org/web
+            + r'(?P<separator>[-=/_&+%$#@!*\(\\])'        # /
+            + r'(?P<subscheme>[a-zA-Z0-9+_-]{1,32}?):/'   # http:/
+            + r'(?P<suburl>[^/\\]+)',                     # example.com
+            r"\1\2\3://\4",
+            input_url,
+            flags=re.IGNORECASE | re.UNICODE,
+        )
+        if url == input_url:
+            break           # nothing left to replace, all sub-urls are fixed
+        input_url = url
+
+    return url
+
+
+# sanity checks to make sure the workaround code works as expected and doesn't introduce *more* bugs
+assert did_urljoin_misbehave('https://web.archive.org/web/https://example.com', 'abc.html', 'https://web.archive.org/web/https:/example.com/abc.html') == True
+assert did_urljoin_misbehave('http://example.com', 'https://web.archive.org/web/http://example.com/abc.html', 'https://web.archive.org/web/http:/example.com/abc.html') == True
+assert fix_urljoin_bug('https:/example.com') == 'https:/example.com'    # should not modify the original url's own scheme, only sub-urls
+assert fix_urljoin_bug('https://web.archive.org/web/https:/example.com/abc.html') == 'https://web.archive.org/web/https://example.com/abc.html'
+assert fix_urljoin_bug('http://example.com/link/git+ssh:/github.com/example?next=ftp:/example.com') == 'http://example.com/link/git+ssh://github.com/example?next=ftp://example.com'
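

A minimal usage sketch (not part of the diff above), assuming the two new helpers are importable from archivebox.parsers.generic_html and using example URLs taken from the asserts in the patch. It mirrors the new code path in parse_generic_html_export: a relative href is joined against a root_url that itself contains a sub-URL, the collapsed '//' is detected, then repaired:

    from urllib.parse import urljoin

    from archivebox.parsers.generic_html import did_urljoin_misbehave, fix_urljoin_bug

    # a root_url containing a sub-URL, plus a relative href scraped from the page
    root_url = 'https://web.archive.org/web/https://example.com/'
    relative_path = 'abc.html'

    joined = urljoin(root_url, relative_path)
    # on affected CPython versions the inner '//' is collapsed to a single '/':
    #   'https://web.archive.org/web/https:/example.com/abc.html'

    if did_urljoin_misbehave(root_url, relative_path, joined):
        joined = fix_urljoin_bug(joined)

    print(joined)   # 'https://web.archive.org/web/https://example.com/abc.html'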