From c039ef05b3c1d019544db34c3acde445782ed46e Mon Sep 17 00:00:00 2001 From: Ross Williams Date: Tue, 8 Aug 2023 15:09:11 -0400 Subject: [PATCH] Fix hyphen placement in util.URL_REGEX Incorrect hyphen placement in `URL_REGEX` was allowing it to match more characters than intended. In a regex character class, an unescaped literal hyphen must appear as the first (or last) character in the class; anywhere else it is interpreted as the delimiter of a range of characters. The issue fixed here caused the range of characters `[$-_]` to be treated as valid URL characters, instead of the intended set of three characters `[-_$]`. The incorrect range interpretation inadvertently included most ASCII punctuation, most importantly the angle brackets, square brackets, and single quote that the expression uses to mark the end of a match. This causes the expression to match a URL that has a "hostname" portion beginning with one of the intended "stop parsing" characters. For example: ``` https://<www.example.com>/ # MATCHES but should not https://[for example] # MATCHES but should not scheme='https://' # MATCHES, including final quote, but should not ``` Some test cases have been added to the `URL_REGEX` assert in archivebox.parsers to cover this possibility. 
--- archivebox/parsers/__init__.py | 4 ++++ archivebox/util.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/archivebox/parsers/__init__.py b/archivebox/parsers/__init__.py index c033ab28..99d11a1b 100644 --- a/archivebox/parsers/__init__.py +++ b/archivebox/parsers/__init__.py @@ -233,6 +233,10 @@ _test_url_strs = { 'https://example.com/?what=1#how-about-this=1&2%20baf': 1, 'https://example.com?what=1#how-about-this=1&2%20baf': 1, 'http://example7.com': 1, + 'https://': 0, + 'https://[test]': 0, + 'http://"test"': 0, + 'http://\'test\'': 0, '[https://example8.com/what/is/this.php?what=1]': 1, '[and http://example9.com?what=1&other=3#and-thing=2]': 1, 'https://example10.com#and-thing=2 "': 1, diff --git a/archivebox/util.py b/archivebox/util.py index daf3025e..cfa7d931 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -59,7 +59,7 @@ URL_REGEX = re.compile( r'(?=(' r'http[s]?://' # start matching from allowed schemes r'(?:[a-zA-Z]|[0-9]' # followed by allowed alphanum characters - r'|[$-_@.&+]|[!*\(\),]' # or allowed symbols + r'|[-_$@.&+!*\(\),]' # or allowed symbols (keep hyphen first to match literal hyphen) r'|(?:%[0-9a-fA-F][0-9a-fA-F]))' # or allowed unicode bytes r'[^\]\[\(\)<>"\'\s]+' # stop parsing at these symbols r'))',