diff --git a/archivebox/parsers/__init__.py b/archivebox/parsers/__init__.py index c033ab28..99d11a1b 100644 --- a/archivebox/parsers/__init__.py +++ b/archivebox/parsers/__init__.py @@ -233,6 +233,10 @@ _test_url_strs = { 'https://example.com/?what=1#how-about-this=1&2%20baf': 1, 'https://example.com?what=1#how-about-this=1&2%20baf': 1, 'http://example7.com': 1, + 'https://': 0, + 'https://[test]': 0, + 'http://"test"': 0, + 'http://\'test\'': 0, '[https://example8.com/what/is/this.php?what=1]': 1, '[and http://example9.com?what=1&other=3#and-thing=2]': 1, 'https://example10.com#and-thing=2 "': 1, diff --git a/archivebox/util.py b/archivebox/util.py index daf3025e..cfa7d931 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -59,7 +59,7 @@ URL_REGEX = re.compile( r'(?=(' r'http[s]?://' # start matching from allowed schemes r'(?:[a-zA-Z]|[0-9]' # followed by allowed alphanum characters - r'|[$-_@.&+]|[!*\(\),]' # or allowed symbols + r'|[-_$@.&+!*\(\),]' # or allowed symbols (keep hyphen first to match literal hyphen) r'|(?:%[0-9a-fA-F][0-9a-fA-F]))' # or allowed unicode bytes r'[^\]\[\(\)<>"\'\s]+' # stop parsing at these symbols r'))',