diff --git a/archivebox/parsers/__init__.py b/archivebox/parsers/__init__.py
index 0cd39d8a..694ecc79 100644
--- a/archivebox/parsers/__init__.py
+++ b/archivebox/parsers/__init__.py
@@ -28,7 +28,7 @@ from ..util import (
     htmldecode,
     download_url,
     enforce_types,
-    URL_REGEX,
+    find_all_urls,
 )
 from ..index.schema import Link
 from ..logging_util import TimedProgress, log_source_saved
@@ -202,54 +202,3 @@ def save_file_as_source(path: str, timeout: int=TIMEOUT, filename: str='{ts}-{ba
     log_source_saved(source_file=source_path)

     return source_path
-
-
-# Check that plain text regex URL parsing works as expected
-# this is last-line-of-defense to make sure the URL_REGEX isn't
-# misbehaving due to some OS-level or environment level quirks (e.g. bad regex lib)
-# the consequences of bad URL parsing could be disastrous and lead to many
-# incorrect/badly parsed links being added to the archive, so this is worth the cost of checking
-_test_url_strs = {
-    'example.com': 0,
-    '/example.com': 0,
-    '//example.com': 0,
-    ':/example.com': 0,
-    '://example.com': 0,
-    'htt://example8.com': 0,
-    '/htt://example.com': 0,
-    'https://example': 1,
-    'https://localhost/2345': 1,
-    'https://localhost:1234/123': 1,
-    '://': 0,
-    'https://': 0,
-    'http://': 0,
-    'ftp://': 0,
-    'ftp://example.com': 0,
-    'https://example.com': 1,
-    'https://example.com/': 1,
-    'https://a.example.com': 1,
-    'https://a.example.com/': 1,
-    'https://a.example.com/what/is/happening.html': 1,
-    'https://a.example.com/what/ís/happening.html': 1,
-    'https://a.example.com/what/is/happening.html?what=1&2%20b#höw-about-this=1a': 1,
-    'https://a.example.com/what/is/happéning/?what=1&2%20b#how-aboüt-this=1a': 1,
-    'HTtpS://a.example.com/what/is/happening/?what=1&2%20b#how-about-this=1af&2f%20b': 1,
-    'https://example.com/?what=1#how-about-this=1&2%20baf': 1,
-    'https://example.com?what=1#how-about-this=1&2%20baf': 1,
-    'http://example7.com': 1,
-    'https://': 0,
-    'https://[test]': 0,
-    'http://"test"': 0,
-    'http://\'test\'': 0,
-    '[https://example8.com/what/is/this.php?what=1]': 1,
-    '[and http://example9.com?what=1&other=3#and-thing=2]': 1,
-    'https://example10.com#and-thing=2 "': 1,
-    'abcdef': 1,
-    'sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi': 1,
-    'http://examplehttp://15.badc': 2,
-    'https://a.example.com/one.html?url=http://example.com/inside/of/another?=http://': 2,
-    '[https://a.example.com/one.html?url=http://example.com/inside/of/another?=](http://a.example.com)': 3,
-}
-for url_str, num_urls in _test_url_strs.items():
-    assert len(re.findall(URL_REGEX, url_str)) == num_urls, (
-        f'{url_str} does not contain {num_urls} urls')
diff --git a/archivebox/parsers/generic_html.py b/archivebox/parsers/generic_html.py
index 95adb018..20b844aa 100644
--- a/archivebox/parsers/generic_html.py
+++ b/archivebox/parsers/generic_html.py
@@ -10,7 +10,7 @@ from ..index.schema import Link
 from ..util import (
     htmldecode,
     enforce_types,
-    URL_REGEX,
+    find_all_urls,
 )
 from html.parser import HTMLParser
 from urllib.parse import urljoin
@@ -42,8 +42,9 @@ def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None,
             if root_url:
                 # resolve relative urls /home.html -> https://example.com/home.html
                 url = urljoin(root_url, url)
-
-            for archivable_url in re.findall(URL_REGEX, url):
+            # TODO: fix double // getting stripped by urljoin bug https://github.com/python/cpython/issues/96015
+
+            for archivable_url in find_all_urls(url):
                 yield Link(
                     url=htmldecode(archivable_url),
                     timestamp=str(datetime.now(timezone.utc).timestamp()),
diff --git a/archivebox/parsers/generic_txt.py b/archivebox/parsers/generic_txt.py
index 80d97cf5..561514e0 100644
--- a/archivebox/parsers/generic_txt.py
+++ b/archivebox/parsers/generic_txt.py
@@ -11,7 +11,7 @@ from ..index.schema import Link
 from ..util import (
     htmldecode,
     enforce_types,
-    URL_REGEX
+    find_all_urls,
 )


@@ -39,7 +39,7 @@ def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]:
                 pass

         # otherwise look for anything that looks like a URL in the line
-        for url in re.findall(URL_REGEX, line):
+        for url in find_all_urls(line):
             yield Link(
                 url=htmldecode(url),
                 timestamp=str(datetime.now(timezone.utc).timestamp()),
@@ -48,17 +48,6 @@ def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]:
                 sources=[text_file.name],
             )

-        # look inside the URL for any sub-urls, e.g. for archive.org links
-        # https://web.archive.org/web/20200531203453/https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/
-        # -> https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/
-        for sub_url in re.findall(URL_REGEX, line[1:]):
-            yield Link(
-                url=htmldecode(sub_url),
-                timestamp=str(datetime.now(timezone.utc).timestamp()),
-                title=None,
-                tags=None,
-                sources=[text_file.name],
-            )

 KEY = 'txt'
 NAME = 'Generic TXT'
diff --git a/archivebox/util.py b/archivebox/util.py
index dca211ab..e19510f8 100644
--- a/archivebox/util.py
+++ b/archivebox/util.py
@@ -439,9 +439,12 @@ class ExtendedEncoder(pyjson.JSONEncoder):


 ### URL PARSING TESTS / ASSERTIONS
-# they run at runtime because I like having them inline in this file,
-# I like the peace of mind knowing it's enforced at runtime across all OS's (in case the regex engine ever has any weird locale-specific quirks),
-# and these assertions are basically instant, so not a big performance cost to do it on startup
+
+# Check that plain text regex URL parsing works as expected
+# this is last-line-of-defense to make sure the URL_REGEX isn't
+# misbehaving due to some OS-level or environment level quirks (e.g. regex engine / cpython / locale differences)
+# the consequences of bad URL parsing could be disastrous and lead to many
+# incorrect/badly parsed links being added to the archive, so this is worth the cost of checking

 assert fix_url_from_markdown('/a(b)c).x(y)z') == '/a(b)c'
 assert fix_url_from_markdown('https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def).link(with)_trailingtext') == 'https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def'
@@ -482,3 +485,50 @@ URL_REGEX_TESTS = [
 for urls_str, expected_url_matches in URL_REGEX_TESTS:
     url_matches = list(find_all_urls(urls_str))
     assert url_matches == expected_url_matches, 'FAILED URL_REGEX CHECK!'
+
+
+# More test cases
+_test_url_strs = {
+    'example.com': 0,
+    '/example.com': 0,
+    '//example.com': 0,
+    ':/example.com': 0,
+    '://example.com': 0,
+    'htt://example8.com': 0,
+    '/htt://example.com': 0,
+    'https://example': 1,
+    'https://localhost/2345': 1,
+    'https://localhost:1234/123': 1,
+    '://': 0,
+    'https://': 0,
+    'http://': 0,
+    'ftp://': 0,
+    'ftp://example.com': 0,
+    'https://example.com': 1,
+    'https://example.com/': 1,
+    'https://a.example.com': 1,
+    'https://a.example.com/': 1,
+    'https://a.example.com/what/is/happening.html': 1,
+    'https://a.example.com/what/ís/happening.html': 1,
+    'https://a.example.com/what/is/happening.html?what=1&2%20b#höw-about-this=1a': 1,
+    'https://a.example.com/what/is/happéning/?what=1&2%20b#how-aboüt-this=1a': 1,
+    'HTtpS://a.example.com/what/is/happening/?what=1&2%20b#how-about-this=1af&2f%20b': 1,
+    'https://example.com/?what=1#how-about-this=1&2%20baf': 1,
+    'https://example.com?what=1#how-about-this=1&2%20baf': 1,
+    'http://example7.com': 1,
+    'https://': 0,
+    'https://[test]': 0,
+    'http://"test"': 0,
+    'http://\'test\'': 0,
+    '[https://example8.com/what/is/this.php?what=1]': 1,
+    '[and http://example9.com?what=1&other=3#and-thing=2]': 1,
+    'https://example10.com#and-thing=2 "': 1,
+    'abcdef': 1,
+    'sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi': 1,
+    'http://examplehttp://15.badc': 2,
+    'https://a.example.com/one.html?url=http://example.com/inside/of/another?=http://': 2,
+    '[https://a.example.com/one.html?url=http://example.com/inside/of/another?=](http://a.example.com)': 3,
+}
+for url_str, num_urls in _test_url_strs.items():
+    assert len(list(find_all_urls(url_str))) == num_urls, (
+        f'{url_str} does not contain {num_urls} urls')
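Note: this patch only changes the call sites. The find_all_urls() helper itself (along with URL_REGEX and fix_url_from_markdown) is defined in archivebox/util.py and is not shown in the hunks above. For readers who want a feel for the behaviour the new tests exercise, below is a minimal, hypothetical sketch of such a helper. It is not the actual ArchiveBox implementation: the regex is deliberately simplified, and only the parenthesis-trimming behaviour of fix_url_from_markdown is anchored to the two assertions visible in the util.py hunk.

import re
from typing import Iterator

# Illustrative pattern only; the real URL_REGEX in archivebox/util.py is more
# elaborate (unicode-aware, etc.). This one just grabs "http(s)://" followed by
# anything that isn't whitespace, angle/square brackets, or quotes.
URL_REGEX = re.compile(
    r'https?://'            # allowed schemes
    r'[^\s<>"\'\[\]]+',     # everything up to whitespace, brackets, or quotes
    re.IGNORECASE,
)

def fix_url_from_markdown(url: str) -> str:
    """Trim trailing markdown link syntax, e.g. the ')' closing a [text](url) link,
    while keeping balanced parentheses that belong to the URL itself."""
    depth = 0
    for i, char in enumerate(url):
        if char == '(':
            depth += 1
        elif char == ')':
            if depth == 0:
                return url[:i]   # unbalanced ')' -> markdown syntax, not part of the URL
            depth -= 1
    return url

def find_all_urls(text: str) -> Iterator[str]:
    """Yield every URL-like substring found anywhere in a block of free-form text."""
    for candidate in URL_REGEX.findall(text):
        yield fix_url_from_markdown(candidate)

# The first assertion mirrors one of the checks shown in the util.py hunk above;
# the second is a hypothetical usage example.
assert fix_url_from_markdown('/a(b)c).x(y)z') == '/a(b)c'
assert list(find_all_urls('see [docs](https://example.com/a_(b)) for details')) == ['https://example.com/a_(b)']

This sketch will not satisfy every entry in _test_url_strs; for example, the 'http://examplehttp://15.badc': 2 case implies the real pattern can report overlapping/nested URLs, which a plain non-overlapping findall like the one above cannot do.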