new compiled URL regex with better markdown support

2024-06-03 02:54:32 +12:00 · 2019-02-27 04:49:25 -05:00 · 2019-02-27 04:49:25 -05:00 · ef4c446c8b
parent b2c22a73e6
commit ef4c446c8b
2 changed files with 40 additions and 3 deletions
--- a/archivebox/parse.py
+++ b/archivebox/parse.py
@@ -27,12 +27,15 @@ from util import (
    str_between,
    get_link_type,
    URL_REGEX,
+    check_url_parsing,
 )


 def parse_links(path):
    """parse a list of links dictionaries from a bookmark export file"""
    
+    check_url_parsing()
+
    links = []
    with open(path, 'r', encoding='utf-8') as file:
        print('{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'.format(
@@ -192,7 +195,6 @@ def parse_shaarli_rss_export(rss_file):

        yield info

-
 def parse_netscape_html_export(html_file):
    """Parse netscape-format bookmarks export files (produced by all browsers)"""

--- a/archivebox/util.py
+++ b/archivebox/util.py
@@ -58,8 +58,19 @@ base_url = lambda url: without_scheme(url)  # uniq base url used to dedupe links

 short_ts = lambda ts: ts.split('.')[0]

-URL_REGEX = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))[^<\""]+'
-HTML_TITLE_REGEX = '<title>(.[^<>]+)'
+URL_REGEX = re.compile(
+    r'http[s]?://'                    # scheme: http:// or https:// (case-insensitive via IGNORECASE)
+    r'(?:[a-zA-Z]|[0-9]'              # then exactly one character: a letter or digit
+    r'|[$-_@.&+]|[!*\(\),]'           #    or a symbol (note: `$-_` is a character range, not 3 literals)
+    r'|(?:%[0-9a-fA-F][0-9a-fA-F]))'  #    or a percent-encoded byte (%XX)
+    r'[^\]\[\(\)<>\""\'\s]+',         # then 1+ chars, stopping at [ ] ( ) < > quotes or whitespace
+    re.IGNORECASE,
+)
+HTML_TITLE_REGEX = re.compile(
+    r'<title>'                         # match a literal <title> opening tag (any case)
+    r'(.[^<>]+)',                      # capture one char, then 1+ chars that are not < or >
+    re.IGNORECASE,
+)


 def check_dependencies():
@@ -124,6 +135,30 @@ def check_dependencies():
            raise SystemExit(1)


+def check_url_parsing():
+    """Self-test: assert URL_REGEX finds exactly 12 URLs in the sample text below (examples 13-15 lack an http(s) scheme)"""
+    test_urls = '''
+    https://example1.com/what/is/happening.html?what=1#how-about-this=1
+    https://example2.com/what/is/happening/?what=1#how-about-this=1
+    HTtpS://example3.com/what/is/happening/?what=1#how-about-this=1f
+    https://example4.com/what/is/happening.html
+    https://example5.com/
+    https://example6.com

+    <test>http://example7.com</test>
+    [https://example8.com/what/is/this.php?what=1]
+    [and http://example9.com?what=1&other=3#and-thing=2]
+    <what>https://example10.com#and-thing=2 "</about>
+    abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def
+    sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi
+    example13.bada
+    and example14.badb
+    <or>htt://example15.badc</that>
+    '''
+    # debug aid (lists the matched URLs): print('\n'.join(re.findall(URL_REGEX, test_urls)))
+    assert len(re.findall(URL_REGEX, test_urls)) == 12  # NOTE: assert statements are skipped under `python -O`
+
+
 def chmod_file(path, cwd='.', permissions=OUTPUT_PERMISSIONS, timeout=30):
    """chmod -R <permissions> <cwd>/<path>"""