1
0
Fork 0
mirror of synced 2024-06-02 10:34:43 +12:00

support finding multiple urls as substrings in text

This commit is contained in:
Nick Sweeting 2021-03-27 04:30:40 -04:00
parent f3a3d76439
commit 3e26ae4a66

View file

@ -56,11 +56,13 @@ ts_to_iso = lambda ts: ts and parse_date(ts).isoformat()
# Finds every http(s) URL that appears as a substring of a larger text.
#
# The whole pattern sits inside a zero-width lookahead `(?=( ... ))` with one
# capture group.  Because the lookahead consumes no characters, the regex
# engine retries at every position, so a URL nested inside another URL
# (e.g. `http://a.example/redirect/http://b.example/page`) is reported as
# well as the outer one.  Matches themselves are empty strings; the URL text
# is in group 1, which is what `URL_REGEX.findall(text)` returns.
#
# NOTE: inside the character class `[$-_@.&+]`, `$-_` is a *range* (every
# character from `$` through `_`), not three literal characters.  It is kept
# as-is so the set of accepted URLs does not change.
URL_REGEX = re.compile(
    r'(?=('
    r'http[s]?://'                    # start matching from allowed schemes
    r'(?:[a-zA-Z]|[0-9]'              # followed by one allowed alphanum character
    r'|[$-_@.&+]|[!*\(\),]'           # or one allowed symbol
    r'|(?:%[0-9a-fA-F][0-9a-fA-F]))'  # or one percent-encoded byte
    r'[^\]\[\(\)<>"\'\s]+'            # then one or more chars; stop at these symbols
    r'))',
    re.IGNORECASE,
)