From c37941efd162569c79e566afabd6f250408e8d26 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 31 Jan 2019 19:46:27 -0800 Subject: [PATCH] don't match closing tags in full text --- archivebox/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivebox/util.py b/archivebox/util.py index 2a5dc468..1dbb9f21 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -43,7 +43,7 @@ base_url = lambda url: without_scheme(url) # uniq base url used to dedupe links short_ts = lambda ts: ts.split('.')[0] -URL_REGEX = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+' +URL_REGEX = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))[^<]+' def check_dependencies():