From 3ac0efb6195757b46474eddb934c64117fc2f482 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 4 Feb 2019 18:54:02 -0800 Subject: [PATCH] dont parse quotes as part of urls --- archivebox/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivebox/util.py b/archivebox/util.py index 1dbb9f21..f708a4bf 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -43,7 +43,7 @@ base_url = lambda url: without_scheme(url) # uniq base url used to dedupe links short_ts = lambda ts: ts.split('.')[0] -URL_REGEX = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))[^<]+' +URL_REGEX = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))[^<\""]+' def check_dependencies():