From 914750c4531877d7b86e997152ed37d965c79151 Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Tue, 19 Mar 2019 18:09:06 -0400
Subject: [PATCH] better title regex to match titles surrounded by newlines

---
 archivebox/util.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/archivebox/util.py b/archivebox/util.py
index fb803732..cd7e9651 100644
--- a/archivebox/util.py
+++ b/archivebox/util.py
@@ -66,9 +66,9 @@ URL_REGEX = re.compile(
     re.IGNORECASE,
 )
 HTML_TITLE_REGEX = re.compile(
-    r'<title>'                      # start matching text after <title> tag
+    r'<title.*?>'                      # start matching text after <title> tag
     r'(.[^<>]+)',                      # get everything up to these symbols
-    re.IGNORECASE,
+    re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE,
 )
 
 ### Checks & Tests