From 77917e9b5527cae659604286aec96760e409bf21 Mon Sep 17 00:00:00 2001 From: Ben Muthalaly Date: Mon, 9 Oct 2023 02:00:01 -0500 Subject: [PATCH] Fix HTML title parsing bugs. This slightly modifies the HTML_TITLE_REGEX to fix two parsing errors. The first occurred when title tags were empty (e.g. "<title></title>") which was parsed as "</title". The second occurred when titles were a single character (e.g. "<title>A</title>") which was not matched by the regex, and so would fall back to link.base_url. Now when tags are empty, it falls back to link.base_url, and single character titles are parsed correctly. The way the regex works now is still a bit wonky for some edge cases. I couldn't find any cases of incorrect behavior, but it still might be worth reworking more completely for robustness. --- archivebox/extractors/title.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivebox/extractors/title.py b/archivebox/extractors/title.py index 19a78591..dc496c4e 100644 --- a/archivebox/extractors/title.py +++ b/archivebox/extractors/title.py @@ -26,7 +26,7 @@ from ..logging_util import TimedProgress HTML_TITLE_REGEX = re.compile( r'' # start matching text after tag - r'(.[^<>]+)', # get everything up to these symbols + r'([^<>]+)', # get everything up to these symbols re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE, )