From 55a237a435abf14a008db8a55967ac75254a778c Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 28 Jul 2020 05:56:34 -0400 Subject: [PATCH] also set snapshot title inside of fetch_title directly --- archivebox/extractors/title.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/archivebox/extractors/title.py b/archivebox/extractors/title.py index 642c45b7..f75edbb5 100644 --- a/archivebox/extractors/title.py +++ b/archivebox/extractors/title.py @@ -63,7 +63,10 @@ def save_title(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> html = download_url(link.url, timeout=timeout) match = re.search(HTML_TITLE_REGEX, html) output = htmldecode(match.group(1).strip()) if match else None - if not output: + if output: + if not link.title or len(output) >= len(link.title): + Snapshot.objects.filter(url=link.url, timestamp=link.timestamp).update(title=output) + else: raise ArchiveError('Unable to detect page title') except Exception as err: status = 'failed'