diff --git a/archivebox/extractors/title.py b/archivebox/extractors/title.py index 75068456..bb176bfb 100644 --- a/archivebox/extractors/title.py +++ b/archivebox/extractors/title.py @@ -1,6 +1,7 @@ __package__ = 'archivebox.extractors' import re +from html.parser import HTMLParser from pathlib import Path from typing import Optional @@ -24,11 +25,32 @@ from ..config import ( from ..logging_util import TimedProgress -HTML_TITLE_REGEX = re.compile( - r'' # start matching text after tag - r'(.[^<>]+)', # get everything up to these symbols - re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE, -) +class TitleParser(HTMLParser): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.title_tag = "" + self.title_og = "" + self.inside_title_tag = False + + @property + def title(self): + return self.title_tag or self.title_og or None + + def handle_starttag(self, tag, attrs): + if tag.lower() == "title" and not self.title_tag: + self.inside_title_tag = True + elif tag.lower() == "meta" and not self.title_og: + attrs = dict(attrs) + if attrs.get("property") == "og:title" and attrs.get("content"): + self.title_og = attrs.get("content") + + def handle_data(self, data): + if self.inside_title_tag and data: + self.title_tag += data.strip() + + def handle_endtag(self, tag): + if tag.lower() == "title": + self.inside_title_tag = False @enforce_types @@ -62,8 +84,9 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) - timer = TimedProgress(timeout, prefix=' ') try: html = download_url(link.url, timeout=timeout) - match = re.search(HTML_TITLE_REGEX, html) - output = htmldecode(match.group(1).strip()) if match else None + parser = TitleParser() + parser.feed(html) + output = parser.title if output: if not link.title or len(output) >= len(link.title): Snapshot.objects.filter(url=link.url, timestamp=link.timestamp).update(title=output)