__package__ = 'archivebox.parsers'


import re

from typing import IO, Iterable, Optional
from datetime import datetime, timezone

from ..index.schema import Link
from ..util import (
    htmldecode,
    enforce_types,
    URL_REGEX,
)
from html.parser import HTMLParser
from urllib.parse import urljoin


class HrefParser(HTMLParser):
    """Collect the ``href`` values of every ``<a>`` start tag fed to the parser.

    The collected values are appended, in document order, to ``self.urls``.
    """

    def __init__(self):
        super().__init__()
        # href values seen so far, in the order the tags were encountered
        self.urls = []

    def handle_starttag(self, tag, attrs):
        """Record the href of an ``<a>`` tag; all other tags are ignored.

        ``attrs`` is the list of ``(name, value)`` pairs supplied by
        ``HTMLParser``.
        """
        if tag != "a":
            return

        for attr, value in attrs:
            # A valueless attribute (``<a href>``) is reported with value None.
            # Skip it: there is no URL to extract, and passing None on to
            # re.findall() would raise TypeError.
            if attr == "href" and value is not None:
                self.urls.append(value)


@enforce_types
def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None, **_kwargs) -> Iterable[Link]:
    """Parse Generic HTML for href tags and use only the url (support for title coming later)

    Args:
        html_file: text file object to scan. It is rewound with ``seek(0)``
            before reading, and its ``name`` attribute is recorded as the
            source of every yielded Link.
        root_url: optional base URL; when given, each href is resolved
            against it with ``urljoin`` before URL extraction.
        **_kwargs: accepted and ignored.

    Yields:
        One Link per URL_REGEX match found in each href, with title and tags
        set to None and the timestamp set to the current UTC time.

    Note:
        Each line is parsed independently with a fresh parser, so an ``<a>``
        tag that is split across several lines is not detected.
    """

    html_file.seek(0)
    for line in html_file:
        parser = HrefParser()
        # example line:
        # <li><a href="https://example.com/">example title</a></li>
        parser.feed(line)

        for url in parser.urls:
            if root_url:
                # resolve relative urls /home.html -> https://example.com/home.html
                url = urljoin(root_url, url)

            for archivable_url in re.findall(URL_REGEX, url):
                yield Link(
                    url=htmldecode(archivable_url),
                    timestamp=str(datetime.now(timezone.utc).timestamp()),
                    title=None,
                    tags=None,
                    sources=[html_file.name],
                )


KEY = 'html'
NAME = 'Generic HTML'
PARSER = parse_generic_html_export