diff --git a/archivebox/config.py b/archivebox/config.py index cee39b6e..99a1847c 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -141,6 +141,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { 'SAVE_SINGLEFILE': {'type': bool, 'default': True, 'aliases': ('FETCH_SINGLEFILE',)}, 'SAVE_READABILITY': {'type': bool, 'default': True, 'aliases': ('FETCH_READABILITY',)}, 'SAVE_MERCURY': {'type': bool, 'default': True, 'aliases': ('FETCH_MERCURY',)}, + 'SAVE_HTMLTOTEXT': {'type': bool, 'default': True, 'aliases': ('FETCH_HTMLTOTEXT',)}, 'SAVE_PDF': {'type': bool, 'default': True, 'aliases': ('FETCH_PDF',)}, 'SAVE_SCREENSHOT': {'type': bool, 'default': True, 'aliases': ('FETCH_SCREENSHOT',)}, 'SAVE_DOM': {'type': bool, 'default': True, 'aliases': ('FETCH_DOM',)}, @@ -218,6 +219,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { 'SEARCH_BACKEND_HOST_NAME': {'type': str, 'default': 'localhost'}, 'SEARCH_BACKEND_PORT': {'type': int, 'default': 1491}, 'SEARCH_BACKEND_PASSWORD': {'type': str, 'default': 'SecretPassword'}, + 'SEARCH_PROCESS_HTML': {'type': bool, 'default': True}, # SONIC 'SONIC_COLLECTION': {'type': str, 'default': 'archivebox'}, 'SONIC_BUCKET': {'type': str, 'default': 'snapshots'}, diff --git a/archivebox/core/migrations/0022_auto_20231023_2008.py b/archivebox/core/migrations/0022_auto_20231023_2008.py new file mode 100644 index 00000000..1b0becef --- /dev/null +++ b/archivebox/core/migrations/0022_auto_20231023_2008.py @@ -0,0 +1,18 @@ +# Generated by Django 3.1.14 on 2023-10-23 20:08 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0021_auto_20220914_0934'), + ] + + operations = [ + migrations.AlterField( + model_name='archiveresult', + name='extractor', + field=models.CharField(choices=[('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('title', 'title'), 
('readability', 'readability'), ('mercury', 'mercury'), ('htmltotext', 'htmltotext'), ('git', 'git'), ('media', 'media'), ('archive_org', 'archive_org')], max_length=32), + ), + ] diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py index 38710182..edcf218b 100644 --- a/archivebox/extractors/__init__.py +++ b/archivebox/extractors/__init__.py @@ -37,6 +37,7 @@ from .wget import should_save_wget, save_wget from .singlefile import should_save_singlefile, save_singlefile from .readability import should_save_readability, save_readability from .mercury import should_save_mercury, save_mercury +from .htmltotext import should_save_htmltotext, save_htmltotext from .pdf import should_save_pdf, save_pdf from .screenshot import should_save_screenshot, save_screenshot from .dom import should_save_dom, save_dom @@ -59,14 +60,26 @@ def get_default_archive_methods() -> List[ArchiveMethodEntry]: ('screenshot', should_save_screenshot, save_screenshot), ('dom', should_save_dom, save_dom), ('wget', should_save_wget, save_wget), - ('title', should_save_title, save_title), # keep title and readability below wget and singlefile, as it depends on them + # keep title, readability, and htmltotext below wget and singlefile, as they depend on them + ('title', should_save_title, save_title), ('readability', should_save_readability, save_readability), ('mercury', should_save_mercury, save_mercury), + ('htmltotext', should_save_htmltotext, save_htmltotext), ('git', should_save_git, save_git), ('media', should_save_media, save_media), ('archive_org', should_save_archive_dot_org, save_archive_dot_org), ] +ARCHIVE_METHODS_INDEXING_PRECEDENCE = [ + ('readability', 1), + ('mercury', 2), + ('htmltotext', 3), + ('singlefile', 4), + ('dom', 5), + ('wget', 6) +] + + @enforce_types def get_archive_methods_for_link(link: Link) -> Iterable[ArchiveMethodEntry]: DEFAULT_METHODS = get_default_archive_methods() @@ -86,8 +99,6 @@ def get_archive_methods_for_link(link: Link) -> 
Iterable[ArchiveMethodEntry]: return (m for m in DEFAULT_METHODS if m[0] in allowed_methods) -ARCHIVE_METHODS_INDEXING_PRECEDENCE = [('readability', 1), ('singlefile', 2), ('dom', 3), ('wget', 4)] - @enforce_types def ignore_methods(to_ignore: List[str]) -> Iterable[str]: ARCHIVE_METHODS = get_default_archive_methods() diff --git a/archivebox/extractors/htmltotext.py b/archivebox/extractors/htmltotext.py new file mode 100644 index 00000000..18722f13 --- /dev/null +++ b/archivebox/extractors/htmltotext.py @@ -0,0 +1,154 @@ +__package__ = 'archivebox.extractors' + +from html.parser import HTMLParser +import io +from pathlib import Path +from typing import Optional + +from ..config import ( + SAVE_HTMLTOTEXT, + TIMEOUT, + VERSION, +) +from ..index.schema import Link, ArchiveResult, ArchiveError +from ..logging_util import TimedProgress +from ..system import atomic_write +from ..util import ( + enforce_types, + is_static_file, +) +from .title import get_html + +class HTMLTextExtractor(HTMLParser): + TEXT_ATTRS = [ + "alt", "cite", "href", "label", + "list", "placeholder", "title", "value" + ] + NOTEXT_TAGS = ["script", "style", "template"] + NOTEXT_HREF = ["data:", "javascript:", "#"] + + def __init__(self): + super().__init__() + + self.output = io.StringIO() + self._tag_stack = [] + + def _is_text_attr(self, name, value): + if not isinstance(value, str): + return False + if name == "href" and any(map(lambda p: value.startswith(p), self.NOTEXT_HREF)): + return False + + if name in self.TEXT_ATTRS: + return True + + return False + + def _parent_tag(self): + try: + return self._tag_stack[-1] + except IndexError: + return None + + def _in_notext_tag(self): + return any([t in self._tag_stack for t in self.NOTEXT_TAGS]) + + def handle_starttag(self, tag, attrs): + self._tag_stack.append(tag) + + # Don't write out attribute values if any ancestor + # is in NOTEXT_TAGS + if self._in_notext_tag(): + return + + for name, value in attrs: + if self._is_text_attr(name, value): + 
+                self.output.write(f"({value.strip()}) ")
+
+    def handle_endtag(self, tag):
+        orig_stack = self._tag_stack.copy()
+        try:
+            # Keep popping tags until we find the nearest
+            # ancestor matching this end tag
+            while tag != self._tag_stack.pop():
+                pass
+            # Write a space after every tag, to ensure that tokens
+            # in tag text aren't concatenated. This may result in
+            # excess spaces, which should be ignored by search tokenizers.
+            if not self._in_notext_tag() and tag not in self.NOTEXT_TAGS:
+                self.output.write(" ")
+        except IndexError:
+            # Got to the top of the stack, but somehow missed
+            # this end tag -- maybe malformed markup -- restore the
+            # stack
+            self._tag_stack = orig_stack
+
+    def handle_data(self, data):
+        # Don't output text data if any ancestor is in NOTEXT_TAGS
+        if self._in_notext_tag():
+            return
+
+        data = data.lstrip()
+        len_before_rstrip = len(data)
+        data = data.rstrip()
+        spaces_rstripped = len_before_rstrip - len(data)
+        if data:
+            self.output.write(data)
+            if spaces_rstripped:
+                # Add back a single space if 1 or more
+                # whitespace characters were stripped
+                self.output.write(' ')
+
+    def __str__(self):
+        return self.output.getvalue()
+
+
+@enforce_types
+def should_save_htmltotext(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
+    if is_static_file(link.url):
+        return False
+
+    out_dir = out_dir or Path(link.link_dir)
+    if not overwrite and (out_dir / 'htmltotext.txt').exists():
+        return False
+
+    return SAVE_HTMLTOTEXT
+
+
+@enforce_types
+def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+    """extract search-indexing-friendly text from an HTML document"""
+
+    out_dir = Path(out_dir or link.link_dir)
+    output = "htmltotext.txt"
+    # Initialize cmd and status before the try block: the except branch was
+    # previously their only assignment, so the ArchiveResult(cmd=cmd, ...,
+    # status=status) below raised NameError on every SUCCESSFUL extraction.
+    cmd = ['(internal) archivebox.extractors.htmltotext', './{singlefile,dom}.html']
+    status = 'succeeded'
+
+    timer = TimedProgress(timeout, prefix=' ')
+    extracted_text = None
+    try:
+        extractor = HTMLTextExtractor()
+        document = get_html(link, out_dir)
+
+        if not document:
+            raise ArchiveError('htmltotext could not find HTML to parse for
article text') + + extractor.feed(document) + extractor.close() + extracted_text = str(extractor) + + atomic_write(str(out_dir / output), extracted_text) + except (Exception, OSError) as err: + status = 'failed' + output = err + cmd = ['(internal) archivebox.extractors.htmltotext', './{singlefile,dom}.html'] + finally: + timer.end() + + return ArchiveResult( + cmd=cmd, + pwd=str(out_dir), + cmd_version=VERSION, + output=output, + status=status, + index_texts=[extracted_text] if extracted_text else [], + **timer.stats, + ) diff --git a/archivebox/index/html.py b/archivebox/index/html.py index c0229674..6b914446 100644 --- a/archivebox/index/html.py +++ b/archivebox/index/html.py @@ -143,7 +143,7 @@ def snapshot_icons(snapshot) -> str: "mercury": "🅼", "warc": "📦" } - exclude = ["favicon", "title", "headers", "archive_org"] + exclude = ["favicon", "title", "headers", "htmltotext", "archive_org"] # Missing specific entry for WARC extractor_outputs = defaultdict(lambda: None) diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py index c44165a9..85972993 100644 --- a/archivebox/index/schema.py +++ b/archivebox/index/schema.py @@ -429,6 +429,7 @@ class Link: 'singlefile_path': 'singlefile.html', 'readability_path': 'readability/content.html', 'mercury_path': 'mercury/content.html', + 'htmltotext_path': 'htmltotext.txt', 'pdf_path': 'output.pdf', 'screenshot_path': 'screenshot.png', 'dom_path': 'output.html', @@ -452,6 +453,7 @@ class Link: 'singlefile_path': static_path, 'readability_path': static_path, 'mercury_path': static_path, + 'htmltotext_path': static_path, }) return canonical diff --git a/tests/fixtures.py b/tests/fixtures.py index cca722f3..e9c0bc48 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -17,6 +17,7 @@ def disable_extractors_dict(): "USE_SINGLEFILE": "false", "USE_READABILITY": "false", "USE_MERCURY": "false", + "SAVE_HTMLTOTEXT": "false", "SAVE_PDF": "false", "SAVE_SCREENSHOT": "false", "SAVE_DOM": "false", diff --git 
a/tests/test_extractors.py b/tests/test_extractors.py index bd6d2775..9568f7ef 100644 --- a/tests/test_extractors.py +++ b/tests/test_extractors.py @@ -78,6 +78,14 @@ def test_mercury_works(tmp_path, process, disable_extractors_dict): output_file = archived_item_path / "mercury" / "content.html" assert output_file.exists() +def test_htmltotext_works(tmp_path, process, disable_extractors_dict): + disable_extractors_dict.update({"SAVE_HTMLTOTEXT": "true"}) + add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], + capture_output=True, env=disable_extractors_dict) + archived_item_path = list(tmp_path.glob("archive/**/*"))[0] + output_file = archived_item_path / "htmltotext.txt" + assert output_file.exists() + def test_readability_works_with_wget(tmp_path, process, disable_extractors_dict): disable_extractors_dict.update({"USE_READABILITY": "true", "USE_WGET": "true"}) add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],