From b6a20c962ac7a2abaac0827853d2ae4a5380f5b1 Mon Sep 17 00:00:00 2001
From: Ross Williams
Date: Thu, 12 Oct 2023 13:06:35 -0400
Subject: [PATCH 1/4] Extract text from singlefile.html when indexing

singlefile.html contains a lot of large strings in the form of `data:`
URLs, which can be unnecessarily stored in full-text indices. Also,
large chunks of JavaScript shouldn't be indexed, either, as they
pollute search results for searches about JS functions, etc.

This commit takes a blanket approach of parsing singlefile.html as it
is read and only outputting text and selected textual attributes (like
`alt`) for indexing.
---
 archivebox/search/utils.py | 100 +++++++++++++++++++++++++++++++++++--
 1 file changed, 95 insertions(+), 5 deletions(-)

diff --git a/archivebox/search/utils.py b/archivebox/search/utils.py
index 723c7fb5..4573ca69 100644
--- a/archivebox/search/utils.py
+++ b/archivebox/search/utils.py
@@ -1,23 +1,113 @@
+from html.parser import HTMLParser
+import io
+
 from django.db.models import QuerySet
 
 from archivebox.util import enforce_types
 from archivebox.config import ANSI
 
+BLOCK_SIZE = 32768
+
 def log_index_started(url):
     print('{green}[*] Indexing url: {} in the search index {reset}'.format(url, **ANSI))
     print( )
 
-def get_file_result_content(res, extra_path, use_pwd=False):
+
+class HTMLTextExtractor(HTMLParser):
+
+    TEXT_ATTRS = ["alt", "cite", "href", "label", "list", "placeholder", "title", "value"]
+    NOTEXT_TAGS = ["script", "style", "template"]
+    NOTEXT_HREF = ["data:", "javascript:", "#"]
+
+    def __init__(self):
+        super().__init__()
+
+        self.output = io.StringIO()
+        self._tag_stack = []
+
+    def _is_text_attr(self, name, value):
+        if not isinstance(value, str):
+            return False
+        if name == "href" and any(map(lambda p: value.startswith(p), self.NOTEXT_HREF)):
+            return False
+
+        if name in self.TEXT_ATTRS:
+            return True
+
+        return False
+
+    def _parent_tag(self):
+        try:
+            return self._tag_stack[-1]
+        except IndexError:
+            return None
+
+    def _in_notext_tag(self):
+        return any([t in self._tag_stack for t in self.NOTEXT_TAGS])
+
+    def handle_starttag(self, tag, attrs):
+        self._tag_stack.append(tag)
+
+        # Don't write out attribute values if any ancestor
+        # is in NOTEXT_TAGS
+        if self._in_notext_tag():
+            return
+
+        for name, value in attrs:
+            if self._is_text_attr(name, value):
+                self.output.write(value.strip())
+                self.output.write(" ")
+
+    def handle_endtag(self, tag):
+        orig_stack = self._tag_stack.copy()
+        try:
+            # Keep popping tags until we find the nearest
+            # ancestor matching this end tag
+            while tag != self._tag_stack.pop():
+                pass
+        except IndexError:
+            # Got to the top of the stack, but somehow missed
+            # this end tag -- maybe malformed markup -- restore the
+            # stack
+            self._tag_stack = orig_stack
+
+    def handle_data(self, data):
+        # Don't output text data if any ancestor is in NOTEXT_TAGS
+        if self._in_notext_tag():
+            return
+        if stripped := data.strip():
+            self.output.write(stripped)
+            self.output.write(" ")
+
+    def __str__(self):
+        return self.output.getvalue()
+
+
+def _read_all(file: io.TextIOBase) -> str:
+    return file.read()
+
+
+def _extract_html_text(file: io.TextIOBase) -> str:
+    extractor = HTMLTextExtractor()
+    while (block := file.read(BLOCK_SIZE)):
+        extractor.feed(block)
+    else:
+        extractor.close()
+
+    return str(extractor)
+
+
+def get_file_result_content(res, extra_path, use_pwd=False, *, filter=_read_all):
     if use_pwd:
         fpath = f'{res.pwd}/{res.output}'
     else:
         fpath = f'{res.output}'
-    
+
     if extra_path:
         fpath = f'{fpath}/{extra_path}'
 
-    with open(fpath, 'r', encoding='utf-8') as file:
-        data = file.read()
+    with open(fpath, 'r', encoding='utf-8', errors='replace') as file:
+        data = filter(file)
     if data:
         return [data]
     return []
@@ -38,7 +128,7 @@ def get_indexable_content(results: QuerySet):
     if method == 'readability':
         return get_file_result_content(res, 'content.txt', use_pwd=True)
     elif method == 'singlefile':
-        return get_file_result_content(res, '', use_pwd=True)
+        return get_file_result_content(res, '', use_pwd=True, filter=_extract_html_text)
     elif method == 'dom':
         return get_file_result_content(res, '', use_pwd=True)
     elif method == 'wget':

From d8aa84ac9864c8a31eed2abcc1dff7901b7e047c Mon Sep 17 00:00:00 2001
From: Ross Williams
Date: Thu, 12 Oct 2023 13:14:39 -0400
Subject: [PATCH 2/4] Make extracting text for indexing optional

Add a configuration option to enable/disable HTML text extraction for
indexing
---
 archivebox/config.py       | 1 +
 archivebox/search/utils.py | 5 +++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/archivebox/config.py b/archivebox/config.py
index 795b98e9..4286ce58 100644
--- a/archivebox/config.py
+++ b/archivebox/config.py
@@ -209,6 +209,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
        'SEARCH_BACKEND_HOST_NAME': {'type': str, 'default': 'localhost'},
        'SEARCH_BACKEND_PORT': {'type': int, 'default': 1491},
        'SEARCH_BACKEND_PASSWORD': {'type': str, 'default': 'SecretPassword'},
+       'SEARCH_PROCESS_HTML': {'type': bool, 'default': True},
        # SONIC
        'SONIC_COLLECTION': {'type': str, 'default': 'archivebox'},
        'SONIC_BUCKET': {'type': str, 'default': 'snapshots'},
diff --git a/archivebox/search/utils.py b/archivebox/search/utils.py
index 4573ca69..f734908c 100644
--- a/archivebox/search/utils.py
+++ b/archivebox/search/utils.py
@@ -4,7 +4,7 @@ import io
 from django.db.models import QuerySet
 
 from archivebox.util import enforce_types
-from archivebox.config import ANSI
+from archivebox.config import ANSI, SEARCH_PROCESS_HTML
 
 BLOCK_SIZE = 32768
 
@@ -128,7 +128,8 @@ def get_indexable_content(results: QuerySet):
     if method == 'readability':
         return get_file_result_content(res, 'content.txt', use_pwd=True)
     elif method == 'singlefile':
-        return get_file_result_content(res, '', use_pwd=True, filter=_extract_html_text)
+        filter = _extract_html_text if SEARCH_PROCESS_HTML else _read_all
+        return get_file_result_content(res, '', use_pwd=True, filter=filter)
     elif method == 'dom':
         return get_file_result_content(res, '', use_pwd=True)
     elif method == 'wget':

From 6555719489dde081ad01ed89d5aa657993534f3e Mon Sep 17 00:00:00 2001
From: Ross Williams
Date: Fri, 13 Oct 2023 18:01:32 -0400
Subject: [PATCH 3/4] Add space after tags when extracting text

Add space after any close tag to ensure that tokens that would be
rendered separate in HTML get extracted as separate tokens in text.

Example: `<div>
<p>
First</p>
<p>
Second</p>
</div>
` --> `First Second` NOT `FirstSecond`
---
 archivebox/search/utils.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/archivebox/search/utils.py b/archivebox/search/utils.py
index f734908c..348b5603 100644
--- a/archivebox/search/utils.py
+++ b/archivebox/search/utils.py
@@ -65,6 +65,11 @@ class HTMLTextExtractor(HTMLParser):
             # ancestor matching this end tag
             while tag != self._tag_stack.pop():
                 pass
+            # Write a space after every tag, to ensure that tokens
+            # in tag text aren't concatenated. This may result in
+            # excess spaces, which should be ignored by search tokenizers.
+            if not self._in_notext_tag() and tag not in self.NOTEXT_TAGS:
+                self.output.write(" ")
         except IndexError:
             # Got to the top of the stack, but somehow missed
             # this end tag -- maybe malformed markup -- restore the
@@ -75,9 +80,8 @@ class HTMLTextExtractor(HTMLParser):
         # Don't output text data if any ancestor is in NOTEXT_TAGS
         if self._in_notext_tag():
             return
-        if stripped := data.strip():
-            self.output.write(stripped)
-            self.output.write(" ")
+
+        self.output.write(data)
 
     def __str__(self):
         return self.output.getvalue()

From 310b4d124259f1593a8cb497cc5c7a2aba658504 Mon Sep 17 00:00:00 2001
From: Ross Williams
Date: Mon, 23 Oct 2023 21:42:25 -0400
Subject: [PATCH 4/4] Add htmltotext extractor

Saves HTML text nodes and selected element attributes in
`htmltotext.txt` for each Snapshot. Primarily intended to be used for
search indexing.
---
 archivebox/config.py                        |   1 +
 .../migrations/0022_auto_20231023_2008.py   |  18 ++
 archivebox/extractors/__init__.py           |  14 +-
 archivebox/extractors/htmltotext.py         | 156 ++++++++++++++++++++
 archivebox/index/html.py                    |   2 +-
 archivebox/index/schema.py                  |   2 +
 archivebox/search/utils.py                  | 107 +--------------
 tests/fixtures.py                           |   1 +
 tests/test_extractors.py                    |   8 +
 9 files changed, 205 insertions(+), 104 deletions(-)
 create mode 100644 archivebox/core/migrations/0022_auto_20231023_2008.py
 create mode 100644 archivebox/extractors/htmltotext.py

diff --git a/archivebox/config.py b/archivebox/config.py
index 4286ce58..37bebfc1 100644
--- a/archivebox/config.py
+++ b/archivebox/config.py
@@ -134,6 +134,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
        'SAVE_SINGLEFILE': {'type': bool, 'default': True, 'aliases': ('FETCH_SINGLEFILE',)},
        'SAVE_READABILITY': {'type': bool, 'default': True, 'aliases': ('FETCH_READABILITY',)},
        'SAVE_MERCURY': {'type': bool, 'default': True, 'aliases': ('FETCH_MERCURY',)},
+       'SAVE_HTMLTOTEXT': {'type': bool, 'default': True, 'aliases': ('FETCH_HTMLTOTEXT',)},
        'SAVE_PDF': {'type': bool, 'default': True, 'aliases': ('FETCH_PDF',)},
        'SAVE_SCREENSHOT': {'type': bool, 'default': True, 'aliases': ('FETCH_SCREENSHOT',)},
        'SAVE_DOM': {'type': bool, 'default': True, 'aliases': ('FETCH_DOM',)},
diff --git a/archivebox/core/migrations/0022_auto_20231023_2008.py b/archivebox/core/migrations/0022_auto_20231023_2008.py
new file mode 100644
index 00000000..1b0becef
--- /dev/null
+++ b/archivebox/core/migrations/0022_auto_20231023_2008.py
@@ -0,0 +1,18 @@
+# Generated by Django 3.1.14 on 2023-10-23 20:08
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0021_auto_20220914_0934'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='archiveresult',
+            name='extractor',
+            field=models.CharField(choices=[('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('title', 'title'), ('readability', 'readability'), ('mercury', 'mercury'), ('htmltotext', 'htmltotext'), ('git', 'git'), ('media', 'media'), ('archive_org', 'archive_org')], max_length=32),
+        ),
+    ]
diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py
index 3ca9cfa7..183f9824 100644
--- a/archivebox/extractors/__init__.py
+++ b/archivebox/extractors/__init__.py
@@ -33,6 +33,7 @@ from .wget import should_save_wget, save_wget
 from .singlefile import should_save_singlefile, save_singlefile
 from .readability import should_save_readability, save_readability
 from .mercury import should_save_mercury, save_mercury
+from .htmltotext import should_save_htmltotext, save_htmltotext
 from .pdf import should_save_pdf, save_pdf
 from .screenshot import should_save_screenshot, save_screenshot
 from .dom import should_save_dom, save_dom
@@ -51,15 +52,24 @@ def get_default_archive_methods():
         ('screenshot', should_save_screenshot, save_screenshot),
         ('dom', should_save_dom, save_dom),
         ('wget', should_save_wget, save_wget),
-        ('title', should_save_title, save_title), # keep title and readability below wget and singlefile, as it depends on them
+        # keep title, readability, and htmltotext below wget and singlefile, as they depend on them
+        ('title', should_save_title, save_title),
         ('readability', should_save_readability, save_readability),
         ('mercury', should_save_mercury, save_mercury),
+        ('htmltotext', should_save_htmltotext, save_htmltotext),
         ('git', should_save_git, save_git),
         ('media', should_save_media, save_media),
         ('archive_org', should_save_archive_dot_org, save_archive_dot_org),
     ]
 
-ARCHIVE_METHODS_INDEXING_PRECEDENCE = [('readability', 1), ('singlefile', 2), ('dom', 3), ('wget', 4)]
+ARCHIVE_METHODS_INDEXING_PRECEDENCE = [
+    ('readability', 1),
+    ('mercury', 2),
+    ('htmltotext', 3),
+    ('singlefile', 4),
+    ('dom', 5),
+    ('wget', 6)
+]
 
 @enforce_types
 def ignore_methods(to_ignore: List[str]):
diff --git a/archivebox/extractors/htmltotext.py b/archivebox/extractors/htmltotext.py
new file mode 100644
index 00000000..18722f13
--- /dev/null
+++ b/archivebox/extractors/htmltotext.py
@@ -0,0 +1,156 @@
+__package__ = 'archivebox.extractors'
+
+from html.parser import HTMLParser
+import io
+from pathlib import Path
+from typing import Optional
+
+from ..config import (
+    SAVE_HTMLTOTEXT,
+    TIMEOUT,
+    VERSION,
+)
+from ..index.schema import Link, ArchiveResult, ArchiveError
+from ..logging_util import TimedProgress
+from ..system import atomic_write
+from ..util import (
+    enforce_types,
+    is_static_file,
+)
+from .title import get_html
+
+
+class HTMLTextExtractor(HTMLParser):
+    TEXT_ATTRS = [
+        "alt", "cite", "href", "label",
+        "list", "placeholder", "title", "value"
+    ]
+    NOTEXT_TAGS = ["script", "style", "template"]
+    NOTEXT_HREF = ["data:", "javascript:", "#"]
+
+    def __init__(self):
+        super().__init__()
+
+        self.output = io.StringIO()
+        self._tag_stack = []
+
+    def _is_text_attr(self, name, value):
+        if not isinstance(value, str):
+            return False
+        if name == "href" and any(map(lambda p: value.startswith(p), self.NOTEXT_HREF)):
+            return False
+
+        if name in self.TEXT_ATTRS:
+            return True
+
+        return False
+
+    def _parent_tag(self):
+        try:
+            return self._tag_stack[-1]
+        except IndexError:
+            return None
+
+    def _in_notext_tag(self):
+        return any([t in self._tag_stack for t in self.NOTEXT_TAGS])
+
+    def handle_starttag(self, tag, attrs):
+        self._tag_stack.append(tag)
+
+        # Don't write out attribute values if any ancestor
+        # is in NOTEXT_TAGS
+        if self._in_notext_tag():
+            return
+
+        for name, value in attrs:
+            if self._is_text_attr(name, value):
+                self.output.write(f"({value.strip()}) ")
+
+    def handle_endtag(self, tag):
+        orig_stack = self._tag_stack.copy()
+        try:
+            # Keep popping tags until we find the nearest
+            # ancestor matching this end tag
+            while tag != self._tag_stack.pop():
+                pass
+            # Write a space after every tag, to ensure that tokens
+            # in tag text aren't concatenated. This may result in
+            # excess spaces, which should be ignored by search tokenizers.
+            if not self._in_notext_tag() and tag not in self.NOTEXT_TAGS:
+                self.output.write(" ")
+        except IndexError:
+            # Got to the top of the stack, but somehow missed
+            # this end tag -- maybe malformed markup -- restore the
+            # stack
+            self._tag_stack = orig_stack
+
+    def handle_data(self, data):
+        # Don't output text data if any ancestor is in NOTEXT_TAGS
+        if self._in_notext_tag():
+            return
+
+        data = data.lstrip()
+        len_before_rstrip = len(data)
+        data = data.rstrip()
+        spaces_rstripped = len_before_rstrip - len(data)
+        if data:
+            self.output.write(data)
+            if spaces_rstripped:
+                # Add back a single space if 1 or more
+                # whitespace characters were stripped
+                self.output.write(' ')
+
+    def __str__(self):
+        return self.output.getvalue()
+
+
+@enforce_types
+def should_save_htmltotext(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
+    if is_static_file(link.url):
+        return False
+
+    out_dir = out_dir or Path(link.link_dir)
+    if not overwrite and (out_dir / 'htmltotext.txt').exists():
+        return False
+
+    return SAVE_HTMLTOTEXT
+
+
+@enforce_types
+def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+    """extract search-indexing-friendly text from an HTML document"""
+
+    out_dir = Path(out_dir or link.link_dir)
+    output = "htmltotext.txt"
+
+    timer = TimedProgress(timeout, prefix=' ')
+    extracted_text = None
+    status = 'failed'
+    cmd = ['(internal) archivebox.extractors.htmltotext', './{singlefile,dom}.html']
+    try:
+        extractor = HTMLTextExtractor()
+        document = get_html(link, out_dir)
+
+        if not document:
+            raise ArchiveError('htmltotext could not find HTML to parse for article text')
+
+        extractor.feed(document)
+        extractor.close()
+        extracted_text = str(extractor)
+
+        atomic_write(str(out_dir / output), extracted_text)
+        status = 'succeeded'
+    except (Exception, OSError) as err:
+        output = err
+    finally:
+        timer.end()
+
+    return ArchiveResult(
+        cmd=cmd,
+        pwd=str(out_dir),
+        cmd_version=VERSION,
+        output=output,
+        status=status,
+        index_texts=[extracted_text] if extracted_text else [],
+        **timer.stats,
+    )
diff --git a/archivebox/index/html.py b/archivebox/index/html.py
index c0229674..6b914446 100644
--- a/archivebox/index/html.py
+++ b/archivebox/index/html.py
@@ -143,7 +143,7 @@ def snapshot_icons(snapshot) -> str:
         "mercury": "🅼",
         "warc": "📦"
     }
-    exclude = ["favicon", "title", "headers", "archive_org"]
+    exclude = ["favicon", "title", "headers", "htmltotext", "archive_org"]
     # Missing specific entry for WARC
 
     extractor_outputs = defaultdict(lambda: None)
diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py
index c44165a9..85972993 100644
--- a/archivebox/index/schema.py
+++ b/archivebox/index/schema.py
@@ -429,6 +429,7 @@ class Link:
             'singlefile_path': 'singlefile.html',
             'readability_path': 'readability/content.html',
             'mercury_path': 'mercury/content.html',
+            'htmltotext_path': 'htmltotext.txt',
             'pdf_path': 'output.pdf',
             'screenshot_path': 'screenshot.png',
             'dom_path': 'output.html',
@@ -452,6 +453,7 @@ class Link:
                 'singlefile_path': static_path,
                 'readability_path': static_path,
                 'mercury_path': static_path,
+                'htmltotext_path': static_path,
             })
         return canonical
diff --git a/archivebox/search/utils.py b/archivebox/search/utils.py
index 348b5603..723c7fb5 100644
--- a/archivebox/search/utils.py
+++ b/archivebox/search/utils.py
@@ -1,117 +1,23 @@
-from html.parser import HTMLParser
-import io
-
 from django.db.models import QuerySet
 
 from archivebox.util import enforce_types
-from archivebox.config import ANSI, SEARCH_PROCESS_HTML
-
-BLOCK_SIZE = 32768
+from archivebox.config import ANSI
 
 def log_index_started(url):
     print('{green}[*] Indexing url: {} in the search index {reset}'.format(url, **ANSI))
     print( )
 
-
-class HTMLTextExtractor(HTMLParser):
-
-    TEXT_ATTRS = ["alt", "cite", "href", "label", "list", "placeholder", "title", "value"]
-    NOTEXT_TAGS = ["script", "style", "template"]
-    NOTEXT_HREF = ["data:", "javascript:", "#"]
-
-    def __init__(self):
-        super().__init__()
-
-        self.output = io.StringIO()
-        self._tag_stack = []
-
-    def _is_text_attr(self, name, value):
-        if not isinstance(value, str):
-            return False
-        if name == "href" and any(map(lambda p: value.startswith(p), self.NOTEXT_HREF)):
-            return False
-
-        if name in self.TEXT_ATTRS:
-            return True
-
-        return False
-
-    def _parent_tag(self):
-        try:
-            return self._tag_stack[-1]
-        except IndexError:
-            return None
-
-    def _in_notext_tag(self):
-        return any([t in self._tag_stack for t in self.NOTEXT_TAGS])
-
-    def handle_starttag(self, tag, attrs):
-        self._tag_stack.append(tag)
-
-        # Don't write out attribute values if any ancestor
-        # is in NOTEXT_TAGS
-        if self._in_notext_tag():
-            return
-
-        for name, value in attrs:
-            if self._is_text_attr(name, value):
-                self.output.write(value.strip())
-                self.output.write(" ")
-
-    def handle_endtag(self, tag):
-        orig_stack = self._tag_stack.copy()
-        try:
-            # Keep popping tags until we find the nearest
-            # ancestor matching this end tag
-            while tag != self._tag_stack.pop():
-                pass
-            # Write a space after every tag, to ensure that tokens
-            # in tag text aren't concatenated. This may result in
-            # excess spaces, which should be ignored by search tokenizers.
-            if not self._in_notext_tag() and tag not in self.NOTEXT_TAGS:
-                self.output.write(" ")
-        except IndexError:
-            # Got to the top of the stack, but somehow missed
-            # this end tag -- maybe malformed markup -- restore the
-            # stack
-            self._tag_stack = orig_stack
-
-    def handle_data(self, data):
-        # Don't output text data if any ancestor is in NOTEXT_TAGS
-        if self._in_notext_tag():
-            return
-
-        self.output.write(data)
-
-    def __str__(self):
-        return self.output.getvalue()
-
-
-def _read_all(file: io.TextIOBase) -> str:
-    return file.read()
-
-
-def _extract_html_text(file: io.TextIOBase) -> str:
-    extractor = HTMLTextExtractor()
-    while (block := file.read(BLOCK_SIZE)):
-        extractor.feed(block)
-    else:
-        extractor.close()
-
-    return str(extractor)
-
-
-def get_file_result_content(res, extra_path, use_pwd=False, *, filter=_read_all):
+def get_file_result_content(res, extra_path, use_pwd=False):
     if use_pwd:
         fpath = f'{res.pwd}/{res.output}'
     else:
         fpath = f'{res.output}'
-
+    
     if extra_path:
         fpath = f'{fpath}/{extra_path}'
 
-    with open(fpath, 'r', encoding='utf-8', errors='replace') as file:
-        data = filter(file)
+    with open(fpath, 'r', encoding='utf-8') as file:
+        data = file.read()
     if data:
         return [data]
     return []
@@ -132,8 +38,7 @@ def get_indexable_content(results: QuerySet):
     if method == 'readability':
         return get_file_result_content(res, 'content.txt', use_pwd=True)
     elif method == 'singlefile':
-        filter = _extract_html_text if SEARCH_PROCESS_HTML else _read_all
-        return get_file_result_content(res, '', use_pwd=True, filter=filter)
+        return get_file_result_content(res, '', use_pwd=True)
     elif method == 'dom':
         return get_file_result_content(res, '', use_pwd=True)
     elif method == 'wget':
diff --git a/tests/fixtures.py b/tests/fixtures.py
index cca722f3..e9c0bc48 100644
--- a/tests/fixtures.py
+++ b/tests/fixtures.py
@@ -17,6 +17,7 @@ def disable_extractors_dict():
         "USE_SINGLEFILE": "false",
         "USE_READABILITY": "false",
         "USE_MERCURY": "false",
+        "SAVE_HTMLTOTEXT": "false",
         "SAVE_PDF": "false",
         "SAVE_SCREENSHOT": "false",
         "SAVE_DOM": "false",
diff --git a/tests/test_extractors.py b/tests/test_extractors.py
index 86b50d51..bf67b853 100644
--- a/tests/test_extractors.py
+++ b/tests/test_extractors.py
@@ -39,6 +39,14 @@ def test_mercury_works(tmp_path, process, disable_extractors_dict):
     output_file = archived_item_path / "mercury" / "content.html"
     assert output_file.exists()
 
+def test_htmltotext_works(tmp_path, process, disable_extractors_dict):
+    disable_extractors_dict.update({"SAVE_HTMLTOTEXT": "true"})
+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
+                                 capture_output=True, env=disable_extractors_dict)
+    archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
+    output_file = archived_item_path / "htmltotext.txt"
+    assert output_file.exists()
+
 def test_readability_works_with_wget(tmp_path, process, disable_extractors_dict):
     disable_extractors_dict.update({"USE_READABILITY": "true", "USE_WGET": "true"})
     add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
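
A quick way to sanity-check the behavior this series describes is to drive
the final (PATCH 4/4) version of HTMLTextExtractor directly. This is a
minimal sketch, assuming the series above is applied and the snippet runs
inside an initialized ArchiveBox environment so the archivebox.* imports
resolve:

    from archivebox.extractors.htmltotext import HTMLTextExtractor

    extractor = HTMLTextExtractor()
    # The PATCH 3/4 example: two block elements whose text must remain
    # separate tokens in the extracted output.
    extractor.feed('<div><p>First</p><p>Second</p></div>')
    # Whitelisted attributes (TEXT_ATTRS) are emitted in parentheses, while
    # data:/javascript:/# hrefs and script/style/template bodies are skipped.
    extractor.feed('<a href="#top" title="Home">link</a><script>var x = 1;</script>')
    extractor.close()

    print(str(extractor))
    # -> 'First Second  (Home) link ' -- the excess spaces are expected and
    #    should be ignored by search tokenizers

Since SAVE_HTMLTOTEXT defaults to True, the extractor runs for every new
Snapshot; it can be disabled the same way the test fixtures do, e.g. with
`archivebox config --set SAVE_HTMLTOTEXT=false`.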