From f67a5a215a558e912f0e9b554821674c9618a453 Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Tue, 6 Apr 2021 02:01:38 -0400
Subject: [PATCH] fix readability indexing process and implement a max total character length on indexed content

---
 archivebox/search/backends/sonic.py | 10 ++++++++--
 archivebox/search/utils.py          |  8 ++++----
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/archivebox/search/backends/sonic.py b/archivebox/search/backends/sonic.py
index f3ef6628..0d7a4605 100644
--- a/archivebox/search/backends/sonic.py
+++ b/archivebox/search/backends/sonic.py
@@ -5,13 +5,19 @@ from sonic import IngestClient, SearchClient
 from archivebox.util import enforce_types
 from archivebox.config import SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD, SONIC_BUCKET, SONIC_COLLECTION
 
-MAX_SONIC_TEXT_LENGTH = 2000
+MAX_SONIC_TEXT_TOTAL_LENGTH = 100000000  # dont index more than 100 million characters per text
+MAX_SONIC_TEXT_CHUNK_LENGTH = 2000  # dont index more than 2000 characters per chunk
+
 
 @enforce_types
 def index(snapshot_id: str, texts: List[str]):
     with IngestClient(SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD) as ingestcl:
         for text in texts:
-            chunks = [text[i:i+MAX_SONIC_TEXT_LENGTH] for i in range(0, len(text), MAX_SONIC_TEXT_LENGTH)]
+            max_length = 1000000
+            chunks = (
+                text[i:i+MAX_SONIC_TEXT_CHUNK_LENGTH]
+                for i in range(0, min(len(text), MAX_SONIC_TEXT_TOTAL_LENGTH), MAX_SONIC_TEXT_CHUNK_LENGTH)
+            )
             for chunk in chunks:
                 ingestcl.push(SONIC_COLLECTION, SONIC_BUCKET, snapshot_id, str(chunk))
 
diff --git a/archivebox/search/utils.py b/archivebox/search/utils.py
index 82d1880e..723c7fb5 100644
--- a/archivebox/search/utils.py
+++ b/archivebox/search/utils.py
@@ -36,10 +36,10 @@ def get_indexable_content(results: QuerySet):
 
     # TODO: banish this duplication and get these from the extractor file
     if method == 'readability':
-        return get_file_result_content(res, 'content.txt')
+        return get_file_result_content(res, 'content.txt', use_pwd=True)
     elif method == 'singlefile':
-        return get_file_result_content(res,'',use_pwd=True)
+        return get_file_result_content(res, '', use_pwd=True)
     elif method == 'dom':
-        return get_file_result_content(res,'',use_pwd=True)
+        return get_file_result_content(res, '', use_pwd=True)
     elif method == 'wget':
-        return get_file_result_content(res,'',use_pwd=True)
+        return get_file_result_content(res, '', use_pwd=True)