From c53ec45a296b264ca4d0a1301f33b580e88b4c3f Mon Sep 17 00:00:00 2001 From: Ross Williams Date: Sun, 8 Oct 2023 15:01:59 -0400 Subject: [PATCH 1/6] WIP: add sqlite search backend boilerplate --- archivebox/search/backends/sqlite.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 archivebox/search/backends/sqlite.py diff --git a/archivebox/search/backends/sqlite.py b/archivebox/search/backends/sqlite.py new file mode 100644 index 00000000..622673b7 --- /dev/null +++ b/archivebox/search/backends/sqlite.py @@ -0,0 +1,15 @@ +from typing import List, Generator + +from archivebox.util import enforce_types + +@enforce_types +def index(snapshot_id: str, texts: List[str]): + pass + +@enforce_types +def search(text: str) -> List[str]: + pass + +@enforce_types +def flush(snapshot_ids: Generator[str, None, None]): + pass From 8fe5faf4d062da9de8c0d4f9261287b691bc3193 Mon Sep 17 00:00:00 2001 From: Ross Williams Date: Mon, 9 Oct 2023 16:43:08 -0400 Subject: [PATCH 2/6] Introduce SQLite FTS5-powered search backend Use SQLite's FTS5 extension to power full-text search without any additional dependencies. FTS5 was introduced in SQLite 3.9.0, [released][1] in 2015 so should be available on most SQLite installations at this point in time. 
[1]: https://www.sqlite.org/changes.html#version_3_9_0 --- archivebox/config.py | 6 + archivebox/search/backends/sqlite.py | 157 ++++++++++++++++++++++++++- 2 files changed, 159 insertions(+), 4 deletions(-) diff --git a/archivebox/config.py b/archivebox/config.py index 795b98e9..eaf0b6bf 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -213,6 +213,11 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { 'SONIC_COLLECTION': {'type': str, 'default': 'archivebox'}, 'SONIC_BUCKET': {'type': str, 'default': 'snapshots'}, 'SEARCH_BACKEND_TIMEOUT': {'type': int, 'default': 90}, + # SQLite3 FTS5 + 'FTS_SEPARATE_DATABASE': {'type': bool, 'default': True}, + 'FTS_TOKENIZERS': {'type': str, 'default': 'porter unicode61 remove_diacritics 2'}, + # Default from https://www.sqlite.org/limits.html#max_length + 'FTS_SQLITE_MAX_LENGTH': {'type': int, 'default': int(1e9)}, }, 'DEPENDENCY_CONFIG': { @@ -345,6 +350,7 @@ ALLOWED_IN_OUTPUT_DIR = { 'yarn.lock', 'static', 'sonic', + 'search.sqlite3', ARCHIVE_DIR_NAME, SOURCES_DIR_NAME, LOGS_DIR_NAME, diff --git a/archivebox/search/backends/sqlite.py b/archivebox/search/backends/sqlite.py index 622673b7..ef93522f 100644 --- a/archivebox/search/backends/sqlite.py +++ b/archivebox/search/backends/sqlite.py @@ -1,15 +1,164 @@ -from typing import List, Generator +import codecs +from typing import List, Optional, Generator +import sqlite3 from archivebox.util import enforce_types +from archivebox.config import ( + FTS_SEPARATE_DATABASE, + FTS_TOKENIZERS, + FTS_SQLITE_MAX_LENGTH +) + +FTS_TABLE = "snapshot_fts" +FTS_ID_TABLE = "snapshot_id_fts" +FTS_COLUMN = "texts" + +if FTS_SEPARATE_DATABASE: + database = sqlite3.connect("search.sqlite3") + # Make connection callable, because `django.db.connection.cursor()` + # has to be called to get a context manager, but sqlite3.Connection + # is a context manager without being called. + def connection(): + return database + SQLITE_BIND = "?" 
+else: + from django.db import connection as database + connection = database.cursor + SQLITE_BIND = "%s" + +# Only Python >= 3.11 supports sqlite3.Connection.getlimit(), +# so fall back to the default if the API to get the real value isn't present +try: + limit_id = sqlite3.SQLITE_LIMIT_LENGTH + try: + with database.temporary_connection() as cursor: + SQLITE_LIMIT_LENGTH = cursor.connection.getlimit(limit_id) + except AttributeError: + SQLITE_LIMIT_LENGTH = database.getlimit(limit_id) +except AttributeError: + SQLITE_LIMIT_LENGTH = FTS_SQLITE_MAX_LENGTH + + +def _escape_sqlite3(value: str, *, quote: str, errors='strict') -> str: + assert isinstance(quote, str), "quote is not a str" + assert len(quote) == 1, "quote must be a single character" + + encodable = value.encode('utf-8', errors).decode('utf-8') + + nul_index = encodable.find("\x00") + if nul_index >= 0: + error = UnicodeEncodeError("NUL-terminated utf-8", encodable, + nul_index, nul_index + 1, "NUL not allowed") + error_handler = codecs.lookup_error(errors) + replacement, _ = error_handler(error) + encodable = encodable.replace("\x00", replacement) + + return quote + encodable.replace(quote, quote * 2) + quote + +def _escape_sqlite3_value(value: str, errors='strict') -> str: + return _escape_sqlite3(value, quote="'", errors=errors) + +def _escape_sqlite3_identifier(value: str) -> str: + return _escape_sqlite3(value, quote='"', errors='strict') + +@enforce_types +def _create_tables(): + table = _escape_sqlite3_identifier(FTS_TABLE) + # Escape as value, because fts5() expects + # string literal column names + column = _escape_sqlite3_value(FTS_COLUMN) + id_table = _escape_sqlite3_identifier(FTS_ID_TABLE) + tokenizers = _escape_sqlite3_value(FTS_TOKENIZERS) + trigger_name = _escape_sqlite3_identifier(f"{FTS_ID_TABLE}_ad") + + with connection() as cursor: + # Create a contentless-delete FTS5 table that indexes + # but does not store the texts of snapshots + cursor.execute( + f"CREATE VIRTUAL TABLE {table}" + 
f" USING fts5({column}," + f" tokenize={tokenizers}," + " content='', contentless_delete=1);" + ) + # Create a one-to-one mapping between ArchiveBox snapshot_id + # and FTS5 rowid, because the column type of rowid can't be + # customized. + cursor.execute( + f"CREATE TABLE {id_table}(" + " rowid INTEGER PRIMARY KEY AUTOINCREMENT," + " snapshot_id char(32) NOT NULL UNIQUE" + ");" + ) + # Create a trigger to delete items from the FTS5 index when + # the snapshot_id is deleted from the mapping, to maintain + # consistency and make the `flush()` query simpler. + cursor.execute( + f"CREATE TRIGGER {trigger_name}" + f" AFTER DELETE ON {id_table} BEGIN" + f" DELETE FROM {table} WHERE rowid=old.rowid;" + " END;" + ) @enforce_types def index(snapshot_id: str, texts: List[str]): - pass + text = ' '.join(texts)[:SQLITE_LIMIT_LENGTH] + + table = _escape_sqlite3_identifier(FTS_TABLE) + column = _escape_sqlite3_identifier(FTS_COLUMN) + id_table = _escape_sqlite3_identifier(FTS_ID_TABLE) + + with connection() as cursor: + retries = 2 + while retries > 0: + retries -= 1 + try: + # If there is already an FTS index rowid to snapshot_id mapping, + # then don't insert a new one, silently ignoring the operation. + # {id_table}.rowid is AUTOINCREMENT, so will generate an unused + # rowid for the index if it is an unindexed snapshot_id. 
+ cursor.execute( + f"INSERT OR IGNORE INTO {id_table}(snapshot_id) VALUES({SQLITE_BIND})", + [snapshot_id]) + # Fetch the FTS index rowid for the given snapshot_id + id_res = cursor.execute( + f"SELECT rowid FROM {id_table} WHERE snapshot_id = {SQLITE_BIND}", + [snapshot_id]) + rowid = id_res.fetchone()[0] + # (Re-)index the content + cursor.execute( + "INSERT OR REPLACE INTO" + f" {table}(rowid, {column}) VALUES ({SQLITE_BIND}, {SQLITE_BIND})", + [rowid, text]) + # All statements succeeded; break retry loop + break + except Exception as e: + if str(e).startswith(f"no such table:"): + _create_tables() + else: + raise @enforce_types def search(text: str) -> List[str]: - pass + table = _escape_sqlite3_identifier(FTS_TABLE) + id_table = _escape_sqlite3_identifier(FTS_ID_TABLE) + + with connection() as cursor: + res = cursor.execute( + f"SELECT snapshot_id FROM {table}" + f" INNER JOIN {id_table}" + f" ON {id_table}.rowid = {table}.rowid" + f" WHERE {table} MATCH {SQLITE_BIND}", + [text]) + snap_ids = [row[0] for row in res.fetchall()] + return snap_ids @enforce_types def flush(snapshot_ids: Generator[str, None, None]): - pass + snapshot_ids = list(snapshot_ids) + + id_table = _escape_sqlite3_identifier(FTS_ID_TABLE) + + with connection() as cursor: + cursor.executemany( + f"DELETE FROM {id_table} WHERE snapshot_id={SQLITE_BIND}", + [snapshot_ids]) From e0f8eeeaa77b581756d32f6525374695610d3c37 Mon Sep 17 00:00:00 2001 From: Ross Williams Date: Tue, 10 Oct 2023 10:30:52 -0400 Subject: [PATCH 3/6] Improve search.backends.sqlite retry logic Retry with table creation should fail if it is attempted for a second time. 
--- archivebox/search/backends/sqlite.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/archivebox/search/backends/sqlite.py b/archivebox/search/backends/sqlite.py index ef93522f..b2c75bba 100644 --- a/archivebox/search/backends/sqlite.py +++ b/archivebox/search/backends/sqlite.py @@ -129,14 +129,16 @@ def index(snapshot_id: str, texts: List[str]): "INSERT OR REPLACE INTO" f" {table}(rowid, {column}) VALUES ({SQLITE_BIND}, {SQLITE_BIND})", [rowid, text]) - # All statements succeeded; break retry loop - break + # All statements succeeded; return + return except Exception as e: - if str(e).startswith(f"no such table:"): + if str(e).startswith("no such table:") and retries > 0: _create_tables() else: raise + raise RuntimeError("Failed to create tables for SQLite FTS5 search") + @enforce_types def search(text: str) -> List[str]: table = _escape_sqlite3_identifier(FTS_TABLE) From adb9f0ecc9f0f8edee49ccfb2c3f966367c2bce3 Mon Sep 17 00:00:00 2001 From: Ross Williams Date: Fri, 13 Oct 2023 09:26:50 -0400 Subject: [PATCH 4/6] sqlite search: Rename `connection` to `get_connection` `connection` could cause confusion with `django.db.connection` and `get_connection` is a better callable name. --- archivebox/search/backends/sqlite.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/archivebox/search/backends/sqlite.py b/archivebox/search/backends/sqlite.py index b2c75bba..4ed9e79c 100644 --- a/archivebox/search/backends/sqlite.py +++ b/archivebox/search/backends/sqlite.py @@ -15,15 +15,15 @@ FTS_COLUMN = "texts" if FTS_SEPARATE_DATABASE: database = sqlite3.connect("search.sqlite3") - # Make connection callable, because `django.db.connection.cursor()` + # Make get_connection callable, because `django.db.connection.cursor()` # has to be called to get a context manager, but sqlite3.Connection # is a context manager without being called. - def connection(): + def get_connection(): return database SQLITE_BIND = "?" 
else: from django.db import connection as database - connection = database.cursor + get_connection = database.cursor SQLITE_BIND = "%s" # Only Python >= 3.11 supports sqlite3.Connection.getlimit(), @@ -71,7 +71,7 @@ def _create_tables(): tokenizers = _escape_sqlite3_value(FTS_TOKENIZERS) trigger_name = _escape_sqlite3_identifier(f"{FTS_ID_TABLE}_ad") - with connection() as cursor: + with get_connection() as cursor: # Create a contentless-delete FTS5 table that indexes # but does not store the texts of snapshots cursor.execute( @@ -107,7 +107,7 @@ def index(snapshot_id: str, texts: List[str]): column = _escape_sqlite3_identifier(FTS_COLUMN) id_table = _escape_sqlite3_identifier(FTS_ID_TABLE) - with connection() as cursor: + with get_connection() as cursor: retries = 2 while retries > 0: retries -= 1 @@ -144,7 +144,7 @@ def search(text: str) -> List[str]: table = _escape_sqlite3_identifier(FTS_TABLE) id_table = _escape_sqlite3_identifier(FTS_ID_TABLE) - with connection() as cursor: + with get_connection() as cursor: res = cursor.execute( f"SELECT snapshot_id FROM {table}" f" INNER JOIN {id_table}" @@ -160,7 +160,7 @@ def flush(snapshot_ids: Generator[str, None, None]): id_table = _escape_sqlite3_identifier(FTS_ID_TABLE) - with connection() as cursor: + with get_connection() as cursor: cursor.executemany( f"DELETE FROM {id_table} WHERE snapshot_id={SQLITE_BIND}", [snapshot_ids]) From 1e604a1352cfd838d2b7504dad3d92fd3c79e3d3 Mon Sep 17 00:00:00 2001 From: Ross Williams Date: Mon, 16 Oct 2023 14:31:52 -0400 Subject: [PATCH 5/6] sqlite search: clean up errors and type-checking Clean up error handling, and report a better error message on search and flush if FTS5 tables haven't yet been created. Add some mypy comments to clean up type-checking errors. 
--- archivebox/search/backends/sqlite.py | 44 ++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 13 deletions(-) diff --git a/archivebox/search/backends/sqlite.py b/archivebox/search/backends/sqlite.py index 4ed9e79c..b4c61efb 100644 --- a/archivebox/search/backends/sqlite.py +++ b/archivebox/search/backends/sqlite.py @@ -1,5 +1,5 @@ import codecs -from typing import List, Optional, Generator +from typing import List, Generator import sqlite3 from archivebox.util import enforce_types @@ -22,7 +22,7 @@ if FTS_SEPARATE_DATABASE: return database SQLITE_BIND = "?" else: - from django.db import connection as database + from django.db import connection as database # type: ignore[no-redef, assignment] get_connection = database.cursor SQLITE_BIND = "%s" @@ -31,7 +31,7 @@ else: try: limit_id = sqlite3.SQLITE_LIMIT_LENGTH try: - with database.temporary_connection() as cursor: + with database.temporary_connection() as cursor: # type: ignore[attr-defined] SQLITE_LIMIT_LENGTH = cursor.connection.getlimit(limit_id) except AttributeError: SQLITE_LIMIT_LENGTH = database.getlimit(limit_id) @@ -51,6 +51,7 @@ def _escape_sqlite3(value: str, *, quote: str, errors='strict') -> str: nul_index, nul_index + 1, "NUL not allowed") error_handler = codecs.lookup_error(errors) replacement, _ = error_handler(error) + assert isinstance(replacement, str), "handling a UnicodeEncodeError should return a str replacement" encodable = encodable.replace("\x00", replacement) return quote + encodable.replace(quote, quote * 2) + quote @@ -99,6 +100,16 @@ def _create_tables(): " END;" ) +def _handle_query_exception(exc: Exception): + message = str(exc) + if message.startswith("no such table:"): + raise RuntimeError( + "SQLite full-text search index has not yet" + " been created; run `archivebox update --index-only`." 
+ ) + else: + raise exc + @enforce_types def index(snapshot_id: str, texts: List[str]): text = ' '.join(texts)[:SQLITE_LIMIT_LENGTH] @@ -145,22 +156,29 @@ def search(text: str) -> List[str]: id_table = _escape_sqlite3_identifier(FTS_ID_TABLE) with get_connection() as cursor: - res = cursor.execute( - f"SELECT snapshot_id FROM {table}" - f" INNER JOIN {id_table}" - f" ON {id_table}.rowid = {table}.rowid" - f" WHERE {table} MATCH {SQLITE_BIND}", - [text]) + try: + res = cursor.execute( + f"SELECT snapshot_id FROM {table}" + f" INNER JOIN {id_table}" + f" ON {id_table}.rowid = {table}.rowid" + f" WHERE {table} MATCH {SQLITE_BIND}", + [text]) + except Exception as e: + _handle_query_exception(e) + snap_ids = [row[0] for row in res.fetchall()] return snap_ids @enforce_types def flush(snapshot_ids: Generator[str, None, None]): - snapshot_ids = list(snapshot_ids) + snapshot_ids = list(snapshot_ids) # type: ignore[assignment] id_table = _escape_sqlite3_identifier(FTS_ID_TABLE) with get_connection() as cursor: - cursor.executemany( - f"DELETE FROM {id_table} WHERE snapshot_id={SQLITE_BIND}", - [snapshot_ids]) + try: + cursor.executemany( + f"DELETE FROM {id_table} WHERE snapshot_id={SQLITE_BIND}", + [snapshot_ids]) + except Exception as e: + _handle_query_exception(e) From 9b85f35b63c874fcc738aed1966490f397adaf0b Mon Sep 17 00:00:00 2001 From: Ross Williams Date: Mon, 16 Oct 2023 14:50:48 -0400 Subject: [PATCH 6/6] sqlite search: check SQLite version when indexing If creating the FTS5 tables fails due to a known version incompatibility, report the required version to the user. 
--- archivebox/search/backends/sqlite.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/archivebox/search/backends/sqlite.py b/archivebox/search/backends/sqlite.py index b4c61efb..2fee789a 100644 --- a/archivebox/search/backends/sqlite.py +++ b/archivebox/search/backends/sqlite.py @@ -75,12 +75,23 @@ def _create_tables(): with get_connection() as cursor: # Create a contentless-delete FTS5 table that indexes # but does not store the texts of snapshots - cursor.execute( - f"CREATE VIRTUAL TABLE {table}" - f" USING fts5({column}," - f" tokenize={tokenizers}," - " content='', contentless_delete=1);" - ) + try: + cursor.execute( + f"CREATE VIRTUAL TABLE {table}" + f" USING fts5({column}," + f" tokenize={tokenizers}," + " content='', contentless_delete=1);" + ) + except Exception as e: + msg = str(e) + if 'unrecognized option: "contentlessdelete"' in msg: + sqlite_version = getattr(sqlite3, "sqlite_version", "Unknown") + raise RuntimeError( + "SQLite full-text search requires SQLite >= 3.43.0;" + f" the running version is {sqlite_version}" + ) from e + else: + raise # Create a one-to-one mapping between ArchiveBox snapshot_id # and FTS5 rowid, because the column type of rowid can't be # customized.