"""Search backend that greps the archive directory with ripgrep (``rg``).

Nothing is indexed ahead of time: ``index`` and ``flush`` do nothing, and
``search`` runs ``rg`` over ``ARCHIVE_DIR`` on every call.
"""

import re
import shutil
from subprocess import run, PIPE, DEVNULL
from typing import List, Generator

from archivebox.config import setup_django, ARCHIVE_DIR, ARCHIVE_DIR_NAME
from archivebox.util import enforce_types

# Combined short flags handed to rg:
#   -i  case-insensitive matching
#   -l  print only the paths of files that contain a match
#   -t  restrict the search to a file type (the type name is the next argument)
DEFAULT_ARGUMENTS = '-ilt'
# File type consumed by the trailing ``-t`` in DEFAULT_ARGUMENTS.
DEFAULT_EXTENSIONS = 'html'
# Flag that introduces the pattern; passing the pattern via -e keeps a query
# that starts with '-' from being parsed as an option.
REGEX_ARGUMENT = '-e'

# Matches a "<digits>.<digits>" path component enclosed in slashes and
# captures it; search() treats the captured value as a snapshot timestamp.
TIMESTAMP_REGEX = r'\/([\d]+\.[\d]+)\/'

ts_regex = re.compile(TIMESTAMP_REGEX)


@enforce_types
def index(snapshot_id: str, texts: List[str]):
    """Do nothing and return ``None``.

    This backend searches the archived files directly in ``search``, so there
    is no index to update.
    """
    return


@enforce_types
def flush(snapshot_ids: Generator[str, None, None]):
    """Do nothing and return ``None``.

    This backend keeps no index, so there is nothing to remove.
    """
    return


@enforce_types
def search(text: str) -> List[str]:
    """Return the primary keys of snapshots whose archived HTML matches *text*.

    ``rg`` is run over ``ARCHIVE_DIR`` with *text* as the pattern. Each
    reported file path is scanned for a timestamp directory component, and the
    collected timestamps are resolved to ``Snapshot`` rows.

    Args:
        text: Pattern handed to ``rg`` through ``-e``.

    Returns:
        Snapshot primary keys converted to ``str``. The list is empty when no
        matching path contains a timestamp.

    Raises:
        Exception: If no ``rg`` executable is found on ``PATH``.
        subprocess.TimeoutExpired: If ``rg`` runs longer than 60 seconds.

    Note:
        The exit status and stderr of ``rg`` are not inspected, so errors
        reported by ``rg`` itself are not surfaced to the caller.
    """
    # shutil.which avoids spawning the external `which` program, which is not
    # available on every platform and would fail with FileNotFoundError.
    if shutil.which('rg') is None:
        raise Exception("rg binary not found, install ripgrep to use this backend")

    setup_django(check_db=True)
    # Imported here rather than at module level so that it happens after
    # setup_django() has run.
    from core.models import Snapshot

    rg_cmd = [
        'rg',
        DEFAULT_ARGUMENTS,
        DEFAULT_EXTENSIONS,
        REGEX_ARGUMENT,
        text,
        str(ARCHIVE_DIR),
    ]
    rg = run(rg_cmd, stdout=PIPE, stderr=PIPE, timeout=60)

    archive_dir_name = str(ARCHIVE_DIR_NAME)
    timestamps = set()
    for raw_path in rg.stdout.splitlines():
        # Remove the archive directory name from the path before looking for
        # the timestamp component.
        path = raw_path.decode().replace(archive_dir_name, '')
        if match := ts_regex.search(path):
            timestamps.add(match.group(1))

    matching_pks = Snapshot.objects.filter(
        timestamp__in=timestamps,
    ).values_list('pk', flat=True)
    return [str(pk) for pk in matching_pks]