From 0f7dba07dfe673d5915c1bfb344a24b4cb027e84 Mon Sep 17 00:00:00 2001
From: JDC
Date: Thu, 19 Nov 2020 23:39:28 -0500
Subject: [PATCH] feat: add search filter-type to list command

---
 archivebox/cli/archivebox_list.py |  2 +-
 archivebox/index/__init__.py      | 42 ++++++++++++++++++++++++++++++-
 2 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/archivebox/cli/archivebox_list.py b/archivebox/cli/archivebox_list.py
index 140810a6..3838cf60 100644
--- a/archivebox/cli/archivebox_list.py
+++ b/archivebox/cli/archivebox_list.py
@@ -98,7 +98,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
     parser.add_argument(
         '--filter-type',
         type=str,
-        choices=('exact', 'substring', 'domain', 'regex','tag'),
+        choices=('exact', 'substring', 'domain', 'regex', 'tag', 'search'),
         default='exact',
         help='Type of pattern matching to use when filtering URLs',
     )
diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py
index 3a066e18..34e2c5ff 100644
--- a/archivebox/index/__init__.py
+++ b/archivebox/index/__init__.py
@@ -51,6 +51,8 @@ from .sql import (
     write_sql_link_details,
 )
 
+from ..search import search_backend_enabled, query_search_index
+
 ### Link filtering and checking
 
 @enforce_types
@@ -365,7 +367,7 @@ LINK_FILTERS = {
 }
 
 @enforce_types
-def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type: str='exact') -> QuerySet:
+def q_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type: str='exact') -> QuerySet:
     q_filter = Q()
     for pattern in filter_patterns:
         try:
@@ -380,6 +382,44 @@ def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type
             raise SystemExit(2)
     return snapshots.filter(q_filter)
 
+@enforce_types
+def search_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type: str='search') -> QuerySet:
+    """Narrow snapshots to the ones the search backend returns for any of filter_patterns.
+
+    Prints an error and exits with status 2 when the search backend is not
+    enabled, or when querying it raises. filter_type is accepted but not used
+    by this function.
+    """
+    if not search_backend_enabled():
+        stderr()
+        stderr(
+            '[X] The search backend is not enabled',
+            color='red',
+        )
+        raise SystemExit(2)
+
+    # Accumulate the backend's results for every pattern with |=
+    qsearch = get_empty_snapshot_queryset()
+    for pattern in filter_patterns:
+        try:
+            qsearch |= query_search_index(pattern)
+        except Exception as err:
+            stderr()
+            stderr(
+                f'[X] The search backend threw an exception={err}:',
+                color='red',
+            )
+            raise SystemExit(2)
+
+    # Combine with the incoming queryset via & so the caller's selection is respected
+    return snapshots & qsearch
+
+@enforce_types
+def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type: str='exact') -> QuerySet:
+    """Dispatch to search_filter when filter_type is 'search', otherwise to q_filter."""
+    if filter_type == 'search':
+        return search_filter(snapshots, filter_patterns, filter_type)
+    return q_filter(snapshots, filter_patterns, filter_type)
+
 def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links without checking archive status or data directory validity"""