
Merge pull request #570 from ArchiveBox/sonic-search

Nick Sweeting authored 2020-12-05 18:22:17 -05:00, committed by GitHub
commit 8d103687d0
20 changed files with 406 additions and 12 deletions

View file

@ -46,7 +46,7 @@ RUN apt-get update -qq \
# Install apt dependencies
RUN apt-get update -qq \
&& apt-get install -qq -y --no-install-recommends \
wget curl chromium git ffmpeg youtube-dl \
wget curl chromium git ffmpeg youtube-dl ripgrep \
fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \
&& rm -rf /var/lib/apt/lists/*

View file

@ -1 +0,0 @@
pip_dist/archivebox.egg-info

View file

@ -98,7 +98,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
parser.add_argument(
'--filter-type',
type=str,
choices=('exact', 'substring', 'domain', 'regex','tag'),
choices=('exact', 'substring', 'domain', 'regex', 'tag', 'search'),
default='exact',
help='Type of pattern matching to use when filtering URLs',
)

View file

@ -91,7 +91,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
parser.add_argument(
'--filter-type',
type=str,
choices=('exact', 'substring', 'domain', 'regex'),
choices=('exact', 'substring', 'domain', 'regex', 'tag', 'search'),
default='exact',
help='Type of pattern matching to use when filtering URLs',
)

View file

@ -139,6 +139,18 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
'GIT_ARGS': {'type': list, 'default': ['--recursive']},
},
'SEARCH_BACKEND_CONFIG' : {
'USE_INDEXING_BACKEND': {'type': bool, 'default': True},
'USE_SEARCHING_BACKEND': {'type': bool, 'default': True},
'SEARCH_BACKEND_ENGINE': {'type': str, 'default': 'ripgrep'},
'SEARCH_BACKEND_HOST_NAME': {'type': str, 'default': 'localhost'},
'SEARCH_BACKEND_PORT': {'type': int, 'default': 1491},
'SEARCH_BACKEND_PASSWORD': {'type': str, 'default': 'SecretPassword'},
# SONIC
'SONIC_COLLECTION': {'type': str, 'default': 'archivebox'},
'SONIC_BUCKET': {'type': str, 'default': 'snapshots'},
},
'DEPENDENCY_CONFIG': {
'USE_CURL': {'type': bool, 'default': True},
'USE_WGET': {'type': bool, 'default': True},
@ -149,7 +161,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
'USE_CHROME': {'type': bool, 'default': True},
'USE_NODE': {'type': bool, 'default': True},
'USE_YOUTUBEDL': {'type': bool, 'default': True},
'CURL_BINARY': {'type': str, 'default': 'curl'},
'GIT_BINARY': {'type': str, 'default': 'git'},
'WGET_BINARY': {'type': str, 'default': 'wget'},
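
The new SEARCH_BACKEND_CONFIG block above is resolved like any other ArchiveBox config section, so its values become importable constants on archivebox.config (which is how the new search module reads them). A quick sanity check of the effective values, as a hedged sketch (the printed defaults are the ones declared above):

from archivebox.config import (
    USE_INDEXING_BACKEND,
    USE_SEARCHING_BACKEND,
    SEARCH_BACKEND_ENGINE,
    SEARCH_BACKEND_HOST_NAME,
    SEARCH_BACKEND_PORT,
)

# With no overrides this prints: True True ripgrep localhost 1491
print(USE_INDEXING_BACKEND, USE_SEARCHING_BACKEND, SEARCH_BACKEND_ENGINE,
      SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT)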

View file

@ -14,6 +14,9 @@ from django import forms
from core.models import Snapshot, Tag
from core.forms import AddLinkForm, TagField
from core.utils import get_icons
from core.mixins import SearchResultsAdminMixin
from index.html import snapshot_icons
from util import htmldecode, urldecode, ansi_to_html
from logging_util import printable_filesize
@ -82,7 +85,7 @@ class SnapshotAdminForm(forms.ModelForm):
return instance
class SnapshotAdmin(admin.ModelAdmin):
class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
list_display = ('added', 'title_str', 'url_str', 'files', 'size')
sort_fields = ('title_str', 'url_str', 'added')
readonly_fields = ('id', 'url', 'timestamp', 'num_outputs', 'is_archived', 'url_hash', 'added', 'updated')

archivebox/core/mixins.py Normal file (+23 lines)
View file

@ -0,0 +1,23 @@
from django.contrib import messages
from archivebox.search import query_search_index
class SearchResultsAdminMixin(object):
def get_search_results(self, request, queryset, search_term):
''' Enhances the search queryset with results from the search backend.
'''
qs, use_distinct = \
super(SearchResultsAdminMixin, self).get_search_results(
request, queryset, search_term)
search_term = search_term.strip()
if not search_term:
return qs, use_distinct
try:
qsearch = query_search_index(search_term)
except Exception as err:
messages.add_message(request, messages.WARNING, f'Error from the search backend, only showing results from default admin search fields - Error: {err}')
else:
qs |= qsearch
finally:
return qs, use_distinct

View file

@ -5,10 +5,11 @@ import uuid
from django.db import models, transaction
from django.utils.functional import cached_property
from django.utils.text import slugify
from django.db.models import Case, When, Value, IntegerField
from ..util import parse_date
from ..index.schema import Link
from ..extractors import get_default_archive_methods
from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE
EXTRACTORS = [(extractor[0], extractor[0]) for extractor in get_default_archive_methods()]
STATUS_CHOICES = [
@ -91,7 +92,7 @@ class Snapshot(models.Model):
return {
key: getattr(self, key)
if key != 'tags' else self.tags_str()
for key in args
for key in args
}
def as_link(self) -> Link:
@ -100,7 +101,7 @@ class Snapshot(models.Model):
def as_link_with_details(self) -> Link:
from ..index import load_link_details
return load_link_details(self.as_link())
def tags_str(self) -> str:
return ','.join(self.tags.order_by('name').values_list('name', flat=True))
@ -157,7 +158,15 @@ class Snapshot(models.Model):
self.tags.clear()
self.tags.add(*tags_id)
class ArchiveResultManager(models.Manager):
def indexable(self, sorted: bool = True):
INDEXABLE_METHODS = [ r[0] for r in ARCHIVE_METHODS_INDEXING_PRECEDENCE ]
qs = self.get_queryset().filter(extractor__in=INDEXABLE_METHODS,status='succeeded')
if sorted:
precedence = [ When(extractor=method, then=Value(precedence)) for method, precedence in ARCHIVE_METHODS_INDEXING_PRECEDENCE ]
qs = qs.annotate(indexing_precedence=Case(*precedence, default=Value(1000),output_field=IntegerField())).order_by('indexing_precedence')
return qs
class ArchiveResult(models.Model):
snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)
cmd = models.JSONField()
@ -169,5 +178,7 @@ class ArchiveResult(models.Model):
status = models.CharField(max_length=16, choices=STATUS_CHOICES)
extractor = models.CharField(choices=EXTRACTORS, max_length=32)
objects = ArchiveResultManager()
def __str__(self):
return self.extractor
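
The ArchiveResultManager.indexable() helper above filters to succeeded runs of the indexable extractors and, through the Case/When annotation, orders them by ARCHIVE_METHODS_INDEXING_PRECEDENCE (readability first, wget last). A minimal usage sketch, assuming a configured Django environment with the core app loaded:

from core.models import ArchiveResult

# Succeeded readability/singlefile/dom/wget results, best-for-indexing first:
for result in ArchiveResult.objects.indexable():
    print(result.snapshot_id, result.extractor)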

View file

@ -23,6 +23,7 @@ from ..logging_util import (
log_archive_method_started,
log_archive_method_finished,
)
from ..search import write_search_index
from .title import should_save_title, save_title
from .favicon import should_save_favicon, save_favicon
@ -38,6 +39,7 @@ from .media import should_save_media, save_media
from .archive_org import should_save_archive_dot_org, save_archive_dot_org
from .headers import should_save_headers, save_headers
def get_default_archive_methods():
return [
('title', should_save_title, save_title),
@ -55,6 +57,8 @@ def get_default_archive_methods():
('archive_org', should_save_archive_dot_org, save_archive_dot_org),
]
ARCHIVE_METHODS_INDEXING_PRECEDENCE = [('readability', 1), ('singlefile', 2), ('dom', 3), ('wget', 4)]
@enforce_types
def ignore_methods(to_ignore: List[str]):
ARCHIVE_METHODS = get_default_archive_methods()
@ -107,6 +111,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
link.history[method_name].append(result)
stats[result.status] += 1
write_search_index(link=link, texts=result.index_texts)
log_archive_method_finished(result)
if not skip_index:
ArchiveResult.objects.create(snapshot=snapshot, extractor=method_name, cmd=result.cmd, cmd_version=result.cmd_version,

View file

@ -71,6 +71,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
CURL_BINARY,
link.url
]
readability_content = None
timer = TimedProgress(timeout, prefix=' ')
try:
document = get_html(link, out_dir)
@ -86,8 +87,9 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
result = run(cmd, cwd=out_dir, timeout=timeout)
result_json = json.loads(result.stdout)
output_folder.mkdir(exist_ok=True)
readability_content = result_json.pop("textContent")
atomic_write(str(output_folder / "content.html"), result_json.pop("content"))
atomic_write(str(output_folder / "content.txt"), result_json.pop("textContent"))
atomic_write(str(output_folder / "content.txt"), readability_content)
atomic_write(str(output_folder / "article.json"), result_json)
# parse out number of files downloaded from last line of stderr:
@ -117,5 +119,6 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
cmd_version=READABILITY_VERSION,
output=output,
status=status,
**timer.stats,
index_texts= [readability_content] if readability_content else [],
**timer.stats,
)

View file

@ -51,6 +51,8 @@ from .sql import (
write_sql_link_details,
)
from ..search import search_backend_enabled, query_search_index
### Link filtering and checking
@enforce_types
@ -365,7 +367,7 @@ LINK_FILTERS = {
}
@enforce_types
def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type: str='exact') -> QuerySet:
def q_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type: str='exact') -> QuerySet:
q_filter = Q()
for pattern in filter_patterns:
try:
@ -380,6 +382,31 @@ def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type
raise SystemExit(2)
return snapshots.filter(q_filter)
def search_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type: str='search') -> QuerySet:
if not search_backend_enabled():
stderr()
stderr(
'[X] The search backend is not enabled, set config.USE_SEARCHING_BACKEND = True',
color='red',
)
raise SystemExit(2)
qsearch = get_empty_snapshot_queryset()
for pattern in filter_patterns:
try:
qsearch |= query_search_index(pattern)
except:
raise SystemExit(2)
return snapshots & qsearch
@enforce_types
def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type: str='exact') -> QuerySet:
if filter_type != 'search':
return q_filter(snapshots, filter_patterns, filter_type)
else:
return search_filter(snapshots, filter_patterns, filter_type)
def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""indexed links without checking archive status or data directory validity"""

View file

@ -39,6 +39,7 @@ class ArchiveResult:
status: str
start_ts: datetime
end_ts: datetime
index_texts: Union[List[str], None] = None
schema: str = 'ArchiveResult'
def __post_init__(self):

View file

@ -115,6 +115,7 @@ from .logging_util import (
printable_dependency_version,
)
from .search import flush_search_index, index_links
ALLOWED_IN_OUTPUT_DIR = {
'lost+found',
@ -664,6 +665,7 @@ def remove(filter_str: Optional[str]=None,
to_remove = snapshots.count()
flush_search_index(snapshots=snapshots)
remove_from_sql_main_index(snapshots=snapshots, out_dir=out_dir)
all_snapshots = load_main_index(out_dir=out_dir)
log_removal_finished(all_snapshots.count(), to_remove)
@ -709,6 +711,7 @@ def update(resume: Optional[float]=None,
if index_only:
for link in all_links:
write_link_details(link, out_dir=out_dir, skip_sql_index=True)
index_links(all_links, out_dir=out_dir)
return all_links
# Step 2: Run the archive methods for each link
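
Together these two hooks keep the search index consistent with the SQL index: remove flushes deleted snapshots out of the backend, and an index-only update re-feeds the already-extracted text for every link. A hedged sketch of triggering a full reindex from Python (the CLI equivalent should be archivebox update --index-only; the other parameters are left at their defaults):

from archivebox.main import update

# Rebuild the search index from existing archive output without
# re-running any extractors (the index_only branch shown above):
update(index_only=True)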

View file

@ -0,0 +1,110 @@
from typing import List, Union
from pathlib import Path
from importlib import import_module
from django.db.models import QuerySet
from archivebox.index.schema import Link
from archivebox.util import enforce_types
from archivebox.config import setup_django,stderr, OUTPUT_DIR, USE_INDEXING_BACKEND, USE_SEARCHING_BACKEND, SEARCH_BACKEND_ENGINE
from .utils import get_indexable_content, log_index_started
def indexing_enabled():
return USE_INDEXING_BACKEND
def search_backend_enabled():
return USE_SEARCHING_BACKEND
def get_backend():
return f'search.backends.{SEARCH_BACKEND_ENGINE}'
def import_backend():
backend_string = get_backend()
try:
backend = import_module(backend_string)
except Exception as err:
raise Exception("Could not load '%s' as a backend: %s" % (backend_string, err))
return backend
@enforce_types
def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir: Path=OUTPUT_DIR, skip_text_index: bool=False) -> None:
if not indexing_enabled():
return
if not skip_text_index and texts:
setup_django(out_dir, check_db=True)
from core.models import Snapshot
snap = Snapshot.objects.filter(url=link.url).first()
backend = import_backend()
if snap:
try:
backend.index(snapshot_id=str(snap.id), texts=texts)
except Exception as err:
stderr()
stderr(
f'[X] The search backend threw an exception={err}:',
color='red',
)
@enforce_types
def query_search_index(query: str, out_dir: Path=OUTPUT_DIR) -> QuerySet:
setup_django(out_dir, check_db=True)
from core.models import Snapshot
if search_backend_enabled():
backend = import_backend()
try:
snapshot_ids = backend.search(query)
except Exception as err:
stderr()
stderr(
f'[X] The search backend threw an exception={err}:',
color='red',
)
raise
else:
# TODO preserve ordering from backend
qsearch = Snapshot.objects.filter(pk__in=snapshot_ids)
return qsearch
return Snapshot.objects.none()
@enforce_types
def flush_search_index(snapshots: QuerySet):
if not indexing_enabled() or not snapshots:
return
backend = import_backend()
snapshot_ids=(str(pk) for pk in snapshots.values_list('pk',flat=True))
try:
backend.flush(snapshot_ids)
except Exception as err:
stderr()
stderr(
f'[X] The search backend threw an exception={err}:',
color='red',
)
@enforce_types
def index_links(links: Union[List[Link],None], out_dir: Path=OUTPUT_DIR):
if not links:
return
setup_django(out_dir=out_dir, check_db=True)
from core.models import Snapshot, ArchiveResult
for link in links:
if snap := Snapshot.objects.filter(url=link.url).first():
results = ArchiveResult.objects.indexable().filter(snapshot=snap)
log_index_started(link.url)
try:
texts = get_indexable_content(results)
except Exception as err:
stderr()
stderr(
f'[X] An Exception ocurred reading the indexable content={err}:',
color='red',
)
else:
write_search_index(link, texts, out_dir=out_dir)
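
import_backend() only requires that search.backends.{SEARCH_BACKEND_ENGINE} resolve to a module exposing index(), search(), and flush() with the signatures used above. As a hedged illustration of that contract, a hypothetical in-memory backend (not part of this PR) could look like:

# hypothetical archivebox/search/backends/memory.py
from typing import List, Generator

from archivebox.util import enforce_types

_INDEX = {}  # naive in-memory store: snapshot_id -> lowercased text

@enforce_types
def index(snapshot_id: str, texts: List[str]):
    _INDEX[snapshot_id] = ' '.join(texts).lower()

@enforce_types
def search(text: str) -> List[str]:
    term = text.lower()
    return [snapshot_id for snapshot_id, body in _INDEX.items() if term in body]

@enforce_types
def flush(snapshot_ids: Generator[str, None, None]):
    for snapshot_id in snapshot_ids:
        _INDEX.pop(snapshot_id, None)

Pointing SEARCH_BACKEND_ENGINE at 'memory' would then make get_backend() resolve to search.backends.memory, with no other changes needed.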

View file

@ -0,0 +1,47 @@
import re
from subprocess import run, PIPE, DEVNULL
from typing import List, Generator
from archivebox.config import setup_django, ARCHIVE_DIR
from archivebox.util import enforce_types
RG_IGNORE_EXTENSIONS = ('css','js','orig','svg')
RG_ADD_TYPE = '--type-add'
RG_IGNORE_ARGUMENTS = f"ignore:*.{{{','.join(RG_IGNORE_EXTENSIONS)}}}"
RG_DEFAULT_ARGUMENTS = "-ilTignore" # Case insensitive(i), matching files results(l)
RG_REGEX_ARGUMENT = '-e'
TIMESTAMP_REGEX = r'\/([\d]+\.[\d]+)\/'
ts_regex = re.compile(TIMESTAMP_REGEX)
@enforce_types
def index(snapshot_id: str, texts: List[str]):
return
@enforce_types
def flush(snapshot_ids: Generator[str, None, None]):
return
@enforce_types
def search(text: str) -> List[str]:
is_rg_installed = run(['which', 'rg'], stdout=DEVNULL, stderr=DEVNULL)
if is_rg_installed.returncode:
raise Exception("ripgrep binary not found, install ripgrep to use this search backend")
setup_django(check_db=True)
from core.models import Snapshot
rg_cmd = ['rg', RG_ADD_TYPE, RG_IGNORE_ARGUMENTS, RG_DEFAULT_ARGUMENTS, RG_REGEX_ARGUMENT, text, str(ARCHIVE_DIR)]
rg = run(rg_cmd, stdout=PIPE, stderr=PIPE, timeout=60)
file_paths = [p.decode() for p in rg.stdout.splitlines()]
timestamps = set()
for path in file_paths:
if ts := ts_regex.findall(path):
timestamps.add(ts[0])
snap_ids = [str(id) for id in Snapshot.objects.filter(timestamp__in=timestamps).values_list('pk', flat=True)]
return snap_ids
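
The ripgrep backend stores nothing at index time; search() simply greps ARCHIVE_DIR for matching files and maps each hit back to a Snapshot through the timestamp directory embedded in its path. A quick illustration of that path-to-timestamp step (the paths are made up):

import re

ts_regex = re.compile(r'\/([\d]+\.[\d]+)\/')

hits = [
    '/data/archive/1607007460.0/readability/content.txt',
    '/data/archive/1607007460.0/wget/example.com/index.html',
    '/data/archive/1607009999.5/singlefile.html',
]

timestamps = {ts[0] for path in hits if (ts := ts_regex.findall(path))}
print(timestamps)  # {'1607007460.0', '1607009999.5'} (order may vary)
# These timestamps are then matched with Snapshot.objects.filter(timestamp__in=...)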

View file

@ -0,0 +1,28 @@
from typing import List, Generator
from sonic import IngestClient, SearchClient
from archivebox.util import enforce_types
from archivebox.config import SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD, SONIC_BUCKET, SONIC_COLLECTION
MAX_SONIC_TEXT_LENGTH = 20000
@enforce_types
def index(snapshot_id: str, texts: List[str]):
with IngestClient(SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD) as ingestcl:
for text in texts:
chunks = [text[i:i+MAX_SONIC_TEXT_LENGTH] for i in range(0, len(text), MAX_SONIC_TEXT_LENGTH)]
for chunk in chunks:
ingestcl.push(SONIC_COLLECTION, SONIC_BUCKET, snapshot_id, str(chunk))
@enforce_types
def search(text: str) -> List[str]:
with SearchClient(SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD) as querycl:
snap_ids = querycl.query(SONIC_COLLECTION, SONIC_BUCKET, text)
return snap_ids
@enforce_types
def flush(snapshot_ids: Generator[str, None, None]):
with IngestClient(SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD) as ingestcl:
for id in snapshot_ids:
ingestcl.flush_object(SONIC_COLLECTION, SONIC_BUCKET, str(id))
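
Sonic caps how much text a single push can carry, so index() splits each text into MAX_SONIC_TEXT_LENGTH-sized slices and pushes every chunk under the same snapshot_id, which means a query matching any chunk still resolves back to that snapshot. The chunking itself is plain slicing, roughly:

MAX_SONIC_TEXT_LENGTH = 20000

text = 'x' * 45000  # e.g. the readability textContent of a long article
chunks = [text[i:i + MAX_SONIC_TEXT_LENGTH]
          for i in range(0, len(text), MAX_SONIC_TEXT_LENGTH)]
print([len(c) for c in chunks])  # [20000, 20000, 5000]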

View file

@ -0,0 +1,44 @@
from django.db.models import QuerySet
from archivebox.util import enforce_types
from archivebox.config import ANSI
def log_index_started(url):
print('{green}[*] Indexing url: {} in the search index {reset}'.format(url, **ANSI))
print( )
def get_file_result_content(res, extra_path, use_pwd=False):
if use_pwd:
fpath = f'{res.pwd}/{res.output}'
else:
fpath = f'{res.output}'
if extra_path:
fpath = f'{fpath}/{extra_path}'
with open(fpath, 'r') as file:
data = file.read()
if data:
return [data]
return []
# This should be abstracted by a plugin interface for extractors
@enforce_types
def get_indexable_content(results: QuerySet):
if not results:
return []
# Only use the first method available
res, method = results.first(), results.first().extractor
if method not in ('readability', 'singlefile', 'dom', 'wget'):
return []
# This should come from a plugin interface
if method == 'readability':
return get_file_result_content(res, 'content.txt')
elif method == 'singlefile':
return get_file_result_content(res, '')
elif method == 'dom':
return get_file_result_content(res,'',use_pwd=True)
elif method == 'wget':
return get_file_result_content(res,'',use_pwd=True)

View file

@ -23,6 +23,7 @@ services:
- SHOW_PROGRESS=False
volumes:
- ./data:/data
# Optional Addons: tweak these examples as needed for your specific use case
@ -73,3 +74,14 @@ services:
# volumes:
# ./data:/archivebox
# ./data/wayback:/webarchive
# Example: Run sonic search backend
# sonic:
# image: valeriansaliou/sonic:v1.3.0
# ports:
# - 1491:1491
# environment:
# - SEARCH_BACKEND_PASSWORD=SecretPassword
# volumes:
# - ./etc/sonic/config.cfg:/etc/sonic.cfg
# - ./data:/var/lib/sonic/store/

etc/sonic/config.cfg Normal file (+66 lines)
View file

@ -0,0 +1,66 @@
# Sonic
# Fast, lightweight and schema-less search backend
# Configuration file
# Example: https://github.com/valeriansaliou/sonic/blob/master/config.cfg
[server]
log_level = "debug"
[channel]
inet = "0.0.0.0:1491"
tcp_timeout = 300
auth_password = "${env.SEARCH_BACKEND_PASSWORD}"
[channel.search]
query_limit_default = 65535
query_limit_maximum = 65535
query_alternates_try = 10
suggest_limit_default = 5
suggest_limit_maximum = 20
[store]
[store.kv]
path = "/var/lib/sonic/store/kv/"
retain_word_objects = 100000
[store.kv.pool]
inactive_after = 1800
[store.kv.database]
flush_after = 900
compress = true
parallelism = 2
max_files = 100
max_compactions = 1
max_flushes = 1
write_buffer = 16384
write_ahead_log = true
[store.fst]
path = "/var/lib/sonic/store/fst/"
[store.fst.pool]
inactive_after = 300
[store.fst.graph]
consolidate_after = 180
max_size = 2048
max_words = 250000