search.py (4065B)
from urllib.parse import quote
import pathlib
import logging

import whoosh.qparser
import whoosh.highlight
from whoosh.analysis import FancyAnalyzer
from whoosh.fields import Schema, TEXT, DATETIME, STORED, ID, NUMERIC
from whoosh.filedb.filestore import FileStorage

from gus.lib.whoosh_extensions import UrlAnalyzer, GeminiFormatter, GeminiScorer


class Index:
    """Full-text search index over crawled Gemini pages, backed by Whoosh.

    Wraps index creation/opening, a lazily created "rolling" writer for
    incremental updates, query parsing, result highlighting, paged search,
    and spelling suggestions.
    """

    def __init__(self, index_dir, should_run_destructive=False):
        """Open (or, when destructive, recreate) the index under *index_dir*.

        Args:
            index_dir: Directory holding the Whoosh index files.
            should_run_destructive: When True, create the directory if
                missing and build a brand-new index (discarding any existing
                one); when False, open the index that is already there.
        """
        # mmap is disabled so segment files can be swapped out while open.
        index_storage = FileStorage(index_dir, supports_mmap=False)
        self._destructive = should_run_destructive

        if self._destructive:
            pathlib.Path(index_dir).mkdir(parents=True, exist_ok=True)
            self._index = self._create(index_storage)
        else:
            self._index = index_storage.open_index()

        self._searcher = self._index.searcher()

        # OrGroup.factory(0.99) ranks documents matching more query terms
        # higher while still admitting partial matches.
        self._query_parser = whoosh.qparser.MultifieldParser(
            ["content", "url", "prompt"],
            self._index.schema,
            group=whoosh.qparser.OrGroup.factory(0.99),
        )
        self._query_parser.add_plugin(whoosh.qparser.RegexPlugin())
        self._query_parser.add_plugin(whoosh.qparser.GtLtPlugin())
        self._query_parser.remove_plugin_class(whoosh.qparser.WildcardPlugin)
        self._query_parser.remove_plugin_class(whoosh.qparser.BoostPlugin)
        self._query_parser.remove_plugin_class(whoosh.qparser.RangePlugin)

        # Gemini-flavored highlighting: short context fragments rendered by
        # the project's gemtext formatter, best-scoring fragments first.
        self._highlighter = whoosh.highlight.Highlighter(
            formatter=GeminiFormatter(),
            fragmenter=whoosh.highlight.ContextFragmenter(maxchars=160, surround=80),
            scorer=GeminiScorer(),
            order=whoosh.highlight.SCORE,
        )

        # Created on first write by _rolling_writer(); committed in close().
        self._writer = None

    def _create(self, index_storage):
        """Build a fresh index with the GUS schema and return it.

        Args:
            index_storage: The FileStorage in which to create the index.

        Returns:
            The newly created whoosh index object.
        """
        schema = Schema(
            url_id=ID(unique=True, stored=True),
            url=TEXT(field_boost=2.0, stored=True, analyzer=UrlAnalyzer()),
            fetchable_url=STORED(),
            domain=TEXT(analyzer=UrlAnalyzer()),
            port=NUMERIC(int, 32, signed=False, stored=True),
            content_type=TEXT(stored=True),
            charset=ID(stored=True),
            lang=ID(stored=True),
            # spelling=True keeps the word graph used by suggestions().
            content=TEXT(analyzer=FancyAnalyzer(), spelling=True, stored=True),
            prompt=TEXT(analyzer=FancyAnalyzer(), stored=True),
            size=NUMERIC(
                int,
                # this means GUS will have problems indexing responses over ~2GB
                32,
                signed=False,
                stored=True,
            ),
            backlink_count=NUMERIC(
                int, 16, signed=False, stored=True,  # num bits, so max value is 65k
            ),
            indexed_at=DATETIME(stored=True),
        )
        return index_storage.create_index(schema)

    def close(self):
        """Commit any pending writes and release index resources.

        Safe to call more than once: the writer reference is cleared after
        the commit (commit() invalidates a whoosh writer, so committing it a
        second time would raise). The searcher is closed before the index so
        its open segment files are released.
        """
        if self._writer is not None:
            self._writer.commit()
            self._writer = None
        self._searcher.close()
        self._index.close()

    def _rolling_writer(self):
        """Return the shared writer, creating it on first use.

        A single long-lived writer batches all updates into one commit
        (performed in close()). Multi-segment parallel writing is only
        enabled for destructive (from-scratch) runs.
        """
        if not self._writer:
            self._writer = self._index.writer(limitmb=1024, procs=3, multisegment=self._destructive)
        return self._writer

    def add_document(self, document):
        """Add *document* (a dict of schema fields), replacing any existing
        document with the same unique url_id."""
        self._rolling_writer().update_document(**document)

    def delete_by_term(self, key, val):
        """Delete every document whose field *key* contains term *val*."""
        self._rolling_writer().delete_by_term(key, val, searcher=None)

    def parse_query(self, query):
        """Parse a raw user query string into a whoosh query object."""
        return self._query_parser.parse(query)

    def highlight(self, result):
        """Return a gemtext-formatted highlight for a search hit.

        Falls back to an empty string when the hit has no stored content
        (e.g. binary responses indexed without text).
        """
        if "content" in result:
            return self._highlighter.highlight_hit(result, "content", top=1)
        else:
            return ""

    def search(self, query, pagenr, pagelen=10):
        """Run *query* and return page *pagenr* of results (pagelen per page)."""
        return self._searcher.search_page(query, pagenr, pagelen)

    def suggestions(self, query):
        """Return spelling suggestions for each word of *query*.

        Each suggestion is a dict with the raw word and a URL-quoted form
        suitable for building a search link.
        """
        suggestions = []
        corrector = self._searcher.corrector("content")
        # split() (no argument) collapses runs of whitespace and drops empty
        # tokens, so "foo  bar" never asks the corrector about "".
        for query_part in query.split():
            query_part_suggestions = corrector.suggest(query_part, limit=3)
            suggestions.extend(
                {"raw": suggestion, "quoted": quote(suggestion)}
                for suggestion in query_part_suggestions
            )
        return suggestions