search.py (4065B)
from urllib.parse import quote
import pathlib
import logging

import whoosh.qparser
import whoosh.highlight
from whoosh.analysis import FancyAnalyzer
from whoosh.fields import Schema, TEXT, DATETIME, STORED, ID, NUMERIC
from whoosh.filedb.filestore import FileStorage

from gus.lib.whoosh_extensions import UrlAnalyzer, GeminiFormatter, GeminiScorer


class Index:
    """Full-text search index over crawled Gemini pages, backed by Whoosh.

    Wraps index creation/opening, a lazily created "rolling" writer for
    incremental updates, query parsing, result highlighting, paged search,
    and spelling suggestions.
    """

    def __init__(self, index_dir, should_run_destructive=False):
        """Open (or, when destructive, recreate) the index under *index_dir*.

        Args:
            index_dir: Directory holding the Whoosh index files.
            should_run_destructive: When True, create the directory if
                missing and build a brand-new index (discarding any existing
                one); when False, open the index that is already there.
        """
        # mmap is disabled so segment files can be swapped out while open.
        index_storage = FileStorage(index_dir, supports_mmap=False)
        self._destructive = should_run_destructive

        if self._destructive:
            pathlib.Path(index_dir).mkdir(parents=True, exist_ok=True)
            self._index = self._create(index_storage)
        else:
            self._index = index_storage.open_index()

        self._searcher = self._index.searcher()

        # OrGroup.factory(0.99) ranks documents matching more query terms
        # higher while still admitting partial matches.
        self._query_parser = whoosh.qparser.MultifieldParser(
            ["content", "url", "prompt"],
            self._index.schema,
            group=whoosh.qparser.OrGroup.factory(0.99),
        )
        self._query_parser.add_plugin(whoosh.qparser.RegexPlugin())
        self._query_parser.add_plugin(whoosh.qparser.GtLtPlugin())
        self._query_parser.remove_plugin_class(whoosh.qparser.WildcardPlugin)
        self._query_parser.remove_plugin_class(whoosh.qparser.BoostPlugin)
        self._query_parser.remove_plugin_class(whoosh.qparser.RangePlugin)

        # Gemini-flavored highlighting: short context fragments rendered by
        # the project's gemtext formatter, best-scoring fragments first.
        self._highlighter = whoosh.highlight.Highlighter(
            formatter=GeminiFormatter(),
            fragmenter=whoosh.highlight.ContextFragmenter(maxchars=160, surround=80),
            scorer=GeminiScorer(),
            order=whoosh.highlight.SCORE,
        )

        # Created on first write by _rolling_writer(); committed in close().
        self._writer = None

    def _create(self, index_storage):
        """Build a fresh index with the GUS schema and return it.

        Args:
            index_storage: The FileStorage in which to create the index.

        Returns:
            The newly created whoosh index object.
        """
        schema = Schema(
            url_id=ID(unique=True, stored=True),
            url=TEXT(field_boost=2.0, stored=True, analyzer=UrlAnalyzer()),
            fetchable_url=STORED(),
            domain=TEXT(analyzer=UrlAnalyzer()),
            port=NUMERIC(int, 32, signed=False, stored=True),
            content_type=TEXT(stored=True),
            charset=ID(stored=True),
            lang=ID(stored=True),
            # spelling=True keeps the word graph used by suggestions().
            content=TEXT(analyzer=FancyAnalyzer(), spelling=True, stored=True),
            prompt=TEXT(analyzer=FancyAnalyzer(), stored=True),
            size=NUMERIC(
                int,
                # this means GUS will have problems indexing responses over ~2GB
                32,
                signed=False,
                stored=True,
            ),
            backlink_count=NUMERIC(
                int, 16, signed=False, stored=True,  # num bits, so max value is 65k
            ),
            indexed_at=DATETIME(stored=True),
        )
        return index_storage.create_index(schema)

    def close(self):
        """Commit any pending writes and release index resources.

        Safe to call more than once: the writer reference is cleared after
        the commit (commit() invalidates a whoosh writer, so committing it a
        second time would raise). The searcher is closed before the index so
        its open segment files are released.
        """
        if self._writer is not None:
            self._writer.commit()
            self._writer = None
        self._searcher.close()
        self._index.close()

    def _rolling_writer(self):
        """Return the shared writer, creating it on first use.

        A single long-lived writer batches all updates into one commit
        (performed in close()). Multi-segment parallel writing is only
        enabled for destructive (from-scratch) runs.
        """
        if not self._writer:
            self._writer = self._index.writer(limitmb=1024, procs=3, multisegment=self._destructive)
        return self._writer

    def add_document(self, document):
        """Add *document* (a dict of schema fields), replacing any existing
        document with the same unique url_id."""
        self._rolling_writer().update_document(**document)

    def delete_by_term(self, key, val):
        """Delete every document whose field *key* contains term *val*."""
        self._rolling_writer().delete_by_term(key, val, searcher=None)

    def parse_query(self, query):
        """Parse a raw user query string into a whoosh query object."""
        return self._query_parser.parse(query)

    def highlight(self, result):
        """Return a gemtext-formatted highlight for a search hit.

        Falls back to an empty string when the hit has no stored content
        (e.g. binary responses indexed without text).
        """
        if "content" in result:
            return self._highlighter.highlight_hit(result, "content", top=1)
        else:
            return ""

    def search(self, query, pagenr, pagelen=10):
        """Run *query* and return page *pagenr* of results (pagelen per page)."""
        return self._searcher.search_page(query, pagenr, pagelen)

    def suggestions(self, query):
        """Return spelling suggestions for each word of *query*.

        Each suggestion is a dict with the raw word and a URL-quoted form
        suitable for building a search link.
        """
        suggestions = []
        corrector = self._searcher.corrector("content")
        # split() (no argument) collapses runs of whitespace and drops empty
        # tokens, so "foo  bar" never asks the corrector about "".
        for query_part in query.split():
            query_part_suggestions = corrector.suggest(query_part, limit=3)
            suggestions.extend(
                {"raw": suggestion, "quoted": quote(suggestion)}
                for suggestion in query_part_suggestions
            )
        return suggestions