# search_index.py (3103B)
import math
import re
import statistics
import sys

from whoosh.index import open_dir
from whoosh.query import Every
from whoosh.qparser import MultifieldParser
from whoosh import highlight

from gus.lib.whoosh_extensions import GeminiFormatter, GeminiScorer
from gus.lib.misc import bytes2human

# Highlighter configured to rank fragments by score; get_highlight() asks it
# for the single best fragment of a hit's "content" field.
gemini_highlighter = highlight.Highlighter(
    formatter=GeminiFormatter(),
    fragmenter=highlight.ContextFragmenter(maxchars=160, surround=80),
    scorer=GeminiScorer(),
    order=highlight.SCORE,
)

# Content types for which get_highlight() produces an excerpt.
_HIGHLIGHTABLE_CONTENT_TYPES = ("text/plain", "text/gemini", "text/markdown")


def get_highlight(result):
    """Return the best highlighted excerpt for a search hit.

    Args:
        result: A search hit supporting ``in`` and ``[]`` access to its
            fields; it must expose "content_type" whenever it has "content".

    Returns:
        The top fragment of the hit's "content" field with every
        ``GeminiFormatter.between`` separator replaced by a newline, or an
        empty string when the hit has no "content" field or its content type
        is not one of the highlightable text types.
    """
    if "content" not in result:
        return ""
    if result["content_type"] not in _HIGHLIGHTABLE_CONTENT_TYPES:
        return ""
    excerpt = gemini_highlighter.highlight_hit(result, "content", top=1)
    return excerpt.replace(GeminiFormatter.between, "\n")


def main():
    """Print size statistics (mean, median, max) per content type.

    Opens the index stored in the "index" directory, collects the "size" of
    every document that has one, groups the sizes by "content_type", and
    prints a short report for each content type with at least 16 documents.
    """
    ix = open_dir("index")
    # ix.optimize()
    # if len(sys.argv) < 2:
    #     print("Please provide a search query...")
    #     return

    min_samples = 16  # content types with fewer documents are not reported
    size_format = "%(value).1f %(symbol)s"

    with ix.searcher() as searcher:
        query = Every("size")
        # limit=None requests every matching document instead of relying on
        # an arbitrary large cap that would silently truncate the results.
        results = searcher.search(query, limit=None)

        # Group document sizes by content type.
        size_lists = {}
        for result in results:
            size_lists.setdefault(result["content_type"], []).append(result["size"])

        for content_type, size_list in size_lists.items():
            if len(size_list) < min_samples:
                continue
            print("\n# {} ({})".format(content_type, len(size_list)))
            mean = bytes2human(statistics.mean(size_list), format=size_format)
            median = bytes2human(statistics.median(size_list), format=size_format)
            maximum = bytes2human(max(size_list), format=size_format)
            print("Mean : {:>8}".format(mean))
            print("Median : {:>8}".format(median))
            print("Max : {:>8}".format(maximum))

    # print("Searching index for: \"%s\"" % sys.argv[1])
    # ix = open_dir("index")
    # with ix.searcher() as searcher:
    #     query = MultifieldParser(["content", "url"], ix.schema).parse(sys.argv[1])

60 # results = searcher.search(query) 61 # render_results( 62 # sys.argv[1], 63 # len(results), 64 # [( 65 # result["indexed_at"], 66 # result.score, 67 # result["url"], 68 # get_highlight(result), 69 # ) for result in results] 70 # ) 71 72 73 def render_results(query, num_results, results): 74 print(" GUS") 75 print(" Gemini Universal Search") 76 print("==========================") 77 print("| You searched for: \"%s\"" % query) 78 print("| Number of hits: %s" % num_results) 79 print("==========================") 80 for i, result in enumerate(results): 81 if i > 0: 82 print() 83 print("=> %s" % result[2]) 84 if len(result[3]) > 0: 85 print("%s" % result[3]) 86 print("==========================") 87 print("Page 1 of %s (paging coming later)" % math.ceil(num_results / 10)) 88 89 90 if __name__ == "__main__": 91 main()