geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

search_index.py (3103B)


      1 import math
      2 import re
      3 import statistics
      4 import sys
      5 
      6 from whoosh.index import open_dir
      7 from whoosh.query import Every
      8 from whoosh.qparser import MultifieldParser
      9 from whoosh import highlight
     10 
     11 from gus.lib.whoosh_extensions import GeminiFormatter, GeminiScorer
     12 from gus.lib.misc import bytes2human
     13 
# Shared highlighter used by get_highlight() to render excerpts of search hits.
# Fragments come from a ContextFragmenter (maxchars=160, surround=80), are
# scored by the project's GeminiScorer, rendered by GeminiFormatter, and are
# ordered by score (highlight.SCORE).
gemini_highlighter = highlight.Highlighter(
    formatter=GeminiFormatter(),
    fragmenter=highlight.ContextFragmenter(maxchars=160, surround=80),
    scorer=GeminiScorer(),
    order=highlight.SCORE,
)
     20 
     21 def get_highlight(result):
     22     if "content" not in result:
     23         return ""
     24     if result["content_type"] not in ["text/plain", "text/gemini", "text/markdown"]:
     25         return ""
     26     return gemini_highlighter.highlight_hit(result, "content", top=1).replace(GeminiFormatter.between, "\n")
     27 
     28 
     29 def main():
     30     ix = open_dir("index")
     31     # ix.optimize()
     32     # if len(sys.argv) < 2:
     33     #     print("Please provide a search query...")
     34     #     return
     35 
     36     with ix.searcher() as searcher:
     37         query = Every("size")
     38         results = searcher.search(query, limit=9999999)
     39         size_lists = {}
     40         for result in results:
     41             if result["content_type"] not in size_lists:
     42                 size_lists[result["content_type"]] = []
     43             size_lists[result["content_type"]].append(result["size"])
     44         for content_type, size_list in size_lists.items():
     45             if len(size_list) < 16:
     46                 continue
     47             print("\n# {} ({})".format(content_type, len(size_list)))
     48             mean = bytes2human(statistics.mean(size_list), format="%(value).1f %(symbol)s")
     49             median = bytes2human(statistics.median(size_list), format="%(value).1f %(symbol)s")
     50             maximum = bytes2human(max(size_list), format="%(value).1f %(symbol)s")
     51             print("Mean   : {:>8}".format(mean))
     52             print("Median : {:>8}".format(median))
     53             print("Max    : {:>8}".format(maximum))
     54 
     55     # print("Searching index for: \"%s\"" % sys.argv[1])
     56     # ix = open_dir("index")
     57     # with ix.searcher() as searcher:
     58     #     query = MultifieldParser(["content", "url"], ix.schema).parse(sys.argv[1])
     59 
     60     #     results = searcher.search(query)
     61     #     render_results(
     62     #         sys.argv[1],
     63     #         len(results),
     64     #         [(
     65     #             result["indexed_at"],
     66     #             result.score,
     67     #             result["url"],
     68     #             get_highlight(result),
     69     #         ) for result in results]
     70     #     )
     71 
     72 
     73 def render_results(query, num_results, results):
     74     print("          GUS")
     75     print(" Gemini Universal Search")
     76     print("==========================")
     77     print("| You searched for: \"%s\"" % query)
     78     print("| Number of hits: %s" % num_results)
     79     print("==========================")
     80     for i, result in enumerate(results):
     81         if i > 0:
     82             print()
     83         print("=> %s" % result[2])
     84         if len(result[3]) > 0:
     85             print("%s" % result[3])
     86     print("==========================")
     87     print("Page 1 of %s (paging coming later)" % math.ceil(num_results / 10))
     88 
     89 
# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()