geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

add_domains.py (1135B)


      1 from whoosh.fields import TEXT
      2 from whoosh.index import open_dir
      3 from whoosh.query import Every
      4 
      5 from gus.lib.gemini import GeminiResource
      6 from gus.lib.whoosh_extensions import UrlAnalyzer
      7 
      8 def main():
      9     ix = open_dir("index")
     10 
     11     with ix.writer() as writer:
     12         writer.add_field("domain", TEXT(analyzer=UrlAnalyzer()))
     13 
     14     with ix.searcher() as searcher:
     15         query = Every()
     16         results = searcher.search(query, limit=None)
     17         for result in results:
     18             domain = GeminiResource(result["url"]).normalized_host
     19             print(domain)
     20             with ix.writer() as writer:
     21                 writer.delete_document(result.docnum)
     22                 writer.add_document(
     23                     url          = result["url"],
     24                     domain       = domain,
     25                     content_type = result["content_type"],
     26                     content      = result["content"] if "content" in result else None,
     27                     prompt       = result["prompt"] if "prompt" in result else None,
     28                     indexed_at   = result["indexed_at"],
     29                 )
     30 
     31 
     32 if __name__ == "__main__":
     33     main()