add_domains.py (1135B)
"""One-off migration: add a ``domain`` field to every document in the index."""

from whoosh.fields import TEXT
from whoosh.index import open_dir
from whoosh.query import Every

from gus.lib.gemini import GeminiResource
from gus.lib.whoosh_extensions import UrlAnalyzer


def main():
    """Add the ``domain`` field to the schema and re-index every document.

    Opens the index stored in the ``index`` directory, registers a new
    ``domain`` field, then replaces each existing document with a copy that
    also carries the normalized host of its URL. Each domain is printed as
    it is processed.
    """
    ix = open_dir("index")

    # Schema change is committed on its own so the searcher opened below
    # already sees the new field.
    with ix.writer() as writer:
        writer.add_field("domain", TEXT(analyzer=UrlAnalyzer()))

    # A single writer handles all replacements. Document numbers from the
    # searcher are only meaningful against the index generation it was
    # opened on; committing (and possibly merging segments) after every
    # document would renumber documents and make later ``result.docnum``
    # values point at the wrong entries. One writer also means one commit
    # instead of one per document.
    with ix.searcher() as searcher, ix.writer() as writer:
        results = searcher.search(Every(), limit=None)
        for result in results:
            url = result["url"]
            domain = GeminiResource(url).normalized_host
            print(domain)
            writer.delete_document(result.docnum)
            writer.add_document(
                url=url,
                domain=domain,
                content_type=result["content_type"],
                # content and prompt are optional; absent ones stay unset.
                content=result["content"] if "content" in result else None,
                prompt=result["prompt"] if "prompt" in result else None,
                indexed_at=result["indexed_at"],
            )


if __name__ == "__main__":
    main()