add_none_charset.py (1357B)
from whoosh.fields import ID
from whoosh.index import open_dir
from whoosh.query import Every

from gus.lib.gemini import GeminiResource
from gus.lib.whoosh_extensions import UrlAnalyzer


def main():
    """Backfill ``charset="none"`` on indexed documents that have no charset.

    Opens the Whoosh index in the ``index`` directory, walks every document,
    and re-adds each one that lacks a stored ``charset`` field with
    ``charset`` set to the string ``"none"``. The ``domain`` field is
    recomputed from the stored ``url``; all other fields are copied from the
    stored document (``content``, ``regex`` and ``prompt`` become ``None``
    when absent). Prints the number of documents rewritten.
    """
    ix = open_dir("index")

    count = 0
    with ix.searcher() as searcher:
        results = searcher.search(Every(), limit=None)
        # Use ONE writer for the whole pass. Opening (and committing) a writer
        # per document is slow, and a commit may merge segments, after which
        # the docnums held by this searcher snapshot would no longer be safe
        # to pass to delete_document().
        with ix.writer() as writer:
            for result in results:
                if "charset" in result:
                    continue
                count += 1
                # Replace the stored document with a copy carrying a charset.
                writer.delete_document(result.docnum)
                writer.add_document(
                    url=result["url"],
                    fetchable_url=result["fetchable_url"],
                    domain=GeminiResource(result["url"]).normalized_host,
                    content_type=result["content_type"],
                    charset="none",
                    content=result["content"] if "content" in result else None,
                    regex=result["regex"] if "regex" in result else None,
                    prompt=result["prompt"] if "prompt" in result else None,
                    indexed_at=result["indexed_at"],
                )
    print("{} documents updated.".format(count))


if __name__ == "__main__":
    main()