geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

add_none_charset.py (1357B)


      1 from whoosh.fields import ID
      2 from whoosh.index import open_dir
      3 from whoosh.query import Every
      4 
      5 from gus.lib.gemini import GeminiResource
      6 from gus.lib.whoosh_extensions import UrlAnalyzer
      7 
      8 def main():
      9     ix = open_dir("index")
     10 
     11     with ix.searcher() as searcher:
     12         query = Every()
     13         results = searcher.search(query, limit=None)
     14         count = 0
     15         for result in results:
     16             if "charset" not in result:
     17                 count += 1
     18                 with ix.writer() as writer:
     19                     writer.delete_document(result.docnum)
     20                     writer.add_document(
     21                         url          = result["url"],
     22                         fetchable_url= result["fetchable_url"],
     23                         domain       = GeminiResource(result["url"]).normalized_host,
     24                         content_type = result["content_type"],
     25                         charset      = "none",
     26                         content      = result["content"] if "content" in result else None,
     27                         regex        = result["regex"] if "regex" in result else None,
     28                         prompt       = result["prompt"] if "prompt" in result else None,
     29                         indexed_at   = result["indexed_at"],
     30                     )
     31         print("{} documents updated.".format(count))
     32 
     33 
# Run the charset backfill only when executed as a script, not on import.
if __name__ == "__main__":
    main()