commit b64dfafc563cef3d670e4bbbbcbdd8775d5855ce
parent 96b7b661fb2c5f5eed02e4475467ef61a1b4c251
Author: Natalie Pendragon <natpen@natpen.net>
Date: Thu, 23 Jul 2020 06:54:55 -0400
Create scripts directory
Diffstat:
6 files changed, 175 insertions(+), 2 deletions(-)
diff --git a/pyproject.toml b/pyproject.toml
@@ -23,8 +23,6 @@ ipdb = "^0.13.3"
[tool.poetry.scripts]
crawl = "gus.crawl:main"
build_index = "gus.build_index:main"
-search_index = "gus.search_index:main"
-remove_domain = "gus.remove_domain:main"
serve = "serve.main:main"
statistics = "gus.lib.index_statistics:run_index_statistics"
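
With the search_index and remove_domain entry points dropped from [tool.poetry.scripts], the relocated scripts are presumably meant to be run directly from the new scripts/ directory (e.g. python scripts/search_index.py inside the project's virtualenv) rather than as installed console commands.
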
diff --git a/scripts/add_domains.py b/scripts/add_domains.py
@@ -0,0 +1,33 @@
+from whoosh.fields import TEXT
+from whoosh.index import open_dir
+from whoosh.query import Every
+
+from gus.lib.gemini import GeminiResource
+from gus.lib.whoosh_extensions import UrlAnalyzer
+
+def main():
+    ix = open_dir("index")
+
+    with ix.writer() as writer:
+        writer.add_field("domain", TEXT(analyzer=UrlAnalyzer()))
+
+    with ix.searcher() as searcher:
+        query = Every()
+        results = searcher.search(query, limit=None)
+        for result in results:
+            domain = GeminiResource(result["url"]).normalized_host
+            print(domain)
+            with ix.writer() as writer:
+                writer.delete_document(result.docnum)
+                writer.add_document(
+                    url = result["url"],
+                    domain = domain,
+                    content_type = result["content_type"],
+                    content = result["content"] if "content" in result else None,
+                    prompt = result["prompt"] if "prompt" in result else None,
+                    indexed_at = result["indexed_at"],
+                )
+
+
+if __name__ == "__main__":
+    main()
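
add_domains.py opens a fresh writer inside the loop, so it commits once per document. A minimal sketch, not part of this commit, of the same backfill done under a single writer (field names taken from the add_document() call above), so the index is committed once at the end:

    from whoosh.fields import TEXT
    from whoosh.index import open_dir
    from whoosh.query import Every

    from gus.lib.gemini import GeminiResource
    from gus.lib.whoosh_extensions import UrlAnalyzer

    def backfill_domains():
        ix = open_dir("index")
        with ix.writer() as writer:
            # add the new field and rewrite every document in one commit
            writer.add_field("domain", TEXT(analyzer=UrlAnalyzer()))
            with ix.searcher() as searcher:
                for result in searcher.search(Every(), limit=None):
                    writer.delete_document(result.docnum)
                    writer.add_document(
                        url=result["url"],
                        domain=GeminiResource(result["url"]).normalized_host,
                        content_type=result["content_type"],
                        content=result["content"] if "content" in result else None,
                        prompt=result["prompt"] if "prompt" in result else None,
                        indexed_at=result["indexed_at"],
                    )
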
diff --git a/scripts/add_none_charset.py b/scripts/add_none_charset.py
@@ -0,0 +1,35 @@
+from whoosh.fields import ID
+from whoosh.index import open_dir
+from whoosh.query import Every
+
+from gus.lib.gemini import GeminiResource
+from gus.lib.whoosh_extensions import UrlAnalyzer
+
+def main():
+    ix = open_dir("index")
+
+    with ix.searcher() as searcher:
+        query = Every()
+        results = searcher.search(query, limit=None)
+        count = 0
+        for result in results:
+            if "charset" not in result:
+                count += 1
+                with ix.writer() as writer:
+                    writer.delete_document(result.docnum)
+                    writer.add_document(
+                        url = result["url"],
+                        fetchable_url = result["fetchable_url"],
+                        domain = GeminiResource(result["url"]).normalized_host,
+                        content_type = result["content_type"],
+                        charset = "none",
+                        content = result["content"] if "content" in result else None,
+                        regex = result["regex"] if "regex" in result else None,
+                        prompt = result["prompt"] if "prompt" in result else None,
+                        indexed_at = result["indexed_at"],
+                    )
+    print("{} documents updated.".format(count))
+
+
+if __name__ == "__main__":
+    main()
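
add_none_charset.py imports ID but never adds a charset field to the index schema, so that field is presumably already defined on the index. If it were not, a schema step along these lines would be needed first (hypothetical sketch; the stored=True flag is an assumption):

    from whoosh.fields import ID
    from whoosh.index import open_dir

    ix = open_dir("index")
    if "charset" not in ix.schema.names():
        with ix.writer() as writer:
            writer.add_field("charset", ID(stored=True))
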
diff --git a/scripts/add_normalized_url.py b/scripts/add_normalized_url.py
@@ -0,0 +1,16 @@
+from gus import constants
+from gus.lib.db_model import init_db, Page
+from gus.lib.gemini import GeminiResource, GeminiRobotFileParser
+
+def main():
+    db = init_db(f"index.new/{constants.DB_FILENAME}")
+    for page in Page.select():
+        print(f"\nBefore: {page.normalized_url}")
+        page.normalized_url = GeminiResource(page.url).normalized_url
+        page.save()
+        print(f"After : {page.normalized_url}")
+    print("\nDone!")
+
+
+if __name__ == "__main__":
+    main()
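
add_normalized_url.py issues a separate UPDATE per page via save(). On a large database the loop could presumably be wrapped in a single transaction, assuming init_db returns a peewee Database (which is what atomic() requires); a sketch:

    db = init_db(f"index.new/{constants.DB_FILENAME}")
    with db.atomic():  # one transaction for the whole backfill
        for page in Page.select():
            page.normalized_url = GeminiResource(page.url).normalized_url
            page.save()
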
diff --git a/gus/remove_domain.py b/scripts/remove_domain.py
similarity index 100%
rename from gus/remove_domain.py
rename to scripts/remove_domain.py
diff --git a/scripts/search_index.py b/scripts/search_index.py
@@ -0,0 +1,91 @@
+import math
+import re
+import statistics
+import sys
+
+from whoosh.index import open_dir
+from whoosh.query import Every
+from whoosh.qparser import MultifieldParser
+from whoosh import highlight
+
+from gus.lib.whoosh_extensions import GeminiFormatter, GeminiScorer
+from gus.lib.misc import bytes2human
+
+gemini_highlighter = highlight.Highlighter(
+    formatter=GeminiFormatter(),
+    fragmenter=highlight.ContextFragmenter(maxchars=160, surround=80),
+    scorer=GeminiScorer(),
+    order=highlight.SCORE,
+)
+
+def get_highlight(result):
+ if "content" not in result:
+ return ""
+ if result["content_type"] not in ["text/plain", "text/gemini", "text/markdown"]:
+ return ""
+ return gemini_highlighter.highlight_hit(result, "content", top=1).replace(GeminiFormatter.between, "\n")
+
+
+def main():
+    ix = open_dir("index")
+    # ix.optimize()
+    # if len(sys.argv) < 2:
+    #     print("Please provide a search query...")
+    #     return
+
+    with ix.searcher() as searcher:
+        query = Every("size")
+        results = searcher.search(query, limit=9999999)
+        size_lists = {}
+        for result in results:
+            if result["content_type"] not in size_lists:
+                size_lists[result["content_type"]] = []
+            size_lists[result["content_type"]].append(result["size"])
+        for content_type, size_list in size_lists.items():
+            if len(size_list) < 16:
+                continue
+            print("\n# {} ({})".format(content_type, len(size_list)))
+            mean = bytes2human(statistics.mean(size_list), format="%(value).1f %(symbol)s")
+            median = bytes2human(statistics.median(size_list), format="%(value).1f %(symbol)s")
+            maximum = bytes2human(max(size_list), format="%(value).1f %(symbol)s")
+            print("Mean : {:>8}".format(mean))
+            print("Median : {:>8}".format(median))
+            print("Max : {:>8}".format(maximum))
+
+    # print("Searching index for: \"%s\"" % sys.argv[1])
+    # ix = open_dir("index")
+    # with ix.searcher() as searcher:
+    #     query = MultifieldParser(["content", "url"], ix.schema).parse(sys.argv[1])
+
+    #     results = searcher.search(query)
+    #     render_results(
+    #         sys.argv[1],
+    #         len(results),
+    #         [(
+    #             result["indexed_at"],
+    #             result.score,
+    #             result["url"],
+    #             get_highlight(result),
+    #         ) for result in results]
+    #     )
+
+
+def render_results(query, num_results, results):
+ print(" GUS")
+ print(" Gemini Universal Search")
+ print("==========================")
+ print("| You searched for: \"%s\"" % query)
+ print("| Number of hits: %s" % num_results)
+ print("==========================")
+ for i, result in enumerate(results):
+ if i > 0:
+ print()
+ print("=> %s" % result[2])
+ if len(result[3]) > 0:
+ print("%s" % result[3])
+ print("==========================")
+ print("Page 1 of %s (paging coming later)" % math.ceil(num_results / 10))
+
+
+if __name__ == "__main__":
+    main()
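
The interactive search path in search_index.py is kept but commented out; re-enabled, it would presumably look like the following, reconstructed from the commented block above (a sketch, not shipped behaviour):

    def search(query_string):
        ix = open_dir("index")
        with ix.searcher() as searcher:
            query = MultifieldParser(["content", "url"], ix.schema).parse(query_string)
            results = searcher.search(query)
            render_results(
                query_string,
                len(results),
                [(
                    result["indexed_at"],
                    result.score,
                    result["url"],
                    get_highlight(result),
                ) for result in results],
            )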