commit b64dfafc563cef3d670e4bbbbcbdd8775d5855ce
parent 96b7b661fb2c5f5eed02e4475467ef61a1b4c251
Author: Natalie Pendragon <natpen@natpen.net>
Date: Thu, 23 Jul 2020 06:54:55 -0400
Create scripts directory
Diffstat:
6 files changed, 175 insertions(+), 2 deletions(-)
diff --git a/pyproject.toml b/pyproject.toml
@@ -23,8 +23,6 @@ ipdb = "^0.13.3"
[tool.poetry.scripts]
crawl = "gus.crawl:main"
build_index = "gus.build_index:main"
-search_index = "gus.search_index:main"
-remove_domain = "gus.remove_domain:main"
serve = "serve.main:main"
statistics = "gus.lib.index_statistics:run_index_statistics"
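
With the search_index and remove_domain entry points dropped from [tool.poetry.scripts], the relocated scripts are presumably meant to be run directly from the new scripts/ directory (e.g. python scripts/search_index.py inside the project's virtualenv) rather than as installed console commands.
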
diff --git a/scripts/add_domains.py b/scripts/add_domains.py
@@ -0,0 +1,33 @@
+from whoosh.fields import TEXT
+from whoosh.index import open_dir
+from whoosh.query import Every
+
+from gus.lib.gemini import GeminiResource
+from gus.lib.whoosh_extensions import UrlAnalyzer
+
+def main():
+    ix = open_dir("index")
+
+    with ix.writer() as writer:
+        writer.add_field("domain", TEXT(analyzer=UrlAnalyzer()))
+
+    with ix.searcher() as searcher:
+        query = Every()
+        results = searcher.search(query, limit=None)
+        for result in results:
+            domain = GeminiResource(result["url"]).normalized_host
+            print(domain)
+            with ix.writer() as writer:
+                writer.delete_document(result.docnum)
+                writer.add_document(
+                    url = result["url"],
+                    domain = domain,
+                    content_type = result["content_type"],
+                    content = result["content"] if "content" in result else None,
+                    prompt = result["prompt"] if "prompt" in result else None,
+                    indexed_at = result["indexed_at"],
+                )
+
+
+if __name__ == "__main__":
+    main()
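
add_domains.py opens a fresh writer inside the loop, so it commits once per document. A minimal sketch, not part of this commit, of the same backfill done under a single writer (field names taken from the add_document() call above), so the index is committed once at the end:

    from whoosh.fields import TEXT
    from whoosh.index import open_dir
    from whoosh.query import Every

    from gus.lib.gemini import GeminiResource
    from gus.lib.whoosh_extensions import UrlAnalyzer

    def backfill_domains():
        ix = open_dir("index")
        with ix.writer() as writer:
            # add the new field and rewrite every document in one commit
            writer.add_field("domain", TEXT(analyzer=UrlAnalyzer()))
            with ix.searcher() as searcher:
                for result in searcher.search(Every(), limit=None):
                    writer.delete_document(result.docnum)
                    writer.add_document(
                        url=result["url"],
                        domain=GeminiResource(result["url"]).normalized_host,
                        content_type=result["content_type"],
                        content=result["content"] if "content" in result else None,
                        prompt=result["prompt"] if "prompt" in result else None,
                        indexed_at=result["indexed_at"],
                    )
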
diff --git a/scripts/add_none_charset.py b/scripts/add_none_charset.py
@@ -0,0 +1,35 @@
+from whoosh.fields import ID
+from whoosh.index import open_dir
+from whoosh.query import Every
+
+from gus.lib.gemini import GeminiResource
+from gus.lib.whoosh_extensions import UrlAnalyzer
+
+def main():
+    ix = open_dir("index")
+
+    with ix.searcher() as searcher:
+        query = Every()
+        results = searcher.search(query, limit=None)
+        count = 0
+        for result in results:
+            if "charset" not in result:
+                count += 1
+                with ix.writer() as writer:
+                    writer.delete_document(result.docnum)
+                    writer.add_document(
+                        url = result["url"],
+                        fetchable_url = result["fetchable_url"],
+                        domain = GeminiResource(result["url"]).normalized_host,
+                        content_type = result["content_type"],
+                        charset = "none",
+                        content = result["content"] if "content" in result else None,
+                        regex = result["regex"] if "regex" in result else None,
+                        prompt = result["prompt"] if "prompt" in result else None,
+                        indexed_at = result["indexed_at"],
+                    )
+    print("{} documents updated.".format(count))
+
+
+if __name__ == "__main__":
+    main()
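
add_none_charset.py imports ID but never adds a charset field to the index schema, so that field is presumably already defined on the index. If it were not, a schema step along these lines would be needed first (hypothetical sketch; the stored=True flag is an assumption):

    from whoosh.fields import ID
    from whoosh.index import open_dir

    ix = open_dir("index")
    if "charset" not in ix.schema.names():
        with ix.writer() as writer:
            writer.add_field("charset", ID(stored=True))
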
diff --git a/scripts/add_normalized_url.py b/scripts/add_normalized_url.py
@@ -0,0 +1,16 @@
+from gus import constants
+from gus.lib.db_model import init_db, Page
+from gus.lib.gemini import GeminiResource, GeminiRobotFileParser
+
+def main():
+    db = init_db(f"index.new/{constants.DB_FILENAME}")
+    for page in Page.select():
+        print(f"\nBefore: {page.normalized_url}")
+        page.normalized_url = GeminiResource(page.url).normalized_url
+        page.save()
+        print(f"After : {page.normalized_url}")
+    print("\nDone!")
+
+
+if __name__ == "__main__":
+    main()
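
add_normalized_url.py issues a separate UPDATE per page via save(). On a large database the loop could presumably be wrapped in a single transaction, assuming init_db returns a peewee Database (which is what atomic() requires); a sketch:

    db = init_db(f"index.new/{constants.DB_FILENAME}")
    with db.atomic():  # one transaction for the whole backfill
        for page in Page.select():
            page.normalized_url = GeminiResource(page.url).normalized_url
            page.save()
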
diff --git a/gus/remove_domain.py b/scripts/remove_domain.py
similarity index 100%
rename from gus/remove_domain.py
rename to scripts/remove_domain.py
diff --git a/scripts/search_index.py b/scripts/search_index.py
@@ -0,0 +1,91 @@
+import math
+import re
+import statistics
+import sys
+
+from whoosh.index import open_dir
+from whoosh.query import Every
+from whoosh.qparser import MultifieldParser
+from whoosh import highlight
+
+from gus.lib.whoosh_extensions import GeminiFormatter, GeminiScorer
+from gus.lib.misc import bytes2human
+
+gemini_highlighter = highlight.Highlighter(
+    formatter=GeminiFormatter(),
+    fragmenter=highlight.ContextFragmenter(maxchars=160, surround=80),
+    scorer=GeminiScorer(),
+    order=highlight.SCORE,
+)
+
+def get_highlight(result):
+ if "content" not in result:
+ return ""
+ if result["content_type"] not in ["text/plain", "text/gemini", "text/markdown"]:
+ return ""
+ return gemini_highlighter.highlight_hit(result, "content", top=1).replace(GeminiFormatter.between, "\n")
+
+
+def main():
+    ix = open_dir("index")
+    # ix.optimize()
+    # if len(sys.argv) < 2:
+    #     print("Please provide a search query...")
+    #     return
+
+    with ix.searcher() as searcher:
+        query = Every("size")
+        results = searcher.search(query, limit=9999999)
+        size_lists = {}
+        for result in results:
+            if result["content_type"] not in size_lists:
+                size_lists[result["content_type"]] = []
+            size_lists[result["content_type"]].append(result["size"])
+        for content_type, size_list in size_lists.items():
+            if len(size_list) < 16:
+                continue
+            print("\n# {} ({})".format(content_type, len(size_list)))
+            mean = bytes2human(statistics.mean(size_list), format="%(value).1f %(symbol)s")
+            median = bytes2human(statistics.median(size_list), format="%(value).1f %(symbol)s")
+            maximum = bytes2human(max(size_list), format="%(value).1f %(symbol)s")
+            print("Mean : {:>8}".format(mean))
+            print("Median : {:>8}".format(median))
+            print("Max : {:>8}".format(maximum))
+
+    # print("Searching index for: \"%s\"" % sys.argv[1])
+    # ix = open_dir("index")
+    # with ix.searcher() as searcher:
+    #     query = MultifieldParser(["content", "url"], ix.schema).parse(sys.argv[1])
+
+    #     results = searcher.search(query)
+    #     render_results(
+    #         sys.argv[1],
+    #         len(results),
+    #         [(
+    #             result["indexed_at"],
+    #             result.score,
+    #             result["url"],
+    #             get_highlight(result),
+    #         ) for result in results]
+    #     )
+
+
+def render_results(query, num_results, results):
+ print(" GUS")
+ print(" Gemini Universal Search")
+ print("==========================")
+ print("| You searched for: \"%s\"" % query)
+ print("| Number of hits: %s" % num_results)
+ print("==========================")
+ for i, result in enumerate(results):
+ if i > 0:
+ print()
+ print("=> %s" % result[2])
+ if len(result[3]) > 0:
+ print("%s" % result[3])
+ print("==========================")
+ print("Page 1 of %s (paging coming later)" % math.ceil(num_results / 10))
+
+
+if __name__ == "__main__":
+    main()
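
The interactive search path in search_index.py is kept but commented out; re-enabled, it would presumably look like the following, reconstructed from the commented block above (a sketch, not shipped behaviour):

    def search(query_string):
        ix = open_dir("index")
        with ix.searcher() as searcher:
            query = MultifieldParser(["content", "url"], ix.schema).parse(query_string)
            results = searcher.search(query)
            render_results(
                query_string,
                len(results),
                [(
                    result["indexed_at"],
                    result.score,
                    result["url"],
                    get_highlight(result),
                ) for result in results],
            )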