geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git

commit b64dfafc563cef3d670e4bbbbcbdd8775d5855ce
parent 96b7b661fb2c5f5eed02e4475467ef61a1b4c251
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Thu, 23 Jul 2020 06:54:55 -0400

Create scripts directory

Diffstat:
M pyproject.toml                                   |   2 --
A scripts/add_domains.py                           |  33 +++++++++++++++++++++++++++++++++
A scripts/add_none_charset.py                      |  35 +++++++++++++++++++++++++++++++++++
A scripts/add_normalized_url.py                    |  16 ++++++++++++++++
R gus/remove_domain.py -> scripts/remove_domain.py |   0
A scripts/search_index.py                          |  91 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
6 files changed, 175 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
@@ -23,8 +23,6 @@ ipdb = "^0.13.3"
 
 [tool.poetry.scripts]
 crawl = "gus.crawl:main"
 build_index = "gus.build_index:main"
-search_index = "gus.search_index:main"
-remove_domain = "gus.remove_domain:main"
 serve = "serve.main:main"
 statistics = "gus.lib.index_statistics:run_index_statistics"
diff --git a/scripts/add_domains.py b/scripts/add_domains.py
@@ -0,0 +1,33 @@
+from whoosh.fields import TEXT
+from whoosh.index import open_dir
+from whoosh.query import Every
+
+from gus.lib.gemini import GeminiResource
+from gus.lib.whoosh_extensions import UrlAnalyzer
+
+def main():
+    ix = open_dir("index")
+
+    with ix.writer() as writer:
+        writer.add_field("domain", TEXT(analyzer=UrlAnalyzer()))
+
+    with ix.searcher() as searcher:
+        query = Every()
+        results = searcher.search(query, limit=None)
+        for result in results:
+            domain = GeminiResource(result["url"]).normalized_host
+            print(domain)
+            with ix.writer() as writer:
+                writer.delete_document(result.docnum)
+                writer.add_document(
+                    url = result["url"],
+                    domain = domain,
+                    content_type = result["content_type"],
+                    content = result["content"] if "content" in result else None,
+                    prompt = result["prompt"] if "prompt" in result else None,
+                    indexed_at = result["indexed_at"],
+                )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/add_none_charset.py b/scripts/add_none_charset.py
@@ -0,0 +1,35 @@
+from whoosh.fields import ID
+from whoosh.index import open_dir
+from whoosh.query import Every
+
+from gus.lib.gemini import GeminiResource
+from gus.lib.whoosh_extensions import UrlAnalyzer
+
+def main():
+    ix = open_dir("index")
+
+    with ix.searcher() as searcher:
+        query = Every()
+        results = searcher.search(query, limit=None)
+        count = 0
+        for result in results:
+            if "charset" not in result:
+                count += 1
+                with ix.writer() as writer:
+                    writer.delete_document(result.docnum)
+                    writer.add_document(
+                        url = result["url"],
+                        fetchable_url = result["fetchable_url"],
+                        domain = GeminiResource(result["url"]).normalized_host,
+                        content_type = result["content_type"],
+                        charset = "none",
+                        content = result["content"] if "content" in result else None,
+                        regex = result["regex"] if "regex" in result else None,
+                        prompt = result["prompt"] if "prompt" in result else None,
+                        indexed_at = result["indexed_at"],
+                    )
+        print("{} documents updated.".format(count))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/add_normalized_url.py b/scripts/add_normalized_url.py
@@ -0,0 +1,16 @@
+from gus import constants
+from gus.lib.db_model import init_db, Page
+from gus.lib.gemini import GeminiResource, GeminiRobotFileParser
+
+def main():
+    db = init_db(f"index.new/{constants.DB_FILENAME}")
+    for page in Page.select():
+        print(f"\nBefore: {page.normalized_url}")
+        page.normalized_url = GeminiResource(page.url).normalized_url
+        page.save()
+        print(f"After : {page.normalized_url}")
+    print("\nDone!")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/gus/remove_domain.py b/scripts/remove_domain.py
diff --git a/scripts/search_index.py b/scripts/search_index.py
@@ -0,0 +1,91 @@
+import math
+import re
+import statistics
+import sys
+
+from whoosh.index import open_dir
+from whoosh.query import Every
+from whoosh.qparser import MultifieldParser
+from whoosh import highlight
+
+from gus.lib.whoosh_extensions import GeminiFormatter, GeminiScorer
+from gus.lib.misc import bytes2human
+
+gemini_highlighter = highlight.Highlighter(
+    formatter=GeminiFormatter(),
+    fragmenter=highlight.ContextFragmenter(maxchars=160, surround=80),
+    scorer=GeminiScorer(),
+    order=highlight.SCORE,
+)
+
+def get_highlight(result):
+    if "content" not in result:
+        return ""
+    if result["content_type"] not in ["text/plain", "text/gemini", "text/markdown"]:
+        return ""
+    return gemini_highlighter.highlight_hit(result, "content", top=1).replace(GeminiFormatter.between, "\n")
+
+
+def main():
+    ix = open_dir("index")
+    # ix.optimize()
+    # if len(sys.argv) < 2:
+    #     print("Please provide a search query...")
+    #     return
+
+    with ix.searcher() as searcher:
+        query = Every("size")
+        results = searcher.search(query, limit=9999999)
+        size_lists = {}
+        for result in results:
+            if result["content_type"] not in size_lists:
+                size_lists[result["content_type"]] = []
+            size_lists[result["content_type"]].append(result["size"])
+        for content_type, size_list in size_lists.items():
+            if len(size_list) < 16:
+                continue
+            print("\n# {} ({})".format(content_type, len(size_list)))
+            mean = bytes2human(statistics.mean(size_list), format="%(value).1f %(symbol)s")
+            median = bytes2human(statistics.median(size_list), format="%(value).1f %(symbol)s")
+            maximum = bytes2human(max(size_list), format="%(value).1f %(symbol)s")
+            print("Mean : {:>8}".format(mean))
+            print("Median : {:>8}".format(median))
+            print("Max : {:>8}".format(maximum))
+
+    # print("Searching index for: \"%s\"" % sys.argv[1])
+    # ix = open_dir("index")
+    # with ix.searcher() as searcher:
+    #     query = MultifieldParser(["content", "url"], ix.schema).parse(sys.argv[1])
+
+    #     results = searcher.search(query)
+    #     render_results(
+    #         sys.argv[1],
+    #         len(results),
+    #         [(
+    #             result["indexed_at"],
+    #             result.score,
+    #             result["url"],
+    #             get_highlight(result),
+    #         ) for result in results]
+    #     )
+
+
+def render_results(query, num_results, results):
+    print(" GUS")
+    print(" Gemini Universal Search")
+    print("==========================")
+    print("| You searched for: \"%s\"" % query)
+    print("| Number of hits: %s" % num_results)
+    print("==========================")
+    for i, result in enumerate(results):
+        if i > 0:
+            print()
+        print("=> %s" % result[2])
+        if len(result[3]) > 0:
+            print("%s" % result[3])
+    print("==========================")
+    print("Page 1 of %s (paging coming later)" % math.ceil(num_results / 10))
+
+
+if __name__ == "__main__":
+    main()
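
Note: since this commit removes the search_index and remove_domain entries from [tool.poetry.scripts], the relocated maintenance scripts are presumably meant to be invoked directly rather than as installed commands. A minimal sketch, assuming they are run from the repository root inside the project's Poetry environment so the gus package is importable:

poetry run python scripts/add_domains.py
poetry run python scripts/search_index.py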