geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit 7093ad2aa8b298e8648f2c40381d1e288ef50605
parent 73a9aca69749a2ce6849e4aa328810dbf4cb397d
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Mon, 11 May 2020 13:56:48 -0400

[serve] Report out current index statistics

Diffstat:
M .gitignore | 1 +
M README.md | 5 -----
M gus/lib/index_statistics.py | 24 ++++++++++++++++++++++++
M gus/serve.py | 31 +++++++++++++++++++++++++++++++
4 files changed, 56 insertions(+), 5 deletions(-)

diff --git a/.gitignore b/.gitignore
@@ -131,3 +131,4 @@ dmypy.json
 
 /index/
 /seed-requests.txt
+/index-statistics.csv
diff --git a/README.md b/README.md
@@ -45,8 +45,3 @@ as this guide to [mailing list etiquette](https://man.sr.ht/lists.sr.ht/etiquett
   type of construct in the Gemini spec now, so I should probably add a
   TODO to refactor the extract_gemini_links function to exclude any
   links found within such a block.
-- **track aggregate content statistics**: it would be nice to track
-  some statistics about Geminispace over time, like perhaps:
-  - total number of domains
-  - total number of pages (by content type too)
-  - total number of words
diff --git a/gus/lib/index_statistics.py b/gus/lib/index_statistics.py
@@ -59,3 +59,27 @@ def serialize_index_statistics_line(index_modification_time, page_count, domain_
         "|".join(domains),
         "|".join("{}:{}".format(pair[0], pair[1]) for pair in content_type_frequencies),
     )
+
+
+def load_last_index_statistics_from_file(filename):
+    with open(filename) as f:
+        data = f.readlines()
+    lastline = data[-1].strip()
+    index_statistics = deserialize_index_statistics_line(lastline)
+    return index_statistics
+
+
+def deserialize_index_statistics_line(line):
+    line_parts = line.split(",")
+    index_modification_time = line_parts[0]
+    page_count = line_parts[1]
+    domain_count = line_parts[2]
+    domains = [domain for domain in line_parts[3].split("|")]
+    content_type_frequencies = [(pair.split(":")[0], pair.split(":")[1]) for pair in line_parts[4].split("|")]
+    return {
+        "index_modification_time": index_modification_time,
+        "page_count": page_count,
+        "domain_count": domain_count,
+        "domains": domains,
+        "content_type_frequencies": content_type_frequencies,
+    }
diff --git a/gus/serve.py b/gus/serve.py
@@ -12,7 +12,9 @@ from jetforce import Response, Status
 from whoosh.index import open_dir
 from whoosh.qparser import MultifieldParser
 
+from gus.lib.index_statistics import load_last_index_statistics_from_file
 
+last_index_statistics = load_last_index_statistics_from_file("index-statistics.csv")
 app = jetforce.JetforceApplication()
 
 
@@ -34,11 +36,32 @@ def _render_footer():
     ]
 
 
+def _render_index_statistics():
+    d = [
+        "",
+        "## Overall",
+        "These figures are reflective of the aggregate size of Geminispace",
+        "when the current index was generated on {}.".format(last_index_statistics["index_modification_time"]),
+        "",
+        "Page Count : {:>5}".format(last_index_statistics["page_count"]),
+        "Domain Count : {:>5}".format(last_index_statistics["domain_count"]),
+        "",
+        "## By Content Type",
+        "These figures represent the number of pages seen per content type",
+        "when the current index was generated on {}.".format(last_index_statistics["index_modification_time"]),
+        "",
+    ]
+    for pair in last_index_statistics["content_type_frequencies"]:
+        d.append("{:>5} - {}".format(pair[1], pair[0]))
+    return d
+
+
 @app.route("")
 def index(request):
     data = _render_header()
     data.extend([
         "=> /about About GUS",
+        "=> /statistics GUS Statistics",
         "=> gemini://gemini.circumlunar.space Gemini Project information"
     ])
     data.extend(_render_footer())
@@ -93,6 +116,14 @@ def index(request):
     return Response(Status.SUCCESS, "text/gemini", "\n".join(data))
 
 
+@app.route("/statistics")
+def index(request):
+    data = _render_header()
+    data.extend(_render_index_statistics())
+    data.extend(_render_footer())
+    return Response(Status.SUCCESS, "text/gemini", "\n".join(data))
+
+
 def _search_index(query):
     query = MultifieldParser(["content", "url", "prompt"], ix.schema).parse(query)
     results = searcher.search(query)