commit 7093ad2aa8b298e8648f2c40381d1e288ef50605
parent 73a9aca69749a2ce6849e4aa328810dbf4cb397d
Author: Natalie Pendragon <natpen@natpen.net>
Date: Mon, 11 May 2020 13:56:48 -0400
[serve] Report out current index statistics
Diffstat:
4 files changed, 56 insertions(+), 5 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -131,3 +131,4 @@ dmypy.json
/index/
/seed-requests.txt
+/index-statistics.csv
diff --git a/README.md b/README.md
@@ -45,8 +45,3 @@ as this guide to [mailing list etiquette](https://man.sr.ht/lists.sr.ht/etiquett
type of construct in the Gemini spec now, so I should probably
add a TODO to refactor the extract_gemini_links function to
exclude any links found within such a block.
-- **track aggregate content statistics**: it would be nice to track
- some statistics about Geminispace over time, like perhaps:
- - total number of domains
- - total number of pages (by content type too)
- - total number of words
diff --git a/gus/lib/index_statistics.py b/gus/lib/index_statistics.py
@@ -59,3 +59,27 @@ def serialize_index_statistics_line(index_modification_time, page_count, domain_
"|".join(domains),
"|".join("{}:{}".format(pair[0], pair[1]) for pair in content_type_frequencies),
)
+
+
+def load_last_index_statistics_from_file(filename):
+    """Return the statistics parsed from the last record in *filename*.
+
+    The file is scanned line by line and the last non-blank line is handed
+    to ``deserialize_index_statistics_line``.
+
+    Args:
+        filename: Path of the statistics file to read.
+
+    Returns:
+        The dict produced by ``deserialize_index_statistics_line`` for the
+        last non-blank line of the file.
+
+    Raises:
+        OSError: If the file cannot be opened.
+        ValueError: If the file contains no non-blank line.
+    """
+    last_line = None
+    with open(filename) as f:
+        # Stream the file instead of materialising every line; only the
+        # final record is needed. Blank lines (e.g. a stray trailing
+        # newline) are skipped so they are never mistaken for a record.
+        for line in f:
+            stripped = line.strip()
+            if stripped:
+                last_line = stripped
+    if last_line is None:
+        raise ValueError("no index statistics found in {!r}".format(filename))
+    return deserialize_index_statistics_line(last_line)
+
+
+def deserialize_index_statistics_line(line):
+    """Parse one comma-separated statistics line into a dict.
+
+    The first five comma-separated fields are read, in order: index
+    modification time, page count, domain count, ``|``-separated domains,
+    and ``|``-separated ``content_type:count`` pairs. Any further fields
+    are ignored.
+
+    Args:
+        line: A single statistics line without its trailing newline.
+
+    Returns:
+        A dict with the keys ``index_modification_time``, ``page_count``,
+        ``domain_count``, ``domains`` (list of str) and
+        ``content_type_frequencies`` (list of ``(content_type, count)``
+        tuples). Every value is kept as the string found in the line;
+        counts are not converted to ``int``.
+
+    Raises:
+        ValueError: If the line has fewer than five fields or a
+            content-type pair contains no ``:`` separator.
+    """
+    line_parts = line.split(",")
+    if len(line_parts) < 5:
+        raise ValueError(
+            "expected at least 5 comma-separated fields, got {}: {!r}".format(
+                len(line_parts), line
+            )
+        )
+    index_modification_time = line_parts[0]
+    page_count = line_parts[1]
+    domain_count = line_parts[2]
+    raw_domains = line_parts[3]
+    raw_frequencies = line_parts[4]
+
+    # "".split("|") yields [""], so an empty field is special-cased to mean
+    # "no entries" rather than one empty entry.
+    domains = raw_domains.split("|") if raw_domains else []
+
+    content_type_frequencies = []
+    for pair in raw_frequencies.split("|") if raw_frequencies else []:
+        # The count is the trailing component, so split on the LAST colon;
+        # a content type that itself contains ":" is then kept intact.
+        content_type, separator, frequency = pair.rpartition(":")
+        if not separator:
+            raise ValueError(
+                "malformed content type frequency {!r} in line {!r}".format(pair, line)
+            )
+        content_type_frequencies.append((content_type, frequency))
+
+    return {
+        "index_modification_time": index_modification_time,
+        "page_count": page_count,
+        "domain_count": domain_count,
+        "domains": domains,
+        "content_type_frequencies": content_type_frequencies,
+    }
diff --git a/gus/serve.py b/gus/serve.py
@@ -12,7 +12,9 @@ from jetforce import Response, Status
from whoosh.index import open_dir
from whoosh.qparser import MultifieldParser
+from gus.lib.index_statistics import load_last_index_statistics_from_file
+# Statistics of the current index, loaded once when this module is imported.
+# The path is relative to the process's working directory, and a missing or
+# unreadable file makes the import itself fail.
+# NOTE(review): nothing visible here reloads this value, so the served figures
+# presumably stay fixed until the server restarts -- confirm.
+last_index_statistics = load_last_index_statistics_from_file("index-statistics.csv")
app = jetforce.JetforceApplication()
@@ -34,11 +36,32 @@ def _render_footer():
]
+def _render_index_statistics():
+    """Return the lines of the statistics section as a list of strings.
+
+    Reads the module-level ``last_index_statistics`` mapping and produces an
+    "Overall" section (page and domain counts) followed by a "By Content
+    Type" section with one line per ``(content_type, count)`` pair.
+    """
+    stats = last_index_statistics
+    # Both sections end with the same "generated on" sentence.
+    generated_on_line = "when the current index was generated on {}.".format(
+        stats["index_modification_time"]
+    )
+
+    overall_section = [
+        "",
+        "## Overall",
+        "These figures are reflective of the aggregate size of Geminispace",
+        generated_on_line,
+        "",
+        "Page Count : {:>5}".format(stats["page_count"]),
+        "Domain Count : {:>5}".format(stats["domain_count"]),
+    ]
+    content_type_section = [
+        "",
+        "## By Content Type",
+        "These figures represent the number of pages seen per content type",
+        generated_on_line,
+        "",
+    ]
+    # Each entry is (content_type, count); the count is printed first.
+    content_type_section.extend(
+        "{:>5} - {}".format(entry[1], entry[0])
+        for entry in stats["content_type_frequencies"]
+    )
+    return overall_section + content_type_section
+
+
@app.route("")
def index(request):
data = _render_header()
data.extend([
"=> /about About GUS",
+ "=> /statistics GUS Statistics",
"=> gemini://gemini.circumlunar.space Gemini Project information"
])
data.extend(_render_footer())
@@ -93,6 +116,14 @@ def index(request):
return Response(Status.SUCCESS, "text/gemini", "\n".join(data))
+@app.route("/statistics")
+def statistics(request):
+    """Serve the /statistics page: header, index statistics, then footer.
+
+    Named ``statistics`` rather than ``index`` so it does not rebind the
+    module-level name of the home-route handler; the route is registered
+    by the decorator, so the served URL is unaffected by the function name.
+
+    Args:
+        request: The incoming request; not inspected by this handler.
+
+    Returns:
+        A successful ``text/gemini`` ``Response`` whose body is the
+        rendered lines joined with newlines.
+    """
+    data = _render_header()
+    data.extend(_render_index_statistics())
+    data.extend(_render_footer())
+    return Response(Status.SUCCESS, "text/gemini", "\n".join(data))
+
+
def _search_index(query):
query = MultifieldParser(["content", "url", "prompt"], ix.schema).parse(query)
results = searcher.search(query)