[serve] Report out current index statistics - geminispace.info

commit 7093ad2aa8b298e8648f2c40381d1e288ef50605
parent 73a9aca69749a2ce6849e4aa328810dbf4cb397d
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Mon, 11 May 2020 13:56:48 -0400

[serve] Report out current index statistics

Diffstat:
M .gitignore  | 1 +
M README.md  | 5 -----
M gus/lib/index_statistics.py  | 24 ++++++++++++++++++++++++
M gus/serve.py  | 31 +++++++++++++++++++++++++++++++

4 files changed, 56 insertions(+), 5 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -131,3 +131,4 @@ dmypy.json
 /index/
 
 /seed-requests.txt
+/index-statistics.csv
diff --git a/README.md b/README.md
@@ -45,8 +45,3 @@ as this guide to [mailing list etiquette](https://man.sr.ht/lists.sr.ht/etiquett
   type of construct in the Gemini spec now, so I should probably
   add a TODO to refactor the extract_gemini_links function to
   exclude any links found within such a block.
-- **track aggregate content statistics**: it would be nice to track
-  some statistics about Geminispace over time, like perhaps:
-  - total number of domains
-  - total number of pages (by content type too)
-  - total number of words
diff --git a/gus/lib/index_statistics.py b/gus/lib/index_statistics.py
@@ -59,3 +59,27 @@ def serialize_index_statistics_line(index_modification_time, page_count, domain_
         "|".join(domains),
         "|".join("{}:{}".format(pair[0], pair[1]) for pair in content_type_frequencies),
     )
+
+
+def load_last_index_statistics_from_file(filename):
+    with open(filename) as f:
+        data = f.readlines()
+    lastline = data[-1].strip()
+    index_statistics = deserialize_index_statistics_line(lastline)
+    return index_statistics
+
+
+def deserialize_index_statistics_line(line):
+    line_parts = line.split(",")
+    index_modification_time = line_parts[0]
+    page_count = line_parts[1]
+    domain_count = line_parts[2]
+    domains = [domain for domain in line_parts[3].split("|")]
+    content_type_frequencies = [(pair.split(":")[0], pair.split(":")[1]) for pair in line_parts[4].split("|")]
+    return {
+        "index_modification_time": index_modification_time,
+        "page_count": page_count,
+        "domain_count": domain_count,
+        "domains": domains,
+        "content_type_frequencies": content_type_frequencies,
+    }
diff --git a/gus/serve.py b/gus/serve.py
@@ -12,7 +12,9 @@ from jetforce import Response, Status
 from whoosh.index import open_dir
 from whoosh.qparser import MultifieldParser
 
+from gus.lib.index_statistics import load_last_index_statistics_from_file
 
+last_index_statistics = load_last_index_statistics_from_file("index-statistics.csv")
 app = jetforce.JetforceApplication()
 
 
@@ -34,11 +36,32 @@ def _render_footer():
     ]
 
 
+def _render_index_statistics():
+    d = [
+        "",
+        "## Overall",
+        "These figures are reflective of the aggregate size of Geminispace",
+        "when the current index was generated on {}.".format(last_index_statistics["index_modification_time"]),
+        "",
+        "Page Count   : {:>5}".format(last_index_statistics["page_count"]),
+        "Domain Count : {:>5}".format(last_index_statistics["domain_count"]),
+        "",
+        "## By Content Type",
+        "These figures represent the number of pages seen per content type",
+        "when the current index was generated on {}.".format(last_index_statistics["index_modification_time"]),
+        "",
+    ]
+    for pair in last_index_statistics["content_type_frequencies"]:
+        d.append("{:>5} - {}".format(pair[1], pair[0]))
+    return d
+
+
 @app.route("")
 def index(request):
     data = _render_header()
     data.extend([
         "=> /about About GUS",
+        "=> /statistics GUS Statistics",
         "=> gemini://gemini.circumlunar.space Gemini Project information"
     ])
     data.extend(_render_footer())
@@ -93,6 +116,14 @@ def index(request):
     return Response(Status.SUCCESS, "text/gemini", "\n".join(data))
 
 
+@app.route("/statistics")
+def index(request):
+    data = _render_header()
+    data.extend(_render_index_statistics())
+    data.extend(_render_footer())
+    return Response(Status.SUCCESS, "text/gemini", "\n".join(data))
+
+
 def _search_index(query):
     query = MultifieldParser(["content", "url", "prompt"], ix.schema).parse(query)
     results = searcher.search(query)

	geminispace.info gemini search engine
	git clone https://git.clttr.info/geminispace.info.git
	Log (Feed) | Files | Refs (Tags) | README | LICENSE

M	.gitignore	|	1	+
M	README.md	|	5	-----
M	gus/lib/index_statistics.py	|	24	++++++++++++++++++++++++
M	gus/serve.py	|	31	+++++++++++++++++++++++++++++++