geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit 20a5cb896d0eee3d271565ce420f74a598e1f94b
parent d16c11de01f336034c8b4e70333c104b5c0f77a0
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Sat, 16 May 2020 08:00:35 -0400

[crawl] [serve] Add more statistics

Diffstat:
M.gitignore | 1+
Mgus/crawl.py | 26++++++++++++++++++++++++--
Mgus/lib/index_statistics.py | 30++++++++++++++++++++----------
Mgus/serve.py | 18++++++++++--------
4 files changed, 55 insertions(+), 20 deletions(-)

diff --git a/.gitignore b/.gitignore
@@ -132,3 +132,4 @@ dmypy.json
 
 /seed-requests.txt
 /index-statistics.csv
+/statistics.csv
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -15,7 +15,7 @@ from whoosh.index import create_in, open_dir
 from whoosh.query import Every
 from whoosh.qparser import QueryParser
 
-from gus.lib.index_statistics import compute_index_statistics, persist_index_statistics
+from gus.lib.index_statistics import compute_index_statistics, persist_statistics
 from gus.lib.whoosh_extensions import UrlAnalyzer
 from gus.lib.url_helpers import normalize_gemini_url
@@ -244,10 +244,14 @@ def crawl_url(url):
         # problem before getting a response
         print("ERROR : %s" % url)
         print("--------------------------")
+        crawl_statistics["broken_url_count"] += 1
     elif r.status.startswith("3"):
         # redirect status
         print("REDIRECT : %s -> %s" % (url, r.url))
         visited_urls.pop()
+        crawl_statistics["redirect_count"] += 1
+        if is_nontrivial_redirect(url, r.url):
+            crawl_statistics["redirect_nontrivial_count"] += 1
         crawl_url(r.url)
     elif r.status.startswith("1"):
         # input status
@@ -276,6 +280,10 @@ def crawl_url(url):
         print("--------------------------")
 
 
+def is_nontrivial_redirect(url, redirect_url):
+    return url.rstrip() != redirect_url.rstrip()
+
+
 def main():
     create_index(INDEX_DIR)
     global visited_urls
@@ -284,16 +292,30 @@ def main():
     robot_file_map = {}
     global domain_hit_timings
     domain_hit_timings = {}
+    global crawl_statistics
+    crawl_statistics = {
+        # any redirect counts
+        "redirect_count": 0,
+        # more than just adding/removing trailing slash
+        "redirect_nontrivial_count": 0,
+        "broken_url_count": 0,
+    }
     for url in SEED_URLS:
         crawl_url(url)
+
     index_statistics = compute_index_statistics("index")
     print("Page count: {}".format(index_statistics["page_count"]))
     print("Domain count: {}".format(index_statistics["domain_count"]))
     print("Domains: {}".format(index_statistics["domains"]))
+
+    print("Redirect count: {}".format(crawl_statistics["redirect_count"]))
+    print("Nontrivial redirect count: {}".format(crawl_statistics["redirect_nontrivial_count"]))
+    print("Broken URL count: {}".format(crawl_statistics["broken_url_count"]))
+
     print("\nContent Types:")
     for pair in index_statistics["content_type_frequencies"]:
         print(pair)
-    persist_index_statistics(index_statistics, "index-statistics.csv")
+    persist_statistics(index_statistics, crawl_statistics, "statistics.csv")
 
 
 if __name__ == "__main__":
diff --git a/gus/lib/index_statistics.py b/gus/lib/index_statistics.py
@@ -69,40 +69,50 @@ def run_index_statistics():
     # persist_index_statistics(index_statistics, "index-statistics.csv")
 
 
-def persist_index_statistics(index_statistics, filename):
+def persist_statistics(index_statistics, crawl_statistics, filename):
     with open(filename, "a") as f:
-        f.write(serialize_index_statistics_line(index_statistics))
+        f.write(serialize_statistics_line(index_statistics, crawl_statistics))
 
 
-def serialize_index_statistics_line(index_statistics):
-    return "{:%Y-%m-%d},{},{},{},{}\n".format(
+def serialize_statistics_line(index_statistics, crawl_statistics):
+    return "{:%Y-%m-%d},{},{},{},{},{},{},{}\n".format(
         index_statistics["index_modification_time"],
         index_statistics["page_count"],
         index_statistics["domain_count"],
+        crawl_statistics["redirect_count"],
+        crawl_statistics["redirect_nontrivial_count"],
+        crawl_statistics["broken_url_count"],
         "|".join(index_statistics["domains"]),
         "|".join("{}:{}".format(pair[0], pair[1]) for pair in index_statistics["content_type_frequencies"]),
     )
 
 
-def load_last_index_statistics_from_file(filename):
+def load_last_statistics_from_file(filename):
     with open(filename) as f:
         data = f.readlines()
     lastline = data[-1].strip()
-    index_statistics = deserialize_index_statistics_line(lastline)
-    return index_statistics
+    statistics = deserialize_statistics_line(lastline)
+    return statistics
 
 
-def deserialize_index_statistics_line(line):
+def deserialize_statistics_line(line):
     line_parts = line.split(",")
     index_modification_time = line_parts[0]
     page_count = line_parts[1]
     domain_count = line_parts[2]
-    domains = [domain for domain in line_parts[3].split("|")]
-    content_type_frequencies = [(pair.split(":")[0], pair.split(":")[1]) for pair in line_parts[4].split("|")]
+    redirect_count = line_parts[3]
+    redirect_nontrivial_count = line_parts[4]
+    broken_url_count = line_parts[5]
+    domains = [domain for domain in line_parts[6].split("|")]
+    content_type_frequencies = [(pair.split(":")[0], pair.split(":")[1]) for pair in line_parts[7].split("|")]
+
     return {
         "index_modification_time": index_modification_time,
         "page_count": page_count,
        "domain_count": domain_count,
+        "redirect_count": redirect_count,
+        "redirect_nontrivial_count": redirect_nontrivial_count,
+        "broken_url_count": broken_url_count,
        "domains": domains,
        "content_type_frequencies": content_type_frequencies,
    }
diff --git a/gus/serve.py b/gus/serve.py
@@ -13,9 +13,9 @@ from jetforce import Response, Status
 from whoosh.index import open_dir
 from whoosh.qparser import MultifieldParser
 
-from gus.lib.index_statistics import load_last_index_statistics_from_file
+from gus.lib.index_statistics import load_last_statistics_from_file
 
-last_index_statistics = load_last_index_statistics_from_file("index-statistics.csv")
+last_statistics = load_last_statistics_from_file("statistics.csv")
 
 app = jetforce.JetforceApplication()
 
@@ -44,17 +44,19 @@ def _render_index_statistics():
         "",
         "## Overall",
         "",
-        "These figures are representative of the aggregate size of Geminispace at the time the current index was generated on {}.".format(last_index_statistics["index_modification_time"]),
+        "These figures are representative of the aggregate size of Geminispace at the time the current index was generated on {}.".format(last_statistics["index_modification_time"]),
         "",
-        "Page Count : {:>5}".format(last_index_statistics["page_count"]),
-        "Domain Count : {:>5}".format(last_index_statistics["domain_count"]),
+        "Page Count : {:>5}".format(last_statistics["page_count"]),
+        "Domain Count : {:>5}".format(last_statistics["domain_count"]),
+        "Redirect Count : {:>5}".format(last_statistics["redirect_count"]),
+        "Broken URL Count : {:>5}".format(last_statistics["broken_url_count"]),
         "",
         "## By Content Type",
         "",
-        "These figures are representative of the number of pages seen per content type at the time the current index was generated on {}.".format(last_index_statistics["index_modification_time"]),
+        "These figures are representative of the number of pages seen per content type at the time the current index was generated on {}.".format(last_statistics["index_modification_time"]),
         "",
     ]
-    for pair in last_index_statistics["content_type_frequencies"]:
+    for pair in last_statistics["content_type_frequencies"]:
         d.append("{:>5} - {}".format(pair[1], pair[0]))
     return d
 
@@ -67,7 +69,7 @@ def _render_known_hosts():
         "These are the hosts in Geminispace that GUS is aware of. Note that this list is auto-generated from the index, so if your host is not showing up here, it also won't have its content represented in GUS search results! There is a link at the bottom to add your host to the list of seeds for the next crawl of Geminispace though, after which your host should start showing up.",
         "",
     ]
-    for domain in last_index_statistics["domains"]:
+    for domain in last_statistics["domains"]:
        d.append("=> {}".format(domain))
    return d