geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit 20a5cb896d0eee3d271565ce420f74a598e1f94b
parent d16c11de01f336034c8b4e70333c104b5c0f77a0
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Sat, 16 May 2020 08:00:35 -0400

[crawl] [serve] Add more statistics

Diffstat:
M.gitignore | 1+
Mgus/crawl.py | 26++++++++++++++++++++++++--
Mgus/lib/index_statistics.py | 30++++++++++++++++++++----------
Mgus/serve.py | 18++++++++++--------
4 files changed, 55 insertions(+), 20 deletions(-)

diff --git a/.gitignore b/.gitignore
@@ -132,3 +132,4 @@ dmypy.json
 
 /seed-requests.txt
 /index-statistics.csv
+/statistics.csv
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -15,7 +15,7 @@ from whoosh.index import create_in, open_dir
 from whoosh.query import Every
 from whoosh.qparser import QueryParser
 
-from gus.lib.index_statistics import compute_index_statistics, persist_index_statistics
+from gus.lib.index_statistics import compute_index_statistics, persist_statistics
 from gus.lib.whoosh_extensions import UrlAnalyzer
 from gus.lib.url_helpers import normalize_gemini_url
@@ -244,10 +244,14 @@ def crawl_url(url):
         # problem before getting a response
         print("ERROR : %s" % url)
         print("--------------------------")
+        crawl_statistics["broken_url_count"] += 1
     elif r.status.startswith("3"):
         # redirect status
         print("REDIRECT : %s -> %s" % (url, r.url))
         visited_urls.pop()
+        crawl_statistics["redirect_count"] += 1
+        if is_nontrivial_redirect(url, r.url):
+            crawl_statistics["redirect_nontrivial_count"] += 1
         crawl_url(r.url)
     elif r.status.startswith("1"):
         # input status
@@ -276,6 +280,10 @@ def crawl_url(url):
         print("--------------------------")
 
 
+def is_nontrivial_redirect(url, redirect_url):
+    return url.rstrip() != redirect_url.rstrip()
+
+
 def main():
     create_index(INDEX_DIR)
     global visited_urls
@@ -284,16 +292,30 @@ def main():
     robot_file_map = {}
     global domain_hit_timings
     domain_hit_timings = {}
+    global crawl_statistics
+    crawl_statistics = {
+        # any redirect counts
+        "redirect_count": 0,
+        # more than just adding/removing trailing slash
+        "redirect_nontrivial_count": 0,
+        "broken_url_count": 0,
+    }
     for url in SEED_URLS:
         crawl_url(url)
+
     index_statistics = compute_index_statistics("index")
     print("Page count: {}".format(index_statistics["page_count"]))
     print("Domain count: {}".format(index_statistics["domain_count"]))
     print("Domains: {}".format(index_statistics["domains"]))
+
+    print("Redirect count: {}".format(crawl_statistics["redirect_count"]))
+    print("Nontrivial redirect count: {}".format(crawl_statistics["redirect_nontrivial_count"]))
+    print("Broken URL count: {}".format(crawl_statistics["broken_url_count"]))
+
     print("\nContent Types:")
     for pair in index_statistics["content_type_frequencies"]:
         print(pair)
-    persist_index_statistics(index_statistics, "index-statistics.csv")
+    persist_statistics(index_statistics, crawl_statistics, "statistics.csv")
 
 
 if __name__ == "__main__":
diff --git a/gus/lib/index_statistics.py b/gus/lib/index_statistics.py
@@ -69,40 +69,50 @@ def run_index_statistics():
     # persist_index_statistics(index_statistics, "index-statistics.csv")
 
 
-def persist_index_statistics(index_statistics, filename):
+def persist_statistics(index_statistics, crawl_statistics, filename):
     with open(filename, "a") as f:
-        f.write(serialize_index_statistics_line(index_statistics))
+        f.write(serialize_statistics_line(index_statistics, crawl_statistics))
 
 
-def serialize_index_statistics_line(index_statistics):
-    return "{:%Y-%m-%d},{},{},{},{}\n".format(
+def serialize_statistics_line(index_statistics, crawl_statistics):
+    return "{:%Y-%m-%d},{},{},{},{},{},{},{}\n".format(
         index_statistics["index_modification_time"],
         index_statistics["page_count"],
         index_statistics["domain_count"],
+        crawl_statistics["redirect_count"],
+        crawl_statistics["redirect_nontrivial_count"],
+        crawl_statistics["broken_url_count"],
         "|".join(index_statistics["domains"]),
         "|".join("{}:{}".format(pair[0], pair[1]) for pair in index_statistics["content_type_frequencies"]),
     )
 
 
-def load_last_index_statistics_from_file(filename):
+def load_last_statistics_from_file(filename):
     with open(filename) as f:
         data = f.readlines()
     lastline = data[-1].strip()
-    index_statistics = deserialize_index_statistics_line(lastline)
-    return index_statistics
+    statistics = deserialize_statistics_line(lastline)
+    return statistics
 
 
-def deserialize_index_statistics_line(line):
+def deserialize_statistics_line(line):
     line_parts = line.split(",")
     index_modification_time = line_parts[0]
     page_count = line_parts[1]
     domain_count = line_parts[2]
-    domains = [domain for domain in line_parts[3].split("|")]
-    content_type_frequencies = [(pair.split(":")[0], pair.split(":")[1]) for pair in line_parts[4].split("|")]
+    redirect_count = line_parts[3]
+    redirect_nontrivial_count = line_parts[4]
+    broken_url_count = line_parts[5]
+    domains = [domain for domain in line_parts[6].split("|")]
+    content_type_frequencies = [(pair.split(":")[0], pair.split(":")[1]) for pair in line_parts[7].split("|")]
+
     return {
         "index_modification_time": index_modification_time,
         "page_count": page_count,
        "domain_count": domain_count,
+        "redirect_count": redirect_count,
+        "redirect_nontrivial_count": redirect_nontrivial_count,
+        "broken_url_count": broken_url_count,
        "domains": domains,
        "content_type_frequencies": content_type_frequencies,
    }
diff --git a/gus/serve.py b/gus/serve.py
@@ -13,9 +13,9 @@ from jetforce import Response, Status
 from whoosh.index import open_dir
 from whoosh.qparser import MultifieldParser
 
-from gus.lib.index_statistics import load_last_index_statistics_from_file
+from gus.lib.index_statistics import load_last_statistics_from_file
 
-last_index_statistics = load_last_index_statistics_from_file("index-statistics.csv")
+last_statistics = load_last_statistics_from_file("statistics.csv")
 
 app = jetforce.JetforceApplication()
 
@@ -44,17 +44,19 @@ def _render_index_statistics():
         "",
         "## Overall",
         "",
-        "These figures are representative of the aggregate size of Geminispace at the time the current index was generated on {}.".format(last_index_statistics["index_modification_time"]),
+        "These figures are representative of the aggregate size of Geminispace at the time the current index was generated on {}.".format(last_statistics["index_modification_time"]),
         "",
-        "Page Count : {:>5}".format(last_index_statistics["page_count"]),
-        "Domain Count : {:>5}".format(last_index_statistics["domain_count"]),
+        "Page Count : {:>5}".format(last_statistics["page_count"]),
+        "Domain Count : {:>5}".format(last_statistics["domain_count"]),
+        "Redirect Count : {:>5}".format(last_statistics["redirect_count"]),
+        "Broken URL Count : {:>5}".format(last_statistics["broken_url_count"]),
         "",
         "## By Content Type",
         "",
-        "These figures are representative of the number of pages seen per content type at the time the current index was generated on {}.".format(last_index_statistics["index_modification_time"]),
+        "These figures are representative of the number of pages seen per content type at the time the current index was generated on {}.".format(last_statistics["index_modification_time"]),
         "",
     ]
-    for pair in last_index_statistics["content_type_frequencies"]:
+    for pair in last_statistics["content_type_frequencies"]:
         d.append("{:>5} - {}".format(pair[1], pair[0]))
     return d
 
@@ -67,7 +69,7 @@ def _render_known_hosts():
         "These are the hosts in Geminispace that GUS is aware of. Note that this list is auto-generated from the index, so if your host is not showing up here, it also won't have its content represented in GUS search results! There is a link at the bottom to add your host to the list of seeds for the next crawl of Geminispace though, after which your host should start showing up.",
         "",
     ]
-    for domain in last_index_statistics["domains"]:
+    for domain in last_statistics["domains"]:
        d.append("=> {}".format(domain))
    return d