geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit 484ef909792e8227a33fd54a648cbe36147b6627
parent c1c29b4a7474161c63b7fb7172a6e4c35d0a817b
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Mon, 18 May 2020 09:12:31 -0400

[crawl] Use standardized print_index_statistics

Diffstat:
M gus/crawl.py                | 15 ++-------------
M gus/lib/index_statistics.py | 17 +++++++++++------
2 files changed, 13 insertions(+), 19 deletions(-)

diff --git a/gus/crawl.py b/gus/crawl.py
@@ -17,7 +17,7 @@ from whoosh.index import create_in, open_dir
 from whoosh.query import Every
 from whoosh.qparser import QueryParser
-from gus.lib.index_statistics import compute_index_statistics, persist_statistics
+from gus.lib.index_statistics import compute_index_statistics, persist_statistics, print_index_statistics
 from gus.lib.whoosh_extensions import UrlAnalyzer
 from gus.lib.gemini import GeminiResource
@@ -317,18 +317,7 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
     persist_visited_urls(visited_urls)
     index_statistics = compute_index_statistics("index")
-    # TODO: move this printing to the stats module
-    print("Page count: {}".format(index_statistics["page_count"]))
-    print("Domain count: {}".format(index_statistics["domain_count"]))
-    print("Domains: {}".format(index_statistics["domains"]))
-
-    print("Redirect count: {}".format(crawl_statistics["redirect_count"]))
-    print("Nontrivial redirect count: {}".format(crawl_statistics["redirect_nontrivial_count"]))
-    print("Broken URL count: {}".format(crawl_statistics["broken_url_count"]))
-
-    print("\nContent Types:")
-    for pair in index_statistics["content_type_frequencies"]:
-        print(pair)
+    print_index_statistics(index_statistics, crawl_statistics)
     persist_statistics(index_statistics, crawl_statistics, "statistics.csv")
diff --git a/gus/lib/index_statistics.py b/gus/lib/index_statistics.py
@@ -52,15 +52,20 @@ def compute_index_statistics(index_dir):
 def print_index_statistics(index_statistics):
-    print("Index generation date : {:%Y-%m-%d}".format(index_statistics["index_modification_time"]))
-    print("Page Count : {:>10}".format(index_statistics["page_count"]))
-    print("Domain Count : {:>10}".format(index_statistics["domain_count"]))
+    print("Index generation date : {:%Y-%m-%d}".format(index_statistics["index_modification_time"]))
+    print("Page Count : {:>6}".format(index_statistics["page_count"]))
+    print("Domain Count : {:>6}".format(index_statistics["domain_count"]))
+    print("Redirect count : {:>6}".format(crawl_statistics["redirect_count"]))
+    print("Nontrivial redirect count : {:>6}".format(crawl_statistics["redirect_nontrivial_count"]))
+    print("Broken URL count : {:>6}".format(crawl_statistics["broken_url_count"]))
+
+    print("Domains : {}".format(index_statistics["domains"]))
+    # for domain in index_statistics["domains"]:
+    #     print("- {}".format(domain))
+
     print("\nContent Types:")
     for pair in index_statistics["content_type_frequencies"]:
         print("{:>5} - {}".format(pair[1], pair[0]))
-    print("\nDomains:") #.format(index_statistics["domains"]))
-    for domain in index_statistics["domains"]:
-        print("- {}".format(domain))
 def run_index_statistics():