commit 484ef909792e8227a33fd54a648cbe36147b6627
parent c1c29b4a7474161c63b7fb7172a6e4c35d0a817b
Author: Natalie Pendragon <natpen@natpen.net>
Date: Mon, 18 May 2020 09:12:31 -0400
[crawl] Use standardized print_index_statistics
Diffstat:
2 files changed, 13 insertions(+), 19 deletions(-)
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -17,7 +17,7 @@ from whoosh.index import create_in, open_dir
from whoosh.query import Every
from whoosh.qparser import QueryParser
-from gus.lib.index_statistics import compute_index_statistics, persist_statistics
+from gus.lib.index_statistics import compute_index_statistics, persist_statistics, print_index_statistics
from gus.lib.whoosh_extensions import UrlAnalyzer
from gus.lib.gemini import GeminiResource
@@ -317,18 +317,7 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
persist_visited_urls(visited_urls)
index_statistics = compute_index_statistics("index")
- # TODO: move this printing to the stats module
- print("Page count: {}".format(index_statistics["page_count"]))
- print("Domain count: {}".format(index_statistics["domain_count"]))
- print("Domains: {}".format(index_statistics["domains"]))
-
- print("Redirect count: {}".format(crawl_statistics["redirect_count"]))
- print("Nontrivial redirect count: {}".format(crawl_statistics["redirect_nontrivial_count"]))
- print("Broken URL count: {}".format(crawl_statistics["broken_url_count"]))
-
- print("\nContent Types:")
- for pair in index_statistics["content_type_frequencies"]:
- print(pair)
+ print_index_statistics(index_statistics, crawl_statistics)
persist_statistics(index_statistics, crawl_statistics, "statistics.csv")
diff --git a/gus/lib/index_statistics.py b/gus/lib/index_statistics.py
@@ -52,15 +52,27 @@ def compute_index_statistics(index_dir):
-def print_index_statistics(index_statistics):
-    print("Index generation date : {:%Y-%m-%d}".format(index_statistics["index_modification_time"]))
-    print("Page Count : {:>10}".format(index_statistics["page_count"]))
-    print("Domain Count : {:>10}".format(index_statistics["domain_count"]))
+def print_index_statistics(index_statistics, crawl_statistics=None):
+    """Print a summary of the index statistics and, if given, the crawl statistics.
+
+    crawl_statistics defaults to None so the previous single-argument call
+    form keeps working; the redirect and broken-URL counts are printed only
+    when it is supplied.
+    """
+    print("Index generation date : {:%Y-%m-%d}".format(index_statistics["index_modification_time"]))
+    print("Page Count : {:>6}".format(index_statistics["page_count"]))
+    print("Domain Count : {:>6}".format(index_statistics["domain_count"]))
+    if crawl_statistics is not None:
+        print("Redirect count : {:>6}".format(crawl_statistics["redirect_count"]))
+        print("Nontrivial redirect count : {:>6}".format(crawl_statistics["redirect_nontrivial_count"]))
+        print("Broken URL count : {:>6}".format(crawl_statistics["broken_url_count"]))
+
+    print("Domains : {}".format(index_statistics["domains"]))
+    # for domain in index_statistics["domains"]:
+    #     print("- {}".format(domain))
+
     print("\nContent Types:")
     for pair in index_statistics["content_type_frequencies"]:
         print("{:>5} - {}".format(pair[1], pair[0]))
-    print("\nDomains:") #.format(index_statistics["domains"]))
-    for domain in index_statistics["domains"]:
-        print("- {}".format(domain))
def run_index_statistics():