geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit ffcba3395182761ec36834e566a2fac4c20df0c5
parent 4c2100ff326053ce883d499f83c603a7136be247
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Tue, 12 May 2020 07:23:09 -0400

[statistics] Refactor statistics objects to pass around dicts

Diffstat:
M gus/crawl.py               | 12 ++++++------
M gus/lib/index_statistics.py | 38 +++++++++++++++++++-------------------
2 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/gus/crawl.py b/gus/crawl.py
@@ -263,14 +263,14 @@ def main():
     domain_hit_timings = {}
     for url in SEED_URLS:
         crawl_url(url)
-    page_count, domain_count, content_type_frequencies, domains = compute_index_statistics()
-    print("Page count: {}".format(page_count))
-    print("Domain count: {}".format(domain_count))
-    print("Domains: {}".format(domains))
+    index_statistics = compute_index_statistics("index")
+    print("Page count: {}".format(index_statistics["page_count"]))
+    print("Domain count: {}".format(index_statistics["domain_count"]))
+    print("Domains: {}".format(index_statistics["domains"]))
     print("\nContent Types:")
-    for pair in content_type_frequencies:
+    for pair in index_statistics["content_type_frequencies"]:
         print(pair)
-    persist_index_statistics(page_count, domain_count, content_type_frequencies, domains)
+    persist_index_statistics(index_statistics, "index-statistics.csv")
 
 
 if __name__ == "__main__":
diff --git a/gus/lib/index_statistics.py b/gus/lib/index_statistics.py
@@ -8,8 +8,8 @@ from whoosh.query import Every
 from gus.lib.url_helpers import normalize_gemini_url
 
 
-def compute_index_statistics():
-    ix = open_dir("index")
+def compute_index_statistics(index_dir):
+    ix = open_dir(index_dir)
     content_types = None
     with ix.reader() as reader:
         # all_terms = reader.most_frequent_terms("content_type", number=999)
@@ -36,28 +36,28 @@ def compute_index_statistics():
         _, domain = normalize_gemini_url(result["url"])
         domains.add(domain)
     domain_count = len(domains)
-    return page_count, domain_count, content_type_frequencies, domains
+    index_modification_time = datetime.fromtimestamp(os.path.getmtime(index_dir))
+    return {
+        "index_modification_time": index_modification_time,
+        "page_count": page_count,
+        "domain_count": domain_count,
+        "content_type_frequencies": content_type_frequencies,
+        "domains": domains,
+    }
 
 
-def persist_index_statistics(page_count, domain_count, content_type_frequencies, domains):
-    index_modification_time = datetime.fromtimestamp(os.path.getmtime("index"))
-    with open("index-statistics.csv", "a") as index_statistics_file:
-        index_statistics_file.write(serialize_index_statistics_line(
-            index_modification_time,
-            page_count,
-            domain_count,
-            domains,
-            content_type_frequencies,
-        ))
+def persist_index_statistics(index_statistics, filename):
+    with open(filename, "a") as f:
+        f.write(serialize_index_statistics_line(index_statistics))
 
 
-def serialize_index_statistics_line(index_modification_time, page_count, domain_count, content_type_frequencies, domains):
+def serialize_index_statistics_line(index_statistics):
     return "{:%Y-%m-%d},{},{},{},{}\n".format(
-        index_modification_time,
-        page_count,
-        domain_count,
-        "|".join(domains),
-        "|".join("{}:{}".format(pair[0], pair[1]) for pair in content_type_frequencies),
+        index_statistics["index_modification_time"],
+        index_statistics["page_count"],
+        index_statistics["domain_count"],
+        "|".join(index_statistics["domains"]),
+        "|".join("{}:{}".format(pair[0], pair[1]) for pair in index_statistics["content_type_frequencies"]),
     )