commit ffcba3395182761ec36834e566a2fac4c20df0c5
parent 4c2100ff326053ce883d499f83c603a7136be247
Author: Natalie Pendragon <natpen@natpen.net>
Date: Tue, 12 May 2020 07:23:09 -0400
[statistics] Refactor statistics objects to pass around dicts
Diffstat:
2 files changed, 25 insertions(+), 25 deletions(-)
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -263,14 +263,14 @@ def main():
domain_hit_timings = {}
for url in SEED_URLS:
crawl_url(url)
- page_count, domain_count, content_type_frequencies, domains = compute_index_statistics()
- print("Page count: {}".format(page_count))
- print("Domain count: {}".format(domain_count))
- print("Domains: {}".format(domains))
+ index_statistics = compute_index_statistics("index")
+ print("Page count: {}".format(index_statistics["page_count"]))
+ print("Domain count: {}".format(index_statistics["domain_count"]))
+ print("Domains: {}".format(index_statistics["domains"]))
print("\nContent Types:")
- for pair in content_type_frequencies:
+ for pair in index_statistics["content_type_frequencies"]:
print(pair)
- persist_index_statistics(page_count, domain_count, content_type_frequencies, domains)
+ persist_index_statistics(index_statistics, "index-statistics.csv")
if __name__ == "__main__":
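For contrast, a minimal sketch (not part of this patch) of the call shape before and after the refactor; it assumes the gus package is importable and a whoosh index directory named "index" already exists:

    from gus.lib.index_statistics import compute_index_statistics, persist_index_statistics

    # Before this commit, four values travelled together through every call:
    #   page_count, domain_count, content_type_frequencies, domains = compute_index_statistics()
    #   persist_index_statistics(page_count, domain_count, content_type_frequencies, domains)

    # After it, a single dict is passed around, and the index directory and
    # output filename become explicit arguments.
    index_statistics = compute_index_statistics("index")
    persist_index_statistics(index_statistics, "index-statistics.csv")
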
diff --git a/gus/lib/index_statistics.py b/gus/lib/index_statistics.py
@@ -8,8 +8,8 @@ from whoosh.query import Every
from gus.lib.url_helpers import normalize_gemini_url
-def compute_index_statistics():
- ix = open_dir("index")
+def compute_index_statistics(index_dir):
+ ix = open_dir(index_dir)
content_types = None
with ix.reader() as reader:
# all_terms = reader.most_frequent_terms("content_type", number=999)
@@ -36,28 +36,28 @@ def compute_index_statistics():
_, domain = normalize_gemini_url(result["url"])
domains.add(domain)
domain_count = len(domains)
- return page_count, domain_count, content_type_frequencies, domains
+ index_modification_time = datetime.fromtimestamp(os.path.getmtime(index_dir))
+ return {
+ "index_modification_time": index_modification_time,
+ "page_count": page_count,
+ "domain_count": domain_count,
+ "content_type_frequencies": content_type_frequencies,
+ "domains": domains,
+ }
-def persist_index_statistics(page_count, domain_count, content_type_frequencies, domains):
- index_modification_time = datetime.fromtimestamp(os.path.getmtime("index"))
- with open("index-statistics.csv", "a") as index_statistics_file:
- index_statistics_file.write(serialize_index_statistics_line(
- index_modification_time,
- page_count,
- domain_count,
- domains,
- content_type_frequencies,
- ))
+def persist_index_statistics(index_statistics, filename):
+ with open(filename, "a") as f:
+ f.write(serialize_index_statistics_line(index_statistics))
-def serialize_index_statistics_line(index_modification_time, page_count, domain_count, content_type_frequencies, domains):
+def serialize_index_statistics_line(index_statistics):
return "{:%Y-%m-%d},{},{},{},{}\n".format(
- index_modification_time,
- page_count,
- domain_count,
- "|".join(domains),
- "|".join("{}:{}".format(pair[0], pair[1]) for pair in content_type_frequencies),
+ index_statistics["index_modification_time"],
+ index_statistics["page_count"],
+ index_statistics["domain_count"],
+ "|".join(index_statistics["domains"]),
+ "|".join("{}:{}".format(pair[0], pair[1]) for pair in index_statistics["content_type_frequencies"]),
)
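
A rough illustration of the dict shape compute_index_statistics() now returns and the CSV line serialize_index_statistics_line() would produce for it; all values below are made up for the example:

    from datetime import datetime

    # Hypothetical statistics, shaped like the dict returned by compute_index_statistics().
    index_statistics = {
        "index_modification_time": datetime(2020, 5, 12, 7, 0),
        "page_count": 1000,
        "domain_count": 2,
        "content_type_frequencies": [("text/gemini", 900), ("text/plain", 100)],
        "domains": {"example.org", "example.net"},
    }

    # Same format string as serialize_index_statistics_line() in the patch.
    line = "{:%Y-%m-%d},{},{},{},{}\n".format(
        index_statistics["index_modification_time"],
        index_statistics["page_count"],
        index_statistics["domain_count"],
        "|".join(index_statistics["domains"]),
        "|".join("{}:{}".format(pair[0], pair[1])
                 for pair in index_statistics["content_type_frequencies"]),
    )
    # e.g. 2020-05-12,1000,2,example.org|example.net,text/gemini:900|text/plain:100
    # (domain order may vary, since "domains" is a set)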