geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git

commit 586127b04bf8d0b70d8714f21d6be8a571ec01f4
parent c7ab03d8b5cd5bd98d0844c6a1c3a103c0a33809
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Sat,  9 May 2020 17:34:52 -0400

[crawl] Compute and generate index statistics after each crawl

Diffstat:
M gus/crawl.py | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 57 insertions(+), 1 deletion(-)

diff --git a/gus/crawl.py b/gus/crawl.py
@@ -1,4 +1,5 @@
 from datetime import datetime, timedelta
+import os
 import pathlib
 import re
 import shutil
@@ -10,7 +11,9 @@ import gusmobile as gemini
 from whoosh.analysis import FancyAnalyzer
 from whoosh.fields import Schema, TEXT
 from whoosh.filedb.filestore import FileStorage
-from whoosh.index import create_in
+from whoosh.index import create_in, open_dir
+from whoosh.query import Every
+from whoosh.qparser import QueryParser
 
 from gus.whoosh_extensions import UrlAnalyzer
 
@@ -243,6 +246,51 @@ def crawl_url(url):
     print("--------------------------")
 
 
+def get_index_statistics():
+    ix = open_dir("index")
+    content_types = None
+    with ix.reader() as reader:
+        # all_terms = reader.most_frequent_terms("content_type", number=999)
+        all_terms = reader.all_stored_fields()
+        content_types = set([term["content_type"] for term in all_terms])
+    content_type_frequencies = []
+    page_count = 0
+    page_count = 0
+    with ix.searcher() as searcher:
+        page_count = searcher.doc_count()
+
+        # content type frequencies
+        parser = QueryParser("content_type", schema=ix.schema)
+        for content_type in content_types:
+            query = parser.parse(content_type)
+            results = searcher.search(query, limit=9999999)
+            content_type_frequencies.append((content_type, len(results)))
+        content_type_frequencies.sort(key=lambda pair: pair[1], reverse=True)
+
+        query = Every("url")
+        results = searcher.search(query, limit=9999999)
+        domains = set()
+        for result in results:
+            _, domain = normalize_gemini_url(result["url"])
+            domains.add(domain)
+        domain_count = len(domains)
+    return page_count, domain_count, content_type_frequencies, domains
+
+
+def persist_index_statistics(page_count, domain_count, content_type_frequencies, domains):
+    index_modification_time = datetime.fromtimestamp(os.path.getmtime("index"))
+    with open("index-statistics.csv", "a") as index_statistics_file:
+        index_statistics_file.write(
+            "{:%Y-%m-%d},{},{},{},{}\n".format(
+                index_modification_time,
+                page_count,
+                domain_count,
+                "|".join(domains),
+                "|".join("{}:{}".format(pair[0], pair[1]) for pair in content_type_frequencies),
+            )
+        )
+
+
 def main():
     create_index(INDEX_DIR)
     global visited_urls
@@ -253,6 +301,14 @@ def main():
     domain_hit_timings = {}
     for url in SEED_URLS:
         crawl_url(url)
+    page_count, domain_count, content_type_frequencies, domains = get_index_statistics()
+    print("Page count: {}".format(page_count))
+    print("Domain count: {}".format(domain_count))
+    print("Domains: {}".format(domains))
+    print("\nContent Types:")
+    for pair in content_type_frequencies:
+        print(pair)
+    persist_index_statistics(page_count, domain_count, content_type_frequencies, domains)
 
 
 if __name__ == "__main__":
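
Side note (not part of the commit): with this change, persist_index_statistics appends one row per crawl to index-statistics.csv with the columns date, page count, domain count, a |-separated domain list, and |-separated content_type:count pairs. Below is a minimal sketch of reading those rows back for inspection; it assumes that column layout and that the file sits in the crawler's working directory, and it is only an illustration, not code from this repository.

import csv

# Each row (as written by persist_index_statistics):
#   date, page_count, domain_count, "dom1|dom2|...", "type1:n1|type2:n2|..."
with open("index-statistics.csv", newline="") as f:
    for date, page_count, domain_count, domains, content_types in csv.reader(f):
        print("{}: {} pages across {} domains".format(date, page_count, domain_count))
        # Split the |-joined content-type frequencies back into pairs
        for entry in content_types.split("|"):
            content_type, count = entry.rsplit(":", 1)
            print("  {}: {}".format(content_type, count))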