commit 586127b04bf8d0b70d8714f21d6be8a571ec01f4
parent c7ab03d8b5cd5bd98d0844c6a1c3a103c0a33809
Author: Natalie Pendragon <natpen@natpen.net>
Date: Sat, 9 May 2020 17:34:52 -0400
[crawl] Compute and generate index statistics after each crawl
Diffstat:
M gus/crawl.py | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 57 insertions(+), 1 deletion(-)
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -1,4 +1,5 @@
from datetime import datetime, timedelta
+import os
import pathlib
import re
import shutil
@@ -10,7 +11,9 @@ import gusmobile as gemini
from whoosh.analysis import FancyAnalyzer
from whoosh.fields import Schema, TEXT
from whoosh.filedb.filestore import FileStorage
-from whoosh.index import create_in
+from whoosh.index import create_in, open_dir
+from whoosh.query import Every
+from whoosh.qparser import QueryParser
from gus.whoosh_extensions import UrlAnalyzer
@@ -243,6 +246,51 @@ def crawl_url(url):
print("--------------------------")
+def get_index_statistics():
+    ix = open_dir("index")
+    content_types = None
+    with ix.reader() as reader:
+        # all_terms = reader.most_frequent_terms("content_type", number=999)
+        all_terms = reader.all_stored_fields()
+        content_types = set([term["content_type"] for term in all_terms])
+
+    content_type_frequencies = []
+    page_count = 0
+    with ix.searcher() as searcher:
+        page_count = searcher.doc_count()
+
+        # content type frequencies
+        parser = QueryParser("content_type", schema=ix.schema)
+        for content_type in content_types:
+            query = parser.parse(content_type)
+            results = searcher.search(query, limit=9999999)
+            content_type_frequencies.append((content_type, len(results)))
+        content_type_frequencies.sort(key=lambda pair: pair[1], reverse=True)
+
+        query = Every("url")
+        results = searcher.search(query, limit=9999999)
+        domains = set()
+        for result in results:
+            _, domain = normalize_gemini_url(result["url"])
+            domains.add(domain)
+        domain_count = len(domains)
+    return page_count, domain_count, content_type_frequencies, domains
+
+
+def persist_index_statistics(page_count, domain_count, content_type_frequencies, domains):
+    index_modification_time = datetime.fromtimestamp(os.path.getmtime("index"))
+    with open("index-statistics.csv", "a") as index_statistics_file:
+        index_statistics_file.write(
+            "{:%Y-%m-%d},{},{},{},{}\n".format(
+                index_modification_time,
+                page_count,
+                domain_count,
+                "|".join(domains),
+                "|".join("{}:{}".format(pair[0], pair[1]) for pair in content_type_frequencies),
+            )
+        )
+
+
+
def main():
    create_index(INDEX_DIR)
    global visited_urls
@@ -253,6 +301,14 @@ def main():
    domain_hit_timings = {}
    for url in SEED_URLS:
        crawl_url(url)
+    page_count, domain_count, content_type_frequencies, domains = get_index_statistics()
+    print("Page count: {}".format(page_count))
+    print("Domain count: {}".format(domain_count))
+    print("Domains: {}".format(domains))
+    print("\nContent Types:")
+    for pair in content_type_frequencies:
+        print(pair)
+    persist_index_statistics(page_count, domain_count, content_type_frequencies, domains)
if __name__ == "__main__":
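
Note: each row that persist_index_statistics() appends to index-statistics.csv has the
columns date, page count, domain count, "|"-joined domains, and "|"-joined
content_type:count pairs, in that order. The snippet below is an illustrative sketch
(not part of this commit) of reading those rows back; the variable names are ours and
it only assumes index-statistics.csv sits in the working directory, as above.

import csv

with open("index-statistics.csv", newline="") as f:
    for date, pages, domain_count, domains, type_freqs in csv.reader(f):
        # domains and type_freqs are "|"-joined, mirroring the writer above
        domain_list = domains.split("|")
        content_types = dict(
            pair.rsplit(":", 1) for pair in type_freqs.split("|")
        )
        print(date, pages, domain_count, len(domain_list), content_types)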