geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git

commit bfcfec84e04a54c1c72df6858512ad6a238ec2d0
parent 682feb199142c56576c21692108828efbbc43879
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Wed,  3 Jun 2020 10:58:45 -0400

[crawl] Persist index & crawl statistics on non-destructive crawls

Also, add a flag to track which serialized statistics lines originated
from incremental crawls.

Diffstat:
M gus/crawl.py | 2 +-
M gus/lib/index_statistics.py | 24 +++++++++++++-----------
2 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/gus/crawl.py b/gus/crawl.py
@@ -393,8 +393,8 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
 
     index_statistics = compute_index_statistics(index_dir)
     print_index_statistics(index_statistics, crawl_statistics)
+    persist_statistics(index_statistics, crawl_statistics, should_run_destructive, "statistics.csv")
     if should_run_destructive:
-        persist_statistics(index_statistics, crawl_statistics, "statistics.csv")
         # replace current index with new index
         shutil.rmtree(INDEX_DIR_CURRENT, ignore_errors=True)
         shutil.move(INDEX_DIR_NEW, INDEX_DIR_CURRENT)
diff --git a/gus/lib/index_statistics.py b/gus/lib/index_statistics.py
@@ -74,14 +74,15 @@ def run_index_statistics():
     # persist_index_statistics(index_statistics, "index-statistics.csv")
 
 
-def persist_statistics(index_statistics, crawl_statistics, filename):
+def persist_statistics(index_statistics, crawl_statistics, was_destructive, filename):
     with open(filename, "a") as f:
-        f.write(serialize_statistics_line(index_statistics, crawl_statistics))
+        f.write(serialize_statistics_line(index_statistics, crawl_statistics, was_destructive))
 
 
-def serialize_statistics_line(index_statistics, crawl_statistics):
-    return "{:%Y-%m-%d},{},{},{},{},{},{},{}\n".format(
+def serialize_statistics_line(index_statistics, crawl_statistics, was_destructive):
+    return "{:%Y-%m-%d},{},{},{},{},{},{},{},{}\n".format(
         index_statistics["index_modification_time"],
+        was_destructive,
         index_statistics["page_count"],
         index_statistics["domain_count"],
         crawl_statistics["redirect_count"],
@@ -103,13 +104,14 @@ def load_last_statistics_from_file(filename):
 def deserialize_statistics_line(line):
     line_parts = line.split(",")
     index_modification_time = line_parts[0]
-    page_count = line_parts[1]
-    domain_count = line_parts[2]
-    redirect_count = line_parts[3]
-    redirect_nontrivial_count = line_parts[4]
-    broken_url_count = line_parts[5]
-    domains = [domain for domain in line_parts[6].split("|")]
-    content_type_frequencies = [(pair.split(":")[0], pair.split(":")[1]) for pair in line_parts[7].split("|")]
+    # discard line_parts[1], which is `was_destructive`
+    page_count = line_parts[2]
+    domain_count = line_parts[3]
+    redirect_count = line_parts[4]
+    redirect_nontrivial_count = line_parts[5]
+    broken_url_count = line_parts[6]
+    domains = [domain for domain in line_parts[7].split("|")]
+    content_type_frequencies = [(pair.split(":")[0], pair.split(":")[1]) for pair in line_parts[8].split("|")]
     return {
         "index_modification_time": index_modification_time,
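
For illustration, a minimal sketch of the statistics line this change produces, using made-up values. The "|"-joined encodings of domains and content types are inferred from deserialize_statistics_line above, and which dict supplies broken_url_count is an assumption.

from datetime import datetime

# Hypothetical values for illustration only; real ones come from
# compute_index_statistics() and the crawl loop in gus/crawl.py.
index_statistics = {
    "index_modification_time": datetime(2020, 6, 3),
    "page_count": 12345,
    "domain_count": 321,
    "broken_url_count": 42,
    "domains": ["gemini.circumlunar.space", "gus.guru"],
    "content_type_frequencies": [("text/gemini", 11000), ("text/plain", 1345)],
}
crawl_statistics = {"redirect_count": 7, "redirect_nontrivial_count": 3}
was_destructive = False  # an incremental (non-destructive) crawl

# Mirrors serialize_statistics_line() after this commit: the new
# was_destructive flag becomes the second CSV field, right after the date.
line = "{:%Y-%m-%d},{},{},{},{},{},{},{},{}\n".format(
    index_statistics["index_modification_time"],
    was_destructive,
    index_statistics["page_count"],
    index_statistics["domain_count"],
    crawl_statistics["redirect_count"],
    crawl_statistics["redirect_nontrivial_count"],
    index_statistics["broken_url_count"],
    "|".join(index_statistics["domains"]),
    "|".join("{}:{}".format(ct, n) for ct, n in index_statistics["content_type_frequencies"]),
)
print(line, end="")
# 2020-06-03,False,12345,321,7,3,42,gemini.circumlunar.space|gus.guru,text/gemini:11000|text/plain:1345

# Reading it back mirrors deserialize_statistics_line(): the flag at
# line_parts[1] is skipped, so every later field shifts up by one index.
parts = line.strip().split(",")
assert parts[1] == "False"   # the new was_destructive field
assert parts[2] == "12345"   # page_count, previously at index 1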