commit bfcfec84e04a54c1c72df6858512ad6a238ec2d0
parent 682feb199142c56576c21692108828efbbc43879
Author: Natalie Pendragon <natpen@natpen.net>
Date: Wed, 3 Jun 2020 10:58:45 -0400

[crawl] Persist index & crawl statistics on non-destructive crawls

Also, add a flag to track which serialized statistics lines originated
from incremental crawls.
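
For illustration, a serialized statistics line in the new format would
look roughly like this (the values are made up; the new was_destructive
flag lands in the second column):

  2020-06-03,False,1000,50,10,5,2,example.org|example.net,text/gemini:900|text/plain:100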

Diffstat:
 2 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/gus/crawl.py b/gus/crawl.py
@@ -393,8 +393,8 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
 
     index_statistics = compute_index_statistics(index_dir)
     print_index_statistics(index_statistics, crawl_statistics)
+    persist_statistics(index_statistics, crawl_statistics, should_run_destructive, "statistics.csv")
     if should_run_destructive:
-        persist_statistics(index_statistics, crawl_statistics, "statistics.csv")
         # replace current index with new index
         shutil.rmtree(INDEX_DIR_CURRENT, ignore_errors=True)
         shutil.move(INDEX_DIR_NEW, INDEX_DIR_CURRENT)
diff --git a/gus/lib/index_statistics.py b/gus/lib/index_statistics.py
@@ -74,14 +74,15 @@ def run_index_statistics():
     # persist_index_statistics(index_statistics, "index-statistics.csv")
 
 
-def persist_statistics(index_statistics, crawl_statistics, filename):
+def persist_statistics(index_statistics, crawl_statistics, was_destructive, filename):
     with open(filename, "a") as f:
-        f.write(serialize_statistics_line(index_statistics, crawl_statistics))
+        f.write(serialize_statistics_line(index_statistics, crawl_statistics, was_destructive))
 
 
-def serialize_statistics_line(index_statistics, crawl_statistics):
-    return "{:%Y-%m-%d},{},{},{},{},{},{},{}\n".format(
+def serialize_statistics_line(index_statistics, crawl_statistics, was_destructive):
+    return "{:%Y-%m-%d},{},{},{},{},{},{},{},{}\n".format(
         index_statistics["index_modification_time"],
+        was_destructive,
         index_statistics["page_count"],
         index_statistics["domain_count"],
         crawl_statistics["redirect_count"],
@@ -103,13 +104,14 @@ def load_last_statistics_from_file(filename):
 def deserialize_statistics_line(line):
     line_parts = line.split(",")
     index_modification_time = line_parts[0]
-    page_count = line_parts[1]
-    domain_count = line_parts[2]
-    redirect_count = line_parts[3]
-    redirect_nontrivial_count = line_parts[4]
-    broken_url_count = line_parts[5]
-    domains = [domain for domain in line_parts[6].split("|")]
-    content_type_frequencies = [(pair.split(":")[0], pair.split(":")[1]) for pair in line_parts[7].split("|")]
+    # discard line_parts[1], which is `was_destructive`
+    page_count = line_parts[2]
+    domain_count = line_parts[3]
+    redirect_count = line_parts[4]
+    redirect_nontrivial_count = line_parts[5]
+    broken_url_count = line_parts[6]
+    domains = [domain for domain in line_parts[7].split("|")]
+    content_type_frequencies = [(pair.split(":")[0], pair.split(":")[1]) for pair in line_parts[8].split("|")]
     return {
         "index_modification_time": index_modification_time,