commit bfcfec84e04a54c1c72df6858512ad6a238ec2d0
parent 682feb199142c56576c21692108828efbbc43879
Author: Natalie Pendragon <natpen@natpen.net>
Date: Wed, 3 Jun 2020 10:58:45 -0400

[crawl] Persist index & crawl statistics on non-destructive crawls

Also, add a flag to track which serialized statistics lines originated
from incremental crawls.
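
For illustration, a serialized statistics line in the new format would
look roughly like this (the values are made up; the new was_destructive
flag lands in the second column):

  2020-06-03,False,1000,50,10,5,2,example.org|example.net,text/gemini:900|text/plain:100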

Diffstat:
 2 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/gus/crawl.py b/gus/crawl.py
@@ -393,8 +393,8 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
 
     index_statistics = compute_index_statistics(index_dir)
     print_index_statistics(index_statistics, crawl_statistics)
+    persist_statistics(index_statistics, crawl_statistics, should_run_destructive, "statistics.csv")
     if should_run_destructive:
-        persist_statistics(index_statistics, crawl_statistics, "statistics.csv")
         # replace current index with new index
         shutil.rmtree(INDEX_DIR_CURRENT, ignore_errors=True)
         shutil.move(INDEX_DIR_NEW, INDEX_DIR_CURRENT)
diff --git a/gus/lib/index_statistics.py b/gus/lib/index_statistics.py
@@ -74,14 +74,15 @@ def run_index_statistics():
     # persist_index_statistics(index_statistics, "index-statistics.csv")
 
 
-def persist_statistics(index_statistics, crawl_statistics, filename):
+def persist_statistics(index_statistics, crawl_statistics, was_destructive, filename):
     with open(filename, "a") as f:
-        f.write(serialize_statistics_line(index_statistics, crawl_statistics))
+        f.write(serialize_statistics_line(index_statistics, crawl_statistics, was_destructive))
 
 
-def serialize_statistics_line(index_statistics, crawl_statistics):
-    return "{:%Y-%m-%d},{},{},{},{},{},{},{}\n".format(
+def serialize_statistics_line(index_statistics, crawl_statistics, was_destructive):
+    return "{:%Y-%m-%d},{},{},{},{},{},{},{},{}\n".format(
         index_statistics["index_modification_time"],
+        was_destructive,
         index_statistics["page_count"],
         index_statistics["domain_count"],
         crawl_statistics["redirect_count"],
@@ -103,13 +104,14 @@ def load_last_statistics_from_file(filename):
 def deserialize_statistics_line(line):
     line_parts = line.split(",")
     index_modification_time = line_parts[0]
-    page_count = line_parts[1]
-    domain_count = line_parts[2]
-    redirect_count = line_parts[3]
-    redirect_nontrivial_count = line_parts[4]
-    broken_url_count = line_parts[5]
-    domains = [domain for domain in line_parts[6].split("|")]
-    content_type_frequencies = [(pair.split(":")[0], pair.split(":")[1]) for pair in line_parts[7].split("|")]
+    # discard line_parts[1], which is `was_destructive`
+    page_count = line_parts[2]
+    domain_count = line_parts[3]
+    redirect_count = line_parts[4]
+    redirect_nontrivial_count = line_parts[5]
+    broken_url_count = line_parts[6]
+    domains = [domain for domain in line_parts[7].split("|")]
+    content_type_frequencies = [(pair.split(":")[0], pair.split(":")[1]) for pair in line_parts[8].split("|")]
     return {
         "index_modification_time": index_modification_time,