commit e4b2ef0192c7d75583f6a417c260585566b5125f
parent a3fef86b23fab26e13a97c281ef55f22cd444a19
Author: Natalie Pendragon <natpen@natpen.net>
Date: Mon, 6 Jul 2020 06:22:01 -0400

Make incremental build_index work

Some of the idempotency was lost during the shuffle to split the crawl
into two phases.
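
For reference, the incremental path reduces to the guard sketched below. This is a
simplified, standalone sketch rather than the committed code: the Whoosh calls
(open_dir, reader.all_stored_fields, writer.add_document) are real, but the writer
wiring and every Page field except fetchable_url are stand-ins for the actual gus models.

    from whoosh.index import open_dir

    def load_indexed_urls(index_dir):
        # Gather the fetchable_url of every document already stored in the
        # index, so a non-destructive run can skip pages it has seen before.
        with open_dir(index_dir).reader() as reader:
            return {fields["fetchable_url"] for fields in reader.all_stored_fields()}

    def index_page(page, indexed_urls, index_writer):
        # Idempotency guard: a page that is already in the index is a no-op.
        if page.fetchable_url in indexed_urls:
            return
        index_writer.add_document(fetchable_url=page.fetchable_url)

    def build_index(pages, index_dir, index_writer, should_run_destructive=False):
        # A destructive run starts from an empty URL set and re-indexes everything;
        # an incremental run only indexes pages not yet present in the index.
        indexed_urls = set() if should_run_destructive else load_indexed_urls(index_dir)
        for page in pages:
            index_page(page, indexed_urls, index_writer)
        index_writer.commit()

As the TODO in the diff notes, the skip key should eventually move back to the
normalized URL instead of fetchable_url.
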
Diffstat:
2 files changed, 23 insertions(+), 11 deletions(-)
diff --git a/gus/build_index.py b/gus/build_index.py
@@ -108,7 +108,9 @@ def create_index(index_dir):
     index_storage.create_index(schema)
-def index_page(page):
+def index_page(page, indexed_urls):
+    if page.fetchable_url in indexed_urls:
+        return
     print(page.url)
     url = page.url.rstrip("/")
     from_page = Page.alias()
@@ -138,6 +140,15 @@ def index_page(page):
     index_writer.add_document(**document)
+def load_indexed_urls(index_dir):
+    indexed_urls = []
+    ix = open_dir(index_dir)
+    with ix.reader() as reader:
+        all_stored_fields = reader.all_stored_fields()
+        # TODO: change this (back) to normalized url
+        # indexed_urls = [GeminiResource(f["url"]).normalized_url for f in all_stored_fields]
+        indexed_urls = [f["fetchable_url"] for f in all_stored_fields]
+    return indexed_urls
 def build_index(should_run_destructive=False):
@@ -154,13 +165,14 @@ def build_index(should_run_destructive=False):
     ix = index_storage.open_index()
     global index_writer
     index_writer = ix.writer()
+    indexed_urls = [] if should_run_destructive else load_indexed_urls(INDEX_DIR_CURRENT)
     pages = Page.select().where(Page.indexed_at.is_null(False))
     for page in pages.iterator():
-        index_page(page)
+        index_page(page, indexed_urls)
     index_writer.commit()
-    index_statistics = compute_index_statistics(index_dir)
+    index_statistics = compute_index_statistics(db)
     print_index_statistics(index_statistics)
     persist_statistics(index_statistics, None, should_run_destructive, "statistics.csv")
     # if should_run_destructive:
diff --git a/gus/lib/index_statistics.py b/gus/lib/index_statistics.py
@@ -63,12 +63,12 @@ def print_index_statistics(index_statistics, crawl_statistics=None):
         # print("- {}".format(domain))
     print("\nContent Types:")
-    for pair in index_statistics["content_type_frequencies"]:
-        print("{:>5} - {}".format(pair[1], pair[0]))
+    for entry in index_statistics["content_type_frequencies"]:
+        print("{:>5} - {}".format(entry["count"], entry["content_type"]))
     print("\nCharsets:")
-    for pair in index_statistics["charset_frequencies"]:
-        print("{:>5} - {}".format(pair[1], pair[0]))
+    for entry in index_statistics["charset_frequencies"]:
+        print("{:>5} - {}".format(entry["count"], entry["charset"]))
 def run_index_statistics():
@@ -92,8 +92,8 @@ def serialize_statistics_line(index_statistics, crawl_statistics, was_destructiv
         crawl_statistics["redirect_nontrivial_count"] if crawl_statistics else 0,
         crawl_statistics["broken_url_count"] if crawl_statistics else 0,
         "|".join(index_statistics["domains"]),
-        "|".join("{}:{}".format(pair[0], pair[1]) for pair in index_statistics["content_type_frequencies"]),
-        "|".join("{}:{}".format(pair[0], pair[1]) for pair in index_statistics["charset_frequencies"]),
+        "|".join("{}:{}".format(entry["content_type"], entry["count"]) for entry in index_statistics["content_type_frequencies"]),
+        "|".join("{}:{}".format(entry["charset"], entry["count"]) for entry in index_statistics["charset_frequencies"]),
     )
@@ -124,8 +124,8 @@ def deserialize_statistics_line(line):
     redirect_nontrivial_count = line_parts[5]
     broken_url_count = line_parts[6]
     domains = [domain for domain in line_parts[7].split("|")]
-    content_type_frequencies = [(pair.split(":")[0], pair.split(":")[1]) for pair in line_parts[8].split("|")]
-    charset_frequencies = [(pair.split(":")[0], pair.split(":")[1]) for pair in line_parts[9].split("|")]
+    content_type_frequencies = [(entry.split(":")[0], entry.split(":")[1]) for entry in line_parts[8].split("|")]
+    charset_frequencies = [(entry.split(":")[0], entry.split(":")[1]) for entry in line_parts[9].split("|")]
     return {
         "index_modification_time": index_modification_time,