commit e4b2ef0192c7d75583f6a417c260585566b5125f
parent a3fef86b23fab26e13a97c281ef55f22cd444a19
Author: Natalie Pendragon <natpen@natpen.net>
Date: Mon, 6 Jul 2020 06:22:01 -0400

Make incremental build_index work

Some of the idempotency was lost during the shuffle to split the crawl
into two phases.
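
For reference, the incremental path reduces to the guard sketched below. This is a
simplified, standalone sketch rather than the committed code: the Whoosh calls
(open_dir, reader.all_stored_fields, writer.add_document) are real, but the writer
wiring and every Page field except fetchable_url are stand-ins for the actual gus models.

    from whoosh.index import open_dir

    def load_indexed_urls(index_dir):
        # Gather the fetchable_url of every document already stored in the
        # index, so a non-destructive run can skip pages it has seen before.
        with open_dir(index_dir).reader() as reader:
            return {fields["fetchable_url"] for fields in reader.all_stored_fields()}

    def index_page(page, indexed_urls, index_writer):
        # Idempotency guard: a page that is already in the index is a no-op.
        if page.fetchable_url in indexed_urls:
            return
        index_writer.add_document(fetchable_url=page.fetchable_url)

    def build_index(pages, index_dir, index_writer, should_run_destructive=False):
        # A destructive run starts from an empty URL set and re-indexes everything;
        # an incremental run only indexes pages not yet present in the index.
        indexed_urls = set() if should_run_destructive else load_indexed_urls(index_dir)
        for page in pages:
            index_page(page, indexed_urls, index_writer)
        index_writer.commit()

As the TODO in the diff notes, the skip key should eventually move back to the
normalized URL instead of fetchable_url.
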
Diffstat:
2 files changed, 23 insertions(+), 11 deletions(-)
diff --git a/gus/build_index.py b/gus/build_index.py
@@ -108,7 +108,9 @@ def create_index(index_dir):
     index_storage.create_index(schema)
-def index_page(page):
+def index_page(page, indexed_urls):
+    if page.fetchable_url in indexed_urls:
+        return
     print(page.url)
     url = page.url.rstrip("/")
     from_page = Page.alias()
@@ -138,6 +140,15 @@ def index_page(page):
     index_writer.add_document(**document)
+def load_indexed_urls(index_dir):
+    indexed_urls = []
+    ix = open_dir(index_dir)
+    with ix.reader() as reader:
+        all_stored_fields = reader.all_stored_fields()
+        # TODO: change this (back) to normalized url
+        # indexed_urls = [GeminiResource(f["url"]).normalized_url for f in all_stored_fields]
+        indexed_urls = [f["fetchable_url"] for f in all_stored_fields]
+    return indexed_urls
 def build_index(should_run_destructive=False):
@@ -154,13 +165,14 @@ def build_index(should_run_destructive=False):
     ix = index_storage.open_index()
     global index_writer
     index_writer = ix.writer()
+    indexed_urls = [] if should_run_destructive else load_indexed_urls(INDEX_DIR_CURRENT)
     pages = Page.select().where(Page.indexed_at.is_null(False))
     for page in pages.iterator():
-        index_page(page)
+        index_page(page, indexed_urls)
     index_writer.commit()
-    index_statistics = compute_index_statistics(index_dir)
+    index_statistics = compute_index_statistics(db)
     print_index_statistics(index_statistics)
     persist_statistics(index_statistics, None, should_run_destructive, "statistics.csv")
     # if should_run_destructive:
diff --git a/gus/lib/index_statistics.py b/gus/lib/index_statistics.py
@@ -63,12 +63,12 @@ def print_index_statistics(index_statistics, crawl_statistics=None):
         # print("- {}".format(domain))
     print("\nContent Types:")
-    for pair in index_statistics["content_type_frequencies"]:
-        print("{:>5} - {}".format(pair[1], pair[0]))
+    for entry in index_statistics["content_type_frequencies"]:
+        print("{:>5} - {}".format(entry["count"], entry["content_type"]))
     print("\nCharsets:")
-    for pair in index_statistics["charset_frequencies"]:
-        print("{:>5} - {}".format(pair[1], pair[0]))
+    for entry in index_statistics["charset_frequencies"]:
+        print("{:>5} - {}".format(entry["count"], entry["charset"]))
 def run_index_statistics():
@@ -92,8 +92,8 @@ def serialize_statistics_line(index_statistics, crawl_statistics, was_destructiv
         crawl_statistics["redirect_nontrivial_count"] if crawl_statistics else 0,
         crawl_statistics["broken_url_count"] if crawl_statistics else 0,
         "|".join(index_statistics["domains"]),
-        "|".join("{}:{}".format(pair[0], pair[1]) for pair in index_statistics["content_type_frequencies"]),
-        "|".join("{}:{}".format(pair[0], pair[1]) for pair in index_statistics["charset_frequencies"]),
+        "|".join("{}:{}".format(entry["content_type"], entry["count"]) for entry in index_statistics["content_type_frequencies"]),
+        "|".join("{}:{}".format(entry["charset"], entry["count"]) for entry in index_statistics["charset_frequencies"]),
     )
@@ -124,8 +124,8 @@ def deserialize_statistics_line(line):
     redirect_nontrivial_count = line_parts[5]
     broken_url_count = line_parts[6]
     domains = [domain for domain in line_parts[7].split("|")]
-    content_type_frequencies = [(pair.split(":")[0], pair.split(":")[1]) for pair in line_parts[8].split("|")]
-    charset_frequencies = [(pair.split(":")[0], pair.split(":")[1]) for pair in line_parts[9].split("|")]
+    content_type_frequencies = [(entry.split(":")[0], entry.split(":")[1]) for entry in line_parts[8].split("|")]
+    charset_frequencies = [(entry.split(":")[0], entry.split(":")[1]) for entry in line_parts[9].split("|")]
     return {
         "index_modification_time": index_modification_time,