commit 962eb179053b0cce08fad5192199ef4a38485d25
parent 5bfa28c579ad10ddc5588109229f6093e1e4f7ef
Author: Natalie Pendragon <natpen@natpen.net>
Date: Tue, 21 Jul 2020 14:49:36 -0400
[build_index] Account for per-page expiration
Diffstat:
2 files changed, 31 insertions(+), 11 deletions(-)
diff --git a/gus/build_index.py b/gus/build_index.py
@@ -14,6 +14,7 @@ from peewee import (
DateTimeField,
DoesNotExist,
FloatField,
+ fn,
ForeignKeyField,
IntegerField,
Model,
@@ -28,7 +29,7 @@ from whoosh.query import Every
from whoosh.qparser import QueryParser
from whoosh.writing import BufferedWriter
-from gus.lib.db_model import init_db, Page, Link
+from gus.lib.db_model import init_db, Page, Link, Crawl
from gus.lib.gemini import GeminiResource, GeminiRobotFileParser
from gus.lib.index_statistics import compute_index_statistics, persist_statistics, print_index_statistics
from gus.lib.whoosh_extensions import UrlAnalyzer
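The Crawl model itself is not part of this diff. Judging from how it is queried below, it has at least a foreign key to Page plus status and timestamp columns; a rough sketch of that shape (an inference, the real definition lives in gus/lib/db_model.py and may carry more fields):

    from peewee import DateTimeField, ForeignKeyField, IntegerField, Model

    class Crawl(Model):
        # Inferred from the queries in this commit, not copied from db_model.py.
        # Page is the existing model defined alongside it in db_model.py.
        page = ForeignKeyField(Page, backref="crawls")  # stored as the page_id column
        status = IntegerField()                         # Gemini status code; 20 means success
        timestamp = DateTimeField()                     # when the crawl happened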
@@ -137,7 +138,7 @@ def index_page(page, indexed_urls):
"charset": page.charset or "none",
"lang": page.lang,
"size": page.size,
- "indexed_at": page.indexed_at,
+ "indexed_at": datetime.strptime(page.crawl_timestamp, "%Y-%m-%d %H:%M:%S.%f"),
"backlink_count": backlink_count,
"prompt": page.prompt,
"content": page.content,
@@ -181,8 +182,17 @@ def build_index(should_run_destructive=False, invalidation_window=0):
invalidate_recent_results(invalidation_window)
indexed_urls = [] if should_run_destructive else load_indexed_urls(INDEX_DIR_CURRENT)
- pages = Page.select().where(Page.indexed_at.is_null(False))
- for page in pages.iterator():
+ pages = Page.raw("""SELECT p.*, MAX(c.timestamp) AS crawl_timestamp FROM (
+ SELECT crawl.*, row_number()
+ OVER (PARTITION BY page_id ORDER BY timestamp DESC) AS mostRecently
+ FROM crawl) AS c
+JOIN page AS p
+ON p.id == c.page_id
+WHERE c.mostRecently < 3
+AND c.status == 20
+GROUP BY p.id""")
+
+ for page in pages.execute():
index_page(page, indexed_urls)
index_writer.commit()
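The raw query ranks each page's crawls from newest to oldest with row_number() and keeps pages that returned status 20 within their two most recent crawls (mostRecently < 3), so pages whose last two fetches failed drop out of the index. Window functions require SQLite 3.25 or newer. For reference, a roughly equivalent peewee ORM form (a sketch assuming Crawl has page, status, and timestamp fields; it is not the query the commit ships):

    from peewee import fn
    from gus.lib.db_model import Page, Crawl

    # Rank crawls per page by recency inside a subquery...
    ranked = (Crawl
              .select(Crawl.page,
                      Crawl.status,
                      Crawl.timestamp,
                      fn.row_number().over(
                          partition_by=[Crawl.page],
                          order_by=[Crawl.timestamp.desc()]).alias("recency_rank"))
              .alias("c"))

    # ...then keep pages with a successful response among their two newest crawls.
    pages = (Page
             .select(Page, fn.MAX(ranked.c.timestamp).alias("crawl_timestamp"))
             .join(ranked, on=(Page.id == ranked.c.page_id))
             .where((ranked.c.recency_rank < 3) & (ranked.c.status == 20))
             .group_by(Page.id))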
diff --git a/gus/lib/index_statistics.py b/gus/lib/index_statistics.py
@@ -6,14 +6,23 @@ from whoosh.index import open_dir
from whoosh.qparser import QueryParser
from whoosh.query import Every
-from gus.lib.db_model import Page, Link
+from gus.lib.db_model import Page, Link, Crawl
from gus.lib.gemini import GeminiResource
def compute_index_statistics(db):
- page_count = Page.select().where(Page.indexed_at.is_null(False)).count()
+ valid_page_ids_query = Page.raw("""SELECT DISTINCT p.id FROM (
+ SELECT crawl.*, row_number()
+ OVER (PARTITION BY page_id ORDER BY timestamp DESC) AS mostRecently
+ FROM crawl) AS c
+JOIN page AS p
+ON p.id == c.page_id
+WHERE c.mostRecently < 3
+AND c.status == 20""")
+ valid_page_ids = [p.id for p in valid_page_ids_query.execute()]
+ page_count = len(valid_page_ids)
domains_query = (Page
.select(Page.domain, Page.port)
- .where(Page.indexed_at.is_null(False))
+ .where(Page.id.in_(valid_page_ids))
.distinct())
domains = []
for d in domains_query:
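compute_index_statistics now counts only pages whose two most recent crawls include a successful response, using the same window-function query, and reuses the materialized id list for the remaining aggregations. Since the query is plain SQL, it can also be sanity-checked directly against a copy of the database; a quick sketch (the "gus.sqlite" path is a placeholder, not necessarily the real filename):

    import sqlite3

    con = sqlite3.connect("gus.sqlite")  # placeholder path to a gus database copy
    rows = con.execute("""SELECT DISTINCT p.id FROM (
        SELECT crawl.*, row_number()
            OVER (PARTITION BY page_id ORDER BY timestamp DESC) AS mostRecently
        FROM crawl) AS c
        JOIN page AS p ON p.id == c.page_id
        WHERE c.mostRecently < 3 AND c.status == 20""")
    valid_page_ids = [row[0] for row in rows]
    print(len(valid_page_ids), "pages currently count toward the index statistics")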
@@ -23,22 +32,23 @@ def compute_index_statistics(db):
domains.append(s)
domain_count = (Page
.select(Page.domain, Page.port)
- .where(Page.indexed_at.is_null(False))
+ .where(Page.id.in_(valid_page_ids))
.distinct()
.count())
content_type_frequencies = (Page
.select(Page.content_type, fn.Count(Page.content_type).alias("count"))
- .where(Page.indexed_at.is_null(False))
+ .where(Page.id.in_(valid_page_ids))
.group_by(Page.content_type)
.order_by(SQL('count').desc())
.dicts())
charset_frequencies = (Page
.select(Page.charset, fn.Count(Page.charset).alias("count"))
- .where(Page.indexed_at.is_null(False), Page.charset.is_null(False))
+ .where(Page.id.in_(valid_page_ids), Page.charset.is_null(False))
.group_by(Page.charset)
.order_by(SQL('count').desc())
.dicts())
- index_modification_time = Page.select(fn.MAX(Page.indexed_at)).where(Page.indexed_at.is_null(False)).scalar()
+ index_modification_time = Crawl.select(fn.MAX(Crawl.timestamp)).scalar()
+
return {
"index_modification_time": index_modification_time,
"page_count": page_count,