commit 962eb179053b0cce08fad5192199ef4a38485d25
parent 5bfa28c579ad10ddc5588109229f6093e1e4f7ef
Author: Natalie Pendragon <natpen@natpen.net>
Date: Tue, 21 Jul 2020 14:49:36 -0400
[build_index] Account for per-page expiration
Diffstat:
2 files changed, 31 insertions(+), 11 deletions(-)
diff --git a/gus/build_index.py b/gus/build_index.py
@@ -14,6 +14,7 @@ from peewee import (
DateTimeField,
DoesNotExist,
FloatField,
+ fn,
ForeignKeyField,
IntegerField,
Model,
@@ -28,7 +29,7 @@ from whoosh.query import Every
from whoosh.qparser import QueryParser
from whoosh.writing import BufferedWriter
-from gus.lib.db_model import init_db, Page, Link
+from gus.lib.db_model import init_db, Page, Link, Crawl
from gus.lib.gemini import GeminiResource, GeminiRobotFileParser
from gus.lib.index_statistics import compute_index_statistics, persist_statistics, print_index_statistics
from gus.lib.whoosh_extensions import UrlAnalyzer
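The Crawl model itself is not part of this diff. Judging from how it is queried below, it has at least a foreign key to Page plus status and timestamp columns; a rough sketch of that shape (an inference, the real definition lives in gus/lib/db_model.py and may carry more fields):

    from peewee import DateTimeField, ForeignKeyField, IntegerField, Model

    class Crawl(Model):
        # Inferred from the queries in this commit, not copied from db_model.py.
        # Page is the existing model defined alongside it in db_model.py.
        page = ForeignKeyField(Page, backref="crawls")  # stored as the page_id column
        status = IntegerField()                         # Gemini status code; 20 means success
        timestamp = DateTimeField()                     # when the crawl happened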
@@ -137,7 +138,7 @@ def index_page(page, indexed_urls):
"charset": page.charset or "none",
"lang": page.lang,
"size": page.size,
- "indexed_at": page.indexed_at,
+ "indexed_at": datetime.strptime(page.crawl_timestamp, "%Y-%m-%d %H:%M:%S.%f"),
"backlink_count": backlink_count,
"prompt": page.prompt,
"content": page.content,
@@ -181,8 +182,17 @@ def build_index(should_run_destructive=False, invalidation_window=0):
invalidate_recent_results(invalidation_window)
indexed_urls = [] if should_run_destructive else load_indexed_urls(INDEX_DIR_CURRENT)
- pages = Page.select().where(Page.indexed_at.is_null(False))
- for page in pages.iterator():
+ pages = Page.raw("""SELECT p.*, MAX(c.timestamp) AS crawl_timestamp FROM (
+ SELECT crawl.*, row_number()
+ OVER (PARTITION BY page_id ORDER BY timestamp DESC) AS mostRecently
+ FROM crawl) AS c
+JOIN page AS p
+ON p.id == c.page_id
+WHERE c.mostRecently < 3
+AND c.status == 20
+GROUP BY p.id""")
+
+ for page in pages.execute():
index_page(page, indexed_urls)
index_writer.commit()
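The raw query ranks each page's crawls from newest to oldest with row_number() and keeps pages that returned status 20 within their two most recent crawls (mostRecently < 3), so pages whose last two fetches failed drop out of the index. Window functions require SQLite 3.25 or newer. For reference, a roughly equivalent peewee ORM form (a sketch assuming Crawl has page, status, and timestamp fields; it is not the query the commit ships):

    from peewee import fn
    from gus.lib.db_model import Page, Crawl

    # Rank crawls per page by recency inside a subquery...
    ranked = (Crawl
              .select(Crawl.page,
                      Crawl.status,
                      Crawl.timestamp,
                      fn.row_number().over(
                          partition_by=[Crawl.page],
                          order_by=[Crawl.timestamp.desc()]).alias("recency_rank"))
              .alias("c"))

    # ...then keep pages with a successful response among their two newest crawls.
    pages = (Page
             .select(Page, fn.MAX(ranked.c.timestamp).alias("crawl_timestamp"))
             .join(ranked, on=(Page.id == ranked.c.page_id))
             .where((ranked.c.recency_rank < 3) & (ranked.c.status == 20))
             .group_by(Page.id))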
diff --git a/gus/lib/index_statistics.py b/gus/lib/index_statistics.py
@@ -6,14 +6,23 @@ from whoosh.index import open_dir
from whoosh.qparser import QueryParser
from whoosh.query import Every
-from gus.lib.db_model import Page, Link
+from gus.lib.db_model import Page, Link, Crawl
from gus.lib.gemini import GeminiResource
def compute_index_statistics(db):
- page_count = Page.select().where(Page.indexed_at.is_null(False)).count()
+ valid_page_ids_query = Page.raw("""SELECT DISTINCT p.id FROM (
+ SELECT crawl.*, row_number()
+ OVER (PARTITION BY page_id ORDER BY timestamp DESC) AS mostRecently
+ FROM crawl) AS c
+JOIN page AS p
+ON p.id == c.page_id
+WHERE c.mostRecently < 3
+AND c.status == 20""")
+ valid_page_ids = [p.id for p in valid_page_ids_query.execute()]
+ page_count = len(valid_page_ids)
domains_query = (Page
.select(Page.domain, Page.port)
- .where(Page.indexed_at.is_null(False))
+ .where(Page.id.in_(valid_page_ids))
.distinct())
domains = []
for d in domains_query:
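compute_index_statistics now counts only pages whose two most recent crawls include a successful response, using the same window-function query, and reuses the materialized id list for the remaining aggregations. Since the query is plain SQL, it can also be sanity-checked directly against a copy of the database; a quick sketch (the "gus.sqlite" path is a placeholder, not necessarily the real filename):

    import sqlite3

    con = sqlite3.connect("gus.sqlite")  # placeholder path to a gus database copy
    rows = con.execute("""SELECT DISTINCT p.id FROM (
        SELECT crawl.*, row_number()
            OVER (PARTITION BY page_id ORDER BY timestamp DESC) AS mostRecently
        FROM crawl) AS c
        JOIN page AS p ON p.id == c.page_id
        WHERE c.mostRecently < 3 AND c.status == 20""")
    valid_page_ids = [row[0] for row in rows]
    print(len(valid_page_ids), "pages currently count toward the index statistics")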
@@ -23,22 +32,23 @@ def compute_index_statistics(db):
domains.append(s)
domain_count = (Page
.select(Page.domain, Page.port)
- .where(Page.indexed_at.is_null(False))
+ .where(Page.id.in_(valid_page_ids))
.distinct()
.count())
content_type_frequencies = (Page
.select(Page.content_type, fn.Count(Page.content_type).alias("count"))
- .where(Page.indexed_at.is_null(False))
+ .where(Page.id.in_(valid_page_ids))
.group_by(Page.content_type)
.order_by(SQL('count').desc())
.dicts())
charset_frequencies = (Page
.select(Page.charset, fn.Count(Page.charset).alias("count"))
- .where(Page.indexed_at.is_null(False), Page.charset.is_null(False))
+ .where(Page.id.in_(valid_page_ids), Page.charset.is_null(False))
.group_by(Page.charset)
.order_by(SQL('count').desc())
.dicts())
- index_modification_time = Page.select(fn.MAX(Page.indexed_at)).where(Page.indexed_at.is_null(False)).scalar()
+ index_modification_time = Crawl.select(fn.MAX(Crawl.timestamp)).scalar()
+
return {
"index_modification_time": index_modification_time,
"page_count": page_count,