geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit 962eb179053b0cce08fad5192199ef4a38485d25
parent 5bfa28c579ad10ddc5588109229f6093e1e4f7ef
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Tue, 21 Jul 2020 14:49:36 -0400

[build_index] Account for per-page expiration

Diffstat:
M gus/build_index.py          | 18 ++++++++++++++----
M gus/lib/index_statistics.py | 24 +++++++++++++++++-------
2 files changed, 31 insertions(+), 11 deletions(-)

diff --git a/gus/build_index.py b/gus/build_index.py
@@ -14,6 +14,7 @@ from peewee import (
     DateTimeField,
     DoesNotExist,
     FloatField,
+    fn,
     ForeignKeyField,
     IntegerField,
     Model,
@@ -28,7 +29,7 @@ from whoosh.query import Every
 from whoosh.qparser import QueryParser
 from whoosh.writing import BufferedWriter
 
-from gus.lib.db_model import init_db, Page, Link
+from gus.lib.db_model import init_db, Page, Link, Crawl
 from gus.lib.gemini import GeminiResource, GeminiRobotFileParser
 from gus.lib.index_statistics import compute_index_statistics, persist_statistics, print_index_statistics
 from gus.lib.whoosh_extensions import UrlAnalyzer
@@ -137,7 +138,7 @@ def index_page(page, indexed_urls):
         "charset": page.charset or "none",
         "lang": page.lang,
         "size": page.size,
-        "indexed_at": page.indexed_at,
+        "indexed_at": datetime.strptime(page.crawl_timestamp, "%Y-%m-%d %H:%M:%S.%f"),
         "backlink_count": backlink_count,
         "prompt": page.prompt,
         "content": page.content,
@@ -181,8 +182,17 @@ def build_index(should_run_destructive=False, invalidation_window=0):
         invalidate_recent_results(invalidation_window)
     indexed_urls = [] if should_run_destructive else load_indexed_urls(INDEX_DIR_CURRENT)
 
-    pages = Page.select().where(Page.indexed_at.is_null(False))
-    for page in pages.iterator():
+    pages = Page.raw("""SELECT p.*, MAX(c.timestamp) AS crawl_timestamp FROM (
+    SELECT crawl.*, row_number()
+    OVER (PARTITION BY page_id ORDER BY timestamp DESC) AS mostRecently
+    FROM crawl) AS c
+JOIN page AS p
+ON p.id == c.page_id
+WHERE c.mostRecently < 3
+AND c.status == 20
+GROUP BY p.id""")
+
+    for page in pages.execute():
         index_page(page, indexed_urls)
 
     index_writer.commit()
diff --git a/gus/lib/index_statistics.py b/gus/lib/index_statistics.py
@@ -6,14 +6,23 @@ from whoosh.index import open_dir
 from whoosh.qparser import QueryParser
 from whoosh.query import Every
 
-from gus.lib.db_model import Page, Link
+from gus.lib.db_model import Page, Link, Crawl
 from gus.lib.gemini import GeminiResource
 
 def compute_index_statistics(db):
-    page_count = Page.select().where(Page.indexed_at.is_null(False)).count()
+    valid_page_ids_query = Page.raw("""SELECT DISTINCT p.id FROM (
+    SELECT crawl.*, row_number()
+    OVER (PARTITION BY page_id ORDER BY timestamp DESC) AS mostRecently
+    FROM crawl) AS c
+JOIN page AS p
+ON p.id == c.page_id
+WHERE c.mostRecently < 3
+AND c.status == 20""")
+    valid_page_ids = [p.id for p in valid_page_ids_query.execute()]
+    page_count = len(valid_page_ids)
     domains_query = (Page
                      .select(Page.domain, Page.port)
-                     .where(Page.indexed_at.is_null(False))
+                     .where(Page.id.in_(valid_page_ids))
                      .distinct())
     domains = []
     for d in domains_query:
@@ -23,22 +32,23 @@ def compute_index_statistics(db):
         domains.append(s)
     domain_count = (Page
                     .select(Page.domain, Page.port)
-                    .where(Page.indexed_at.is_null(False))
+                    .where(Page.id.in_(valid_page_ids))
                     .distinct()
                     .count())
     content_type_frequencies = (Page
                                 .select(Page.content_type, fn.Count(Page.content_type).alias("count"))
-                                .where(Page.indexed_at.is_null(False))
+                                .where(Page.id.in_(valid_page_ids))
                                 .group_by(Page.content_type)
                                 .order_by(SQL('count').desc())
                                 .dicts())
     charset_frequencies = (Page
                            .select(Page.charset, fn.Count(Page.charset).alias("count"))
-                           .where(Page.indexed_at.is_null(False), Page.charset.is_null(False))
+                           .where(Page.id.in_(valid_page_ids), Page.charset.is_null(False))
                            .group_by(Page.charset)
                            .order_by(SQL('count').desc())
                            .dicts())
-    index_modification_time = Page.select(fn.MAX(Page.indexed_at)).where(Page.indexed_at.is_null(False)).scalar()
+    index_modification_time = Crawl.select(fn.MAX(Crawl.timestamp)).scalar()
+
     return {
         "index_modification_time": index_modification_time,
         "page_count": page_count,
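
Both files swap the old "has this page ever been indexed" check (Page.indexed_at.is_null(False)) for the same window-function query: row_number() OVER (PARTITION BY page_id ORDER BY timestamp DESC) ranks each page's crawls newest-first, and the filter mostRecently < 3 AND status == 20 keeps only pages with a successful Gemini response among their two most recent crawls, so pages whose latest crawls failed expire out of the index and the statistics. Below is a minimal sketch of that pattern, written against Python's sqlite3 module and a simplified stand-in schema (the table layout and example URLs are illustrative, not the project's actual peewee models):

# Minimal sketch of the per-page expiration query, on an assumed simplified
# schema. Window functions require SQLite 3.25 or newer.
import sqlite3
from datetime import datetime, timedelta

db = sqlite3.connect(":memory:")
db.executescript("""
CREATE TABLE page  (id INTEGER PRIMARY KEY, url TEXT);
CREATE TABLE crawl (id INTEGER PRIMARY KEY, page_id INTEGER, status INTEGER, timestamp TEXT);
""")
now = datetime.now()
db.executemany("INSERT INTO page (id, url) VALUES (?, ?)",
               [(1, "gemini://example.org/"), (2, "gemini://example.org/stale")])
db.executemany("INSERT INTO crawl (page_id, status, timestamp) VALUES (?, ?, ?)", [
    # page 1: a status-20 crawl sits among its two most recent crawls -> stays indexed
    (1, 20, str(now - timedelta(days=2))),
    (1, 43, str(now - timedelta(days=1))),
    # page 2: its only success is older than its two most recent failures -> expires
    (2, 20, str(now - timedelta(days=30))),
    (2, 51, str(now - timedelta(days=2))),
    (2, 51, str(now - timedelta(days=1))),
])
# Rank each page's crawls newest-first, keep ranks 1 and 2 (mostRecently < 3),
# and require a successful Gemini response (status 20) among them.
rows = db.execute("""
SELECT p.id, p.url, MAX(c.timestamp) AS crawl_timestamp FROM (
    SELECT crawl.*, row_number()
      OVER (PARTITION BY page_id ORDER BY timestamp DESC) AS mostRecently
    FROM crawl) AS c
JOIN page AS p ON p.id = c.page_id
WHERE c.mostRecently < 3
  AND c.status = 20
GROUP BY p.id
""").fetchall()
print(rows)  # only page 1 is returned, with the timestamp of its status-20 crawl

In the commit itself, the grouped MAX(c.timestamp) surfaces on each row as page.crawl_timestamp, which index_page() parses with datetime.strptime(..., "%Y-%m-%d %H:%M:%S.%f") to fill the indexed_at field of the search document.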