geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit 823e8d6a2c764cd58aa77b69b3bb2bc9da5720ca
parent 015279c14108635a1f7639798d19589537ed5a42
Author: René Wagner <rwa@clttr.info>
Date:   Sat,  5 Feb 2022 10:37:07 +0100

precompute hosts statistics

relates to #42

Diffstat:
M serve/models.py | 52 ++++++++++++++++++++++++----------------------------
M serve/views.py | 4 ++--
2 files changed, 26 insertions(+), 30 deletions(-)

diff --git a/serve/models.py b/serve/models.py @@ -20,8 +20,31 @@ class GUS: self.db = init_db(f"{constants.INDEX_DIR}/{constants.DB_FILENAME}") self.statistics = compute_index_statistics(self.db) self.statistics_historical_overall = load_all_statistics_from_file( - constants.STATISTICS_FILE + constants.STATISTICS_FILE) + hosts_query = Page.raw( + """ + SELECT DISTINCT p.domain + FROM page AS p + WHERE last_crawl_success_at IS NOT NULL + AND last_status = 20 + ORDER BY p.domain + """ + ) + self.hosts = hosts_query.execute() + newest_hosts_query = Page.raw( + """ + SELECT p.domain, p.first_seen_at + FROM page AS p + WHERE last_crawl_success_at IS NOT NULL + AND first_seen_at IS NOT NULL + AND last_status = 20 + GROUP BY p.domain + ORDER BY first_seen_at DESC + LIMIT 30 + """ ) + + self.newest_hosts = newest_hosts_query.execute() def search_index(self, query, requested_page): query = self.index.parse_query(query) @@ -98,33 +121,6 @@ AND p.last_crawl_success_at IS NOT NULL""" ) return feeds_query.execute() - def get_hosts(self): - hosts_query = Page.raw( - """ - SELECT DISTINCT p.domain - FROM page AS p - WHERE last_crawl_success_at IS NOT NULL - AND last_status = 20 - ORDER BY p.domain - """ - ) - return hosts_query.execute() - - def get_newest_hosts(self): - newest_hosts_query = Page.raw( - """ - SELECT p.domain, p.first_seen_at - FROM page AS p - WHERE last_crawl_success_at IS NOT NULL - AND first_seen_at IS NOT NULL - AND last_status = 20 - GROUP BY p.domain - ORDER BY first_seen_at DESC - LIMIT 30 - """ - ) - return newest_hosts_query.execute() - def get_newest_pages(self): newest_pages_query = Page.raw( """SELECT p.url, p.fetchable_url, p.first_seen_at FROM page as p diff --git a/serve/views.py b/serve/views.py @@ -109,7 +109,7 @@ def statistics(request): def known_hosts(request): body = render_template( "known_hosts.gmi", - known_hosts=gus.get_hosts(), + known_hosts=gus.hosts, index_modification_time=gus.statistics["index_modification_time"] ) return 
Response(Status.SUCCESS, "text/gemini", body) @@ -119,7 +119,7 @@ def known_hosts(request): def newest_hosts(request): body = render_template( "newest_hosts.gmi", - newest_hosts=gus.get_newest_hosts(), + newest_hosts=gus.newest_hosts, index_modification_time=gus.statistics["index_modification_time"] ) return Response(Status.SUCCESS, "text/gemini", body)