geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git

commit 96b7b661fb2c5f5eed02e4475467ef61a1b4c251
parent 4cc4e66b6c01fe54d9e5992f7f6a2147d53eba2b
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Wed, 22 Jul 2020 13:29:00 -0400

Add normalized url to db

Diffstat:
M gus/build_index.py          | 33 ++++++++++++++++-----------------
M gus/crawl.py                | 11 ++++++++---
M gus/lib/db_model.py         |  3 ++-
M gus/lib/index_statistics.py | 10 ++++------
M serve/models.py             | 25 ++++++++++++++-----------
5 files changed, 44 insertions(+), 38 deletions(-)

diff --git a/gus/build_index.py b/gus/build_index.py
@@ -116,16 +116,19 @@ def index_page(page, indexed_urls):
     if page.fetchable_url in indexed_urls:
         return
     print(page.url)
-    url = page.url.rstrip("/")
-    from_page = Page.alias()
-    to_page = Page.alias()
-    backlinks = (Link
-                 .select(from_page)
-                 .join(from_page, on=(from_page.id == Link.from_page_id))
-                 .join(to_page, on=(to_page.id == Link.to_page_id))
-                 .where(to_page.url << [url, f"{url}/"])
-                 .dicts())
-    backlink_urls = [b["url"] for b in backlinks]
+    u = page.url.rstrip("/")
+    backlinks = Page.raw("""SELECT p_from.url
+FROM page AS p_from
+JOIN indexable_crawl AS ic
+ON ic.page_id == p_from.id
+JOIN link as l
+ON l.from_page_id == p_from.id
+JOIN page as p_to
+ON p_to.id == l.to_page_id
+WHERE p_to.url IN (?, ?)
+GROUP BY p_from.normalized_url""", u, f"{u}/")
+
+    backlink_urls = [b.url for b in backlinks.execute()]
     backlink_count = len(list(set(backlink_urls)))
 
     document = {
@@ -182,15 +185,11 @@ def build_index(should_run_destructive=False, invalidation_window=0):
     invalidate_recent_results(invalidation_window)
     indexed_urls = [] if should_run_destructive else load_indexed_urls(INDEX_DIR_CURRENT)
 
-    pages = Page.raw("""SELECT p.*, MAX(c.timestamp) AS crawl_timestamp FROM (
-    SELECT crawl.*, row_number()
-    OVER (PARTITION BY page_id ORDER BY timestamp DESC) AS mostRecently
-    FROM crawl) AS c
+    pages = Page.raw("""SELECT p.*, MAX(c.timestamp) AS crawl_timestamp
+FROM indexable_crawl AS c
 JOIN page AS p
 ON p.id == c.page_id
-WHERE c.mostRecently < 3
-AND c.status == 20
-GROUP BY p.id""")
+GROUP BY p.normalized_url""")
 
     for page in pages.execute():
         index_page(page, indexed_urls)
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -147,6 +147,7 @@ def index_binary(resource, response):
     doc = {
         "url": resource.indexable_url,
         "fetchable_url": resource.fetchable_url,
+        "normalized_url": resource.normalized_url,
         "domain": resource.normalized_host,
         "port": resource.urlsplit.port or 1965,
         "content_type": response.content_type,
@@ -168,6 +169,7 @@ def index_redirect(resource):
     doc = {
         "url": resource.indexable_url,
         "fetchable_url": resource.fetchable_url,
+        "normalized_url": resource.normalized_url,
         "domain": resource.normalized_host,
         "port": resource.urlsplit.port or 1965,
         "change_frequency": constants.DEFAULT_REDIRECT_CHANGE_FREQUENCY,
@@ -186,6 +188,7 @@ def index_error(resource, is_temporary):
     doc = {
         "url": resource.indexable_url,
         "fetchable_url": resource.fetchable_url,
+        "normalized_url": resource.normalized_url,
         "domain": resource.normalized_host,
         "port": resource.urlsplit.port or 1965,
         "change_frequency": constants.DEFAULT_TEMP_ERROR_CHANGE_FREQUENCY if is_temporary else constants.DEFAULT_PERM_ERROR_CHANGE_FREQUENCY,
@@ -205,6 +208,7 @@ def index_prompt(resource, response):
     doc = {
         "url": resource.indexable_url,
         "fetchable_url": resource.fetchable_url,
+        "normalized_url": resource.normalized_url,
         "domain": resource.normalized_host,
         "port": resource.urlsplit.port or 1965,
         "content_type": "input",
@@ -228,6 +232,7 @@ def index_content(resource, response):
     doc = {
         "url": resource.indexable_url,
         "fetchable_url": resource.fetchable_url,
+        "normalized_url": resource.normalized_url,
         "domain": resource.normalized_host,
         "port": resource.urlsplit.port or 1965,
         "content_type": response.content_type,
@@ -462,14 +467,14 @@ def unpickle_robot_file_map(index_dir):
 
 
 def load_expired_urls():
     expired_pages = Page.raw("""SELECT url FROM (
-    SELECT p.url, p.change_frequency, MAX(c.timestamp) as timestamp
+    SELECT p.url, p.normalized_url, p.change_frequency, MAX(c.timestamp) as timestamp
     FROM page as p
     JOIN crawl as c
     ON p.id == c.page_id
     GROUP BY p.url )
-WHERE datetime(timestamp, REPLACE('fnord hours', 'fnord', change_frequency)) < datetime('now');
-""")
+WHERE datetime(timestamp, REPLACE('fnord hours', 'fnord', change_frequency)) < datetime('now')
+GROUP BY normalized_url;""")
 
     return [page.url for page in expired_pages.execute()]
diff --git a/gus/lib/db_model.py b/gus/lib/db_model.py
@@ -28,6 +28,7 @@ class Page(Model):
 
     url = TextField(unique=True, index=True)
     fetchable_url = TextField(null=True)
+    normalized_url = TextField(null=True)
     domain = TextField(null=True)
     port = IntegerField(null=True)
     content_type = TextField(null=True)
@@ -54,6 +55,6 @@ class Crawl(Model):
 
     page = ForeignKeyField(Page, backref="crawls", on_delete='CASCADE')
     status = IntegerField()
-    error_message = TextField()
+    error_message = TextField(null=True)
     is_different = BooleanField()
     timestamp = DateTimeField()
diff --git a/gus/lib/index_statistics.py b/gus/lib/index_statistics.py
@@ -10,14 +10,12 @@ from gus.lib.db_model import Page, Link, Crawl
 from gus.lib.gemini import GeminiResource
 
 
 def compute_index_statistics(db):
-    valid_page_ids_query = Page.raw("""SELECT DISTINCT p.id FROM (
-    SELECT crawl.*, row_number()
-    OVER (PARTITION BY page_id ORDER BY timestamp DESC) AS mostRecently
-    FROM crawl) AS c
+    valid_page_ids_query = Page.raw("""SELECT p.id
+FROM indexable_crawl AS c
 JOIN page AS p
 ON p.id == c.page_id
-WHERE c.mostRecently < 3
-AND c.status == 20""")
+GROUP BY p.normalized_url""")
+
     valid_page_ids = [p.id for p in valid_page_ids_query.execute()]
     page_count = len(valid_page_ids)
 
     domains_query = (Page
diff --git a/serve/models.py b/serve/models.py
@@ -1,12 +1,12 @@
 import re
 from urllib.parse import quote
 
-from peewee import SqliteDatabase
+from peewee import fn, SqliteDatabase
 from whoosh import highlight, qparser
 from whoosh.index import open_dir
 
 from . import constants
-from gus.lib.db_model import init_db, Page, Link
+from gus.lib.db_model import init_db, Page, Link, Crawl
 from gus.lib.gemini import GeminiResource
 from gus.lib.index_statistics import compute_index_statistics, load_all_statistics_from_file
 from gus.lib.misc import bytes2human
@@ -67,15 +67,18 @@ class GUS():
             return []
 
         u = resource.indexable_url.rstrip("/")
-        from_page = Page.alias()
-        to_page = Page.alias()
-        backlinks = (Link
-                     .select(from_page)
-                     .join(from_page, on=(from_page.id == Link.from_page_id))
-                     .join(to_page, on=(to_page.id == Link.to_page_id))
-                     .where(to_page.url << [u, f"{u}/"])
-                     .dicts())
-        backlink_urls = [b["url"] for b in backlinks]
+        backlinks = Page.raw("""SELECT p_from.url
+FROM page AS p_from
+JOIN indexable_crawl AS ic
+ON ic.page_id == p_from.id
+JOIN link as l
+ON l.from_page_id == p_from.id
+JOIN page as p_to
+ON p_to.id == l.to_page_id
+WHERE p_to.url IN (?, ?)
+GROUP BY p_from.normalized_url""", u, f"{u}/")
+
+        backlink_urls = [b.url for b in backlinks.execute()]
 
         return list(set(backlink_urls))
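
The commit stores a normalized form of each URL in the page table and groups the index, statistics, and backlink queries by it, so URLs that differ only in surface details such as a trailing slash no longer count as separate pages. The diff does not show how normalized_url is computed; that comes from GeminiResource in gus/lib/gemini.py. The sketch below is only an illustration of what such a normalization might do: the function name and the exact rules (lower-cased host, default port 1965 dropped, trailing slash stripped) are assumptions, not taken from this commit.

# Hypothetical sketch, not the GUS implementation: the real logic lives in
# gus/lib/gemini.py (GeminiResource.normalized_url) and may use different rules.
from urllib.parse import urlsplit

GEMINI_DEFAULT_PORT = 1965

def normalize_url(url):
    """Collapse equivalent gemini URLs into one canonical string (assumed rules)."""
    parts = urlsplit(url)
    host = (parts.hostname or "").lower()     # host names are case-insensitive
    port = "" if parts.port in (None, GEMINI_DEFAULT_PORT) else f":{parts.port}"
    path = parts.path.rstrip("/") or "/"      # treat ".../docs" and ".../docs/" alike
    query = f"?{parts.query}" if parts.query else ""
    return f"gemini://{host}{port}{path}{query}"

# Both spellings normalize to gemini://example.org/docs
print(normalize_url("gemini://Example.Org:1965/docs/"))
print(normalize_url("gemini://example.org/docs"))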