geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git

commit a8879dd5d3b05f96e5fdae17a3551d5bccd5aa76
parent 739328b9e5aae6855f1cc92def9872dd5ba50741
Author: Rene Wagner <rwa@clttr.info>
Date:   Wed,  8 Mar 2023 20:31:57 +0000

split content into separate table
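
In short: the content and prompt columns move out of the Page table into a
new PageContent table that references Page through a foreign key, so the
frequently queried page metadata stays small while the bulky text lives in
its own table. A minimal sketch of reading a page together with its content
under the new schema (the models match gus/lib/db_model.py below; the
database path and URL are placeholders):

    from gus.lib.db_model import init_db, Page, PageContent

    init_db("gus.sqlite")  # placeholder filename, not the project's real path
    page = Page.get_or_none(Page.url == "gemini://example.org/")
    if page is not None:
        # content rows are keyed by page id, mirroring the lookup
        # that build_index.py performs in this commit
        pagecontent = PageContent.get_or_none(page_id=page.id)
        if pagecontent is not None:
            print(pagecontent.content or pagecontent.prompt)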

Diffstat:
M gus/build_index.py  |  8 +++++---
M gus/crawl.py        | 27 ++++++++++++++++++++++++---
M gus/lib/db_model.py | 13 +++++++++----
3 files changed, 38 insertions(+), 10 deletions(-)

diff --git a/gus/build_index.py b/gus/build_index.py
@@ -8,7 +8,7 @@ from peewee import fn
 from . import constants
 from gus.crawl import should_skip
 from gus.excludes import EXCLUDED_URL_PREFIXES
-from gus.lib.db_model import init_db, Page
+from gus.lib.db_model import init_db, Page, PageContent
 from gus.lib.gemini import GeminiResource
 from gus.lib.index_statistics import (
     compute_index_statistics,
@@ -50,6 +50,8 @@ AND l.is_cross_host_like == 1""",
     backlink_urls = [b.url for b in external_backlinks.execute()]
     backlink_count = len(backlink_urls)

+    pagecontent = PageContent.get_or_none(page_id = page.id)
+
     document = {
         "url_id": page.url,
         "url": page.url,
@@ -62,8 +64,8 @@ AND l.is_cross_host_like == 1""",
         "size": page.size,
         "indexed_at": datetime.utcnow(),
         "backlink_count": backlink_count,
-        "prompt": page.prompt,
-        "content": page.content,
+        "prompt": pagecontent.prompt,
+        "content": pagecontent.content
     }
     try:
         logging.debug("Adding document to index: %s", page.url);
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -14,7 +14,7 @@ import peewee
 from gus.excludes import EXCLUDED_URL_PREFIXES, EXCLUDED_URL_PATHS

 from . import constants
-from gus.lib.db_model import init_db, Page, Link
+from gus.lib.db_model import init_db, Page, PageContent, Link
 from gus.lib.gemini import GeminiResource, GeminiRobotFileParser
 import gus.lib.logging
 from gus.lib.logging import strip_control_chars
@@ -164,7 +164,6 @@ def index_prompt(resource, response):
         "content_type": "input",
         "charset": response.charset,
         "size": response.num_bytes,
-        "prompt": response.prompt,
         "change_frequency": resource.get_default_change_frequency("prompt"),
         "last_crawl_at": datetime.utcnow(),
         "last_crawl_success_at": datetime.utcnow(),
@@ -189,6 +188,17 @@ def index_prompt(resource, response):
     page = Page(**doc)
     try:
         page.save()
+        content = {
+            "page_id": page.id,
+            "prompt": response.prompt,
+            "content": None
+        }
+        existing_pagecontent = PageContent.get_or_none(page_id=page.id)
+        if existing_pagecontent:
+            content["id"] = existing_pagecontent.id
+
+        pagecontent = PageContent(**content)
+        pagecontent.save()
     except:
         logging.error("Error adding page: %s",
                       strip_control_chars(resource.normalized_url))
@@ -208,7 +218,6 @@ def index_content(resource, response):
         "port": resource.urlsplit.port or 1965,
         "content_type": response.content_type,
         "charset": response.charset,
-        "content": response.content if response.num_bytes <= constants.MAXIMUM_TEXT_PAGE_SIZE else None,
         "size": response.num_bytes,
         "change_frequency": resource.get_default_change_frequency("content"),
         "last_crawl_at": datetime.utcnow(),
@@ -243,6 +252,18 @@ def index_content(resource, response):
     page = Page(**doc)
     try:
         page.save()
+        if response.num_bytes <= constants.MAXIMUM_TEXT_PAGE_SIZE:
+            content = {
+                "page_id": page.id,
+                "prompt": None,
+                "content": response.content
+            }
+            existing_pagecontent = PageContent.get_or_none(page_id=page.id)
+            if existing_pagecontent:
+                content["id"] = existing_pagecontent.id
+
+            pagecontent = PageContent(**content)
+            pagecontent.save()
     except:
         logging.error("Error adding page: %s",
                       strip_control_chars(resource.normalized_url))
diff --git a/gus/lib/db_model.py b/gus/lib/db_model.py
@@ -17,7 +17,7 @@ def init_db(filename=":memory:"):
     """
     Bind an SQLite database to the Peewee ORM models.
     """
-    models = [Link, Page]
+    models = [Link, Page, PageContent]
     db = SqliteDatabase(filename, pragmas={
         'journal_mode': 'wal',
         'cache_size': -128 * 1000,
@@ -30,7 +30,7 @@ def init_db(filename=":memory:"):

 class Page(Model):
     """
-    All the pages crawled in Geminispace
+    Metadata of all the pages
     """

     url = TextField(unique=True, index=True)
@@ -41,8 +41,6 @@ class Page(Model):
     charset = TextField(null=True)
     # TODO: normalize lang out to handle multiple values better
     lang = TextField(null=True)
-    content = TextField(null=True)
-    prompt = TextField(null=True)
     size = IntegerField(null=True) # in bytes
     change_frequency = IntegerField(null=True) # in hours
     indexed_at = DateTimeField(null=True)
@@ -58,6 +56,13 @@ class Page(Model):
         (('last_crawl_at', 'last_crawl_success_at'), False)
     )

+class PageContent(Model):
+    """
+    Content of all pages
+    """
+    page = ForeignKeyField(Page, backref="page_content", on_delete="CASCADE")
+    content = TextField(null=True)
+    prompt = TextField(null=True)

 class Link(Model):
     """