commit a8879dd5d3b05f96e5fdae17a3551d5bccd5aa76
parent 739328b9e5aae6855f1cc92def9872dd5ba50741
Author: Rene Wagner <rwa@clttr.info>
Date: Wed, 8 Mar 2023 20:31:57 +0000
split page content into a separate table
Diffstat:
3 files changed, 38 insertions(+), 10 deletions(-)
diff --git a/gus/build_index.py b/gus/build_index.py
@@ -8,7 +8,7 @@ from peewee import fn
from . import constants
from gus.crawl import should_skip
from gus.excludes import EXCLUDED_URL_PREFIXES
-from gus.lib.db_model import init_db, Page
+from gus.lib.db_model import init_db, Page, PageContent
from gus.lib.gemini import GeminiResource
from gus.lib.index_statistics import (
compute_index_statistics,
@@ -50,6 +50,8 @@ AND l.is_cross_host_like == 1""",
backlink_urls = [b.url for b in external_backlinks.execute()]
backlink_count = len(backlink_urls)
+ pagecontent = PageContent.get_or_none(page_id = page.id)
+
document = {
"url_id": page.url,
"url": page.url,
@@ -62,8 +64,8 @@ AND l.is_cross_host_like == 1""",
"size": page.size,
"indexed_at": datetime.utcnow(),
"backlink_count": backlink_count,
- "prompt": page.prompt,
- "content": page.content,
+ "prompt": pagecontent.prompt,
+ "content": pagecontent.content
}
try:
logging.debug("Adding document to index: %s", page.url);
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -14,7 +14,7 @@ import peewee
from gus.excludes import EXCLUDED_URL_PREFIXES, EXCLUDED_URL_PATHS
from . import constants
-from gus.lib.db_model import init_db, Page, Link
+from gus.lib.db_model import init_db, Page, PageContent, Link
from gus.lib.gemini import GeminiResource, GeminiRobotFileParser
import gus.lib.logging
from gus.lib.logging import strip_control_chars
@@ -164,7 +164,6 @@ def index_prompt(resource, response):
"content_type": "input",
"charset": response.charset,
"size": response.num_bytes,
- "prompt": response.prompt,
"change_frequency": resource.get_default_change_frequency("prompt"),
"last_crawl_at": datetime.utcnow(),
"last_crawl_success_at": datetime.utcnow(),
@@ -189,6 +188,17 @@ def index_prompt(resource, response):
page = Page(**doc)
try:
page.save()
+ content = {
+ "page_id": page.id,
+ "prompt": response.prompt,
+ "content": None
+ }
+ existing_pagecontent = PageContent.get_or_none(page_id=page.id)
+ if existing_pagecontent:
+ content["id"] = existing_pagecontent.id
+
+ pagecontent = PageContent(**content)
+ pagecontent.save()
except:
logging.error("Error adding page: %s", strip_control_chars(resource.normalized_url))
@@ -208,7 +218,6 @@ def index_content(resource, response):
"port": resource.urlsplit.port or 1965,
"content_type": response.content_type,
"charset": response.charset,
- "content": response.content if response.num_bytes <= constants.MAXIMUM_TEXT_PAGE_SIZE else None,
"size": response.num_bytes,
"change_frequency": resource.get_default_change_frequency("content"),
"last_crawl_at": datetime.utcnow(),
@@ -243,6 +252,18 @@ def index_content(resource, response):
page = Page(**doc)
try:
page.save()
+ if response.num_bytes <= constants.MAXIMUM_TEXT_PAGE_SIZE:
+ content = {
+ "page_id": page.id,
+ "prompt": None,
+ "content": response.content
+ }
+ existing_pagecontent = PageContent.get_or_none(page_id=page.id)
+ if existing_pagecontent:
+ content["id"] = existing_pagecontent.id
+
+ pagecontent = PageContent(**content)
+ pagecontent.save()
except:
logging.error("Error adding page: %s", strip_control_chars(resource.normalized_url))
diff --git a/gus/lib/db_model.py b/gus/lib/db_model.py
@@ -17,7 +17,7 @@ def init_db(filename=":memory:"):
"""
Bind an SQLite database to the Peewee ORM models.
"""
- models = [Link, Page]
+ models = [Link, Page, PageContent]
db = SqliteDatabase(filename, pragmas={
'journal_mode': 'wal',
'cache_size': -128 * 1000,
@@ -30,7 +30,7 @@ def init_db(filename=":memory:"):
class Page(Model):
"""
- All the pages crawled in Geminispace
+ Metadata of all the pages
"""
url = TextField(unique=True, index=True)
@@ -41,8 +41,6 @@ class Page(Model):
charset = TextField(null=True)
# TODO: normalize lang out to handle multiple values better
lang = TextField(null=True)
- content = TextField(null=True)
- prompt = TextField(null=True)
size = IntegerField(null=True) # in bytes
change_frequency = IntegerField(null=True) # in hours
indexed_at = DateTimeField(null=True)
@@ -58,6 +56,13 @@ class Page(Model):
(('last_crawl_at', 'last_crawl_success_at'), False)
)
+class PageContent(Model):
+ """
+ Content of all pages
+ """
+ page = ForeignKeyField(Page, backref="page_content", on_delete="CASCADE")
+ content = TextField(null=True)
+ prompt = TextField(null=True)
class Link(Model):
"""