commit a3fef86b23fab26e13a97c281ef55f22cd444a19
parent 613583f8ee6134346ffd8469e6761d98537b4693
Author: Natalie Pendragon <natpen@natpen.net>
Date: Mon, 6 Jul 2020 06:20:01 -0400
DRY up the sqlite model and init_db code
Diffstat:
5 files changed, 54 insertions(+), 91 deletions(-)
diff --git a/gus/build_index.py b/gus/build_index.py
@@ -28,9 +28,10 @@ from whoosh.query import Every
from whoosh.qparser import QueryParser
from whoosh.writing import BufferedWriter
+from gus.lib.db_model import init_db, Page, Link
+from gus.lib.gemini import GeminiResource, GeminiRobotFileParser
from gus.lib.index_statistics import compute_index_statistics, persist_statistics, print_index_statistics
from gus.lib.whoosh_extensions import UrlAnalyzer
-from gus.lib.gemini import GeminiResource, GeminiRobotFileParser
# hack(natpen): the built-in methods in urllib need to know the
# Gemini protocol exists
@@ -137,43 +138,6 @@ def index_page(page):
index_writer.add_document(**document)
-class Page(Model):
- """
- All the pages crawled in Geminispace
- """
-
- url = TextField(unique=True, index=True)
- fetchable_url = TextField(null=True)
- domain = TextField(null=True)
- port = IntegerField(null=True)
- content_type = TextField(null=True)
- charset = TextField(null=True)
- # TODO: normalize lang out to handle multiple values better
- lang = TextField(null=True)
- content = TextField(null=True)
- prompt = TextField(null=True)
- size = IntegerField(null=True) # in bytes
- indexed_at = DateTimeField(null=True)
-
-
-class Link(Model):
- """
- Hyperlinks between pages in Geminispace
- """
-
- from_page = ForeignKeyField(Page, backref="outbound_links")
- to_page = ForeignKeyField(Page, backref="backlinks")
-
-
-def init_db(filename=":memory:"):
- """
- Bind an SQLite database to the Peewee ORM models.
- """
- models = [Page, Link]
- db = SqliteDatabase(filename)
- db.bind(models)
- db.create_tables(models)
- return db
def build_index(should_run_destructive=False):
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -21,6 +21,7 @@ from peewee import (
TextField,
)
+from gus.lib.db_model import init_db, Page, Link
from gus.lib.gemini import GeminiResource, GeminiRobotFileParser
# hack(natpen): the built-in methods in urllib need to know the
@@ -403,45 +404,6 @@ def load_seed_request_urls():
return content
-class Page(Model):
- """
- All the pages crawled in Geminispace
- """
-
- url = TextField(unique=True, index=True)
- fetchable_url = TextField(null=True)
- domain = TextField(null=True)
- port = IntegerField(null=True)
- content_type = TextField(null=True)
- charset = TextField(null=True)
- # TODO: normalize lang out to handle multiple values better
- lang = TextField(null=True)
- content = TextField(null=True)
- prompt = TextField(null=True)
- size = IntegerField(null=True) # in bytes
- indexed_at = DateTimeField(null=True)
-
-
-class Link(Model):
- """
- Hyperlinks between pages in Geminispace
- """
-
- from_page = ForeignKeyField(Page, backref="outbound_links")
- to_page = ForeignKeyField(Page, backref="backlinks")
-
-
-def init_db(filename=":memory:"):
- """
- Bind an SQLite database to the Peewee ORM models.
- """
- models = [Page, Link]
- db = SqliteDatabase(filename)
- db.bind(models)
- db.create_tables(models)
- return db
-
-
def run_crawl(should_run_destructive=False, seed_urls=[]):
# TODO: track failed domain/page attempts, and don't reattempt for 15mins
diff --git a/gus/lib/db_model.py b/gus/lib/db_model.py
@@ -0,0 +1,48 @@
+from peewee import (
+ BooleanField,
+ DateTimeField,
+ DoesNotExist,
+ FloatField,
+ ForeignKeyField,
+ IntegerField,
+ Model,
+ SqliteDatabase,
+ TextField,
+)
+
+def init_db(filename=":memory:"):
+ """
+ Bind Page and Link to the SQLite database `filename`, create their tables, and return the database.
+ """
+ models = [Page, Link]
+ db = SqliteDatabase(filename)
+ db.bind(models)
+ db.create_tables(models)
+ return db
+
+
+class Page(Model):
+ """
+ One row per page crawled in Geminispace; `url` is unique and indexed
+ """
+
+ url = TextField(unique=True, index=True)
+ fetchable_url = TextField(null=True)
+ domain = TextField(null=True)
+ port = IntegerField(null=True)
+ content_type = TextField(null=True)
+ charset = TextField(null=True)
+ # TODO: normalize lang out to handle multiple values better
+ lang = TextField(null=True)
+ content = TextField(null=True)
+ prompt = TextField(null=True)
+ size = IntegerField(null=True) # in bytes
+ indexed_at = DateTimeField(null=True)
+
+class Link(Model):
+ """
+ Hyperlink from one Page (from_page) to another (to_page) in Geminispace; backrefs: outbound_links, backlinks
+ """
+
+ from_page = ForeignKeyField(Page, backref="outbound_links")
+ to_page = ForeignKeyField(Page, backref="backlinks")
diff --git a/gus/lib/index_statistics.py b/gus/lib/index_statistics.py
@@ -6,7 +6,7 @@ from whoosh.index import open_dir
from whoosh.qparser import QueryParser
from whoosh.query import Every
-from gus.crawl import Page, Link
+from gus.lib.db_model import Page, Link
from gus.lib.gemini import GeminiResource
def compute_index_statistics(db):
diff --git a/serve/models.py b/serve/models.py
@@ -6,7 +6,7 @@ from whoosh import highlight, qparser
from whoosh.index import open_dir
from . import constants
-from gus.crawl import Page, Link
+from gus.lib.db_model import init_db, Page, Link
from gus.lib.gemini import GeminiResource
from gus.lib.index_statistics import compute_index_statistics, load_all_statistics_from_file
from gus.lib.misc import bytes2human
@@ -24,22 +24,11 @@ class GUS():
order=highlight.SCORE,
)
- self.db = GUS.init_db(f"{constants.INDEX_DIR}/{constants.DB_FILENAME}")
+ self.db = init_db(f"{constants.INDEX_DIR}/{constants.DB_FILENAME}")
self.statistics = compute_index_statistics(self.db)
self.statistics_historical_overall = load_all_statistics_from_file(constants.STATISTICS_FILE)
- def init_db(filename = ":memory:"):
- """
- Bind an SQLite database to the Peewee ORM models.
- """
- models = [Page, Link]
- db = SqliteDatabase(filename)
- db.bind(models)
- db.create_tables(models)
- return db
-
-
def init_query_parser(ix):
or_group = qparser.OrGroup.factory(0.99)
query_parser = qparser.MultifieldParser(["content", "url", "prompt"], ix.schema, group=or_group)