geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit a3fef86b23fab26e13a97c281ef55f22cd444a19
parent 613583f8ee6134346ffd8469e6761d98537b4693
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Mon,  6 Jul 2020 06:20:01 -0400

DRY up the sqlite model and init_db code

Diffstat:
M gus/build_index.py | 40 ++--------------------------------------
M gus/crawl.py | 40 +---------------------------------------
A gus/lib/db_model.py | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
M gus/lib/index_statistics.py | 2 +-
M serve/models.py | 15 ++-------------
5 files changed, 54 insertions(+), 91 deletions(-)

diff --git a/gus/build_index.py b/gus/build_index.py @@ -28,9 +28,10 @@ from whoosh.query import Every from whoosh.qparser import QueryParser from whoosh.writing import BufferedWriter +from gus.lib.db_model import init_db, Page, Link +from gus.lib.gemini import GeminiResource, GeminiRobotFileParser from gus.lib.index_statistics import compute_index_statistics, persist_statistics, print_index_statistics from gus.lib.whoosh_extensions import UrlAnalyzer -from gus.lib.gemini import GeminiResource, GeminiRobotFileParser # hack(natpen): the built-in methods in urllib need to know the # Gemini protocol exists @@ -137,43 +138,6 @@ def index_page(page): index_writer.add_document(**document) -class Page(Model): - """ - All the pages crawled in Geminispace - """ - - url = TextField(unique=True, index=True) - fetchable_url = TextField(null=True) - domain = TextField(null=True) - port = IntegerField(null=True) - content_type = TextField(null=True) - charset = TextField(null=True) - # TODO: normalize lang out to handle multiple values better - lang = TextField(null=True) - content = TextField(null=True) - prompt = TextField(null=True) - size = IntegerField(null=True) # in bytes - indexed_at = DateTimeField(null=True) - - -class Link(Model): - """ - Hyperlinks between pages in Geminispace - """ - - from_page = ForeignKeyField(Page, backref="outbound_links") - to_page = ForeignKeyField(Page, backref="backlinks") - - -def init_db(filename=":memory:"): - """ - Bind an SQLite database to the Peewee ORM models. 
- """ - models = [Page, Link] - db = SqliteDatabase(filename) - db.bind(models) - db.create_tables(models) - return db def build_index(should_run_destructive=False): diff --git a/gus/crawl.py b/gus/crawl.py @@ -21,6 +21,7 @@ from peewee import ( TextField, ) +from gus.lib.db_model import init_db, Page, Link from gus.lib.gemini import GeminiResource, GeminiRobotFileParser # hack(natpen): the built-in methods in urllib need to know the @@ -403,45 +404,6 @@ def load_seed_request_urls(): return content -class Page(Model): - """ - All the pages crawled in Geminispace - """ - - url = TextField(unique=True, index=True) - fetchable_url = TextField(null=True) - domain = TextField(null=True) - port = IntegerField(null=True) - content_type = TextField(null=True) - charset = TextField(null=True) - # TODO: normalize lang out to handle multiple values better - lang = TextField(null=True) - content = TextField(null=True) - prompt = TextField(null=True) - size = IntegerField(null=True) # in bytes - indexed_at = DateTimeField(null=True) - - -class Link(Model): - """ - Hyperlinks between pages in Geminispace - """ - - from_page = ForeignKeyField(Page, backref="outbound_links") - to_page = ForeignKeyField(Page, backref="backlinks") - - -def init_db(filename=":memory:"): - """ - Bind an SQLite database to the Peewee ORM models. - """ - models = [Page, Link] - db = SqliteDatabase(filename) - db.bind(models) - db.create_tables(models) - return db - - def run_crawl(should_run_destructive=False, seed_urls=[]): # TODO: track failed domain/page attempts, and don't reattempt for 15mins diff --git a/gus/lib/db_model.py b/gus/lib/db_model.py @@ -0,0 +1,48 @@ +from peewee import ( + BooleanField, + DateTimeField, + DoesNotExist, + FloatField, + ForeignKeyField, + IntegerField, + Model, + SqliteDatabase, + TextField, +) + +def init_db(filename=":memory:"): + """ + Bind an SQLite database to the Peewee ORM models. 
+ """ + models = [Page, Link] + db = SqliteDatabase(filename) + db.bind(models) + db.create_tables(models) + return db + + +class Page(Model): + """ + All the pages crawled in Geminispace + """ + + url = TextField(unique=True, index=True) + fetchable_url = TextField(null=True) + domain = TextField(null=True) + port = IntegerField(null=True) + content_type = TextField(null=True) + charset = TextField(null=True) + # TODO: normalize lang out to handle multiple values better + lang = TextField(null=True) + content = TextField(null=True) + prompt = TextField(null=True) + size = IntegerField(null=True) # in bytes + indexed_at = DateTimeField(null=True) + +class Link(Model): + """ + Hyperlinks between pages in Geminispace + """ + + from_page = ForeignKeyField(Page, backref="outbound_links") + to_page = ForeignKeyField(Page, backref="backlinks") diff --git a/gus/lib/index_statistics.py b/gus/lib/index_statistics.py @@ -6,7 +6,7 @@ from whoosh.index import open_dir from whoosh.qparser import QueryParser from whoosh.query import Every -from gus.crawl import Page, Link +from gus.lib.db_model import Page, Link from gus.lib.gemini import GeminiResource def compute_index_statistics(db): diff --git a/serve/models.py b/serve/models.py @@ -6,7 +6,7 @@ from whoosh import highlight, qparser from whoosh.index import open_dir from . 
import constants -from gus.crawl import Page, Link +from gus.lib.db_model import init_db, Page, Link from gus.lib.gemini import GeminiResource from gus.lib.index_statistics import compute_index_statistics, load_all_statistics_from_file from gus.lib.misc import bytes2human @@ -24,22 +24,11 @@ class GUS(): order=highlight.SCORE, ) - self.db = GUS.init_db(f"{constants.INDEX_DIR}/{constants.DB_FILENAME}") + self.db = init_db(f"{constants.INDEX_DIR}/{constants.DB_FILENAME}") self.statistics = compute_index_statistics(self.db) self.statistics_historical_overall = load_all_statistics_from_file(constants.STATISTICS_FILE) - def init_db(filename = ":memory:"): - """ - Bind an SQLite database to the Peewee ORM models. - """ - models = [Page, Link] - db = SqliteDatabase(filename) - db.bind(models) - db.create_tables(models) - return db - - def init_query_parser(ix): or_group = qparser.OrGroup.factory(0.99) query_parser = qparser.MultifieldParser(["content", "url", "prompt"], ix.schema, group=or_group)