geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit 2174883dd629450213786b2997ebb7b450fe385e
parent cd8b669e4d7453d1bc4d49eecb378a386d19db1b
Author: René Wagner <rwa@clttr.info>
Date:   Fri,  6 Jan 2023 10:09:36 +0100

update db schema for improved query performance

Diffstat:
M gus/build_index.py | 4 ++--
M gus/crawl.py | 5 +----
M gus/lib/db_model.py | 5 +++--
M serve/models.py | 12 ++++--------
M serve/templates/news.gmi | 6 ++++++
5 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/gus/build_index.py b/gus/build_index.py
@@ -88,7 +88,7 @@ def build_index(should_run_destructive=False):
 
     # delete pages that never successfully crawled
     count=0
-    q = Page.select().where(Page.last_crawl_success_at.is_null(True) & Page.last_crawl_at.is_null(False))
+    q = Page.select(Page.url).where(Page.last_crawl_success_at.is_null(True) & Page.last_crawl_at.is_null(False))
     for page in q.iterator():
         try:
             index.delete_by_term("url_id", page.url)
@@ -120,7 +120,7 @@ def build_index(should_run_destructive=False):
     # delete pages with last crawl success older than 30 days which have been recrawled since then
     # this avoids deletion of files that have a change_frequency longer than our timeout
     count=0
-    q = Page.select().where((Page.last_crawl_at > Page.last_crawl_success_at) & (Page.last_crawl_success_at < (datetime.now() + timedelta(days=-30))))
+    q = Page.select(Page.url).where((Page.last_crawl_at > Page.last_crawl_success_at) & (Page.last_crawl_success_at < (datetime.now() + timedelta(days=-30))))
     for page in q.iterator():
         try:
             index.delete_by_term("url_id", page.url)
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -514,10 +514,7 @@ def crawl_page(
 
 
 def load_expired_urls():
-    expired_pages = Page.raw(
-        """SELECT p.url
-        FROM page as p
-        WHERE datetime(last_crawl_at, REPLACE('fnord hours', 'fnord', change_frequency)) < datetime('now') OR last_crawl_at IS NULL""" )
+    expired_pages = Page.select(Page.url).where((Page.last_crawl_at < (datetime.now() + timedelta(hours=(Page.change_frequency * -1)))) | Page.last_crawl_at.is_null(True))
     return [page.url for page in expired_pages.execute()]
 
 def load_seed_request_urls():
diff --git a/gus/lib/db_model.py b/gus/lib/db_model.py
@@ -35,7 +35,7 @@ class Page(Model):
     url = TextField(unique=True, index=True)
     fetchable_url = TextField(null=True)
-    domain = TextField(null=True)
+    domain = TextField(null=True, index=True)
     port = IntegerField(null=True)
     content_type = TextField(null=True)
     charset = TextField(null=True)
@@ -54,7 +54,8 @@ class Page(Model):
     first_seen_at = DateTimeField(null=True)
 
     class Meta:
         indexes=(
-            (('last_crawl_success_at', 'last_success_status', 'first_seen_at', 'domain'), False),
+            (('last_success_status', 'first_seen_at', 'indexed_at', 'domain', 'url', 'content_type', 'fetchable_url'), False),
+            (('last_crawl_at', 'last_crawl_success_at'), False)
         )
diff --git a/serve/models.py b/serve/models.py
@@ -25,8 +25,7 @@ class GUS:
         """
         SELECT DISTINCT p.domain
         FROM page AS p
-        WHERE last_crawl_success_at IS NOT NULL
-        AND last_success_status = 20
+        WHERE last_success_status = 20
         ORDER BY p.domain
         """
         )
@@ -36,8 +35,7 @@ class GUS:
         """
        SELECT p.domain, p.first_seen_at
         FROM page AS p
-        WHERE last_crawl_success_at IS NOT NULL
-        AND last_success_status = 20
+        WHERE last_success_status = 20
         AND first_seen_at IS NOT NULL
         GROUP BY p.domain
         ORDER BY first_seen_at DESC
@@ -48,8 +46,7 @@ class GUS:
         newest_pages_query = Page.raw(
             """SELECT p.url, p.fetchable_url, p.first_seen_at
             FROM page as p
-            WHERE last_crawl_success_at IS NOT NULL
-            AND last_success_status = 20
+            WHERE last_success_status = 20
             AND first_seen_at IS NOT NULL
             ORDER BY first_seen_at DESC
             LIMIT 50""")
@@ -58,8 +55,7 @@ class GUS:
         feeds_query = Page.raw(
             """SELECT DISTINCT p.url, p.fetchable_url
             FROM page AS p
-            WHERE p.last_crawl_success_at IS NOT NULL
-            AND last_success_status = 20
+            WHERE last_success_status = 20
             AND (p.url LIKE '%atom.xml'
             OR p.url LIKE '%feed.xml'
             OR p.url LIKE '%.rss'
diff --git a/serve/templates/news.gmi b/serve/templates/news.gmi
@@ -2,6 +2,12 @@
 ## News
 
+### 2023-01-05
+I've made some adjustments to the underlying database schema for some major performance improvements. This mostly helps when we update the index or restart the server; it does not affect searching on geminispace.info.
+
+Due to the announced price increases from our hosting provider I'm thinking about hosting geminispace.info on a spare RasPi at home. The downside would be that geminispace.info would change its IP every 24 hours, so IP-based blocking of the crawler would be impossible.
+Do you think this is acceptable? Feedback welcome.
+
 ### 2023-01-01
 Happy new year everyone, hope you're all doing well. Our provider (netcup.de) announced a price increase, so we may migrate to another provider some day in June. We need at least 2 vCPUs (4 would be better), 8 GB of RAM and at least 100 GB of block storage. Suggestions welcome.
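
A note on the schema change, for readers unfamiliar with peewee: each (columns, unique) tuple in Meta.indexes becomes one composite index on the page table, so this commit swaps the old four-column index for a wide covering index plus a small two-column one. Below is a minimal sketch that prints the resulting DDL; the in-memory database and the trimmed-down Page model are illustrative assumptions, not the project's real setup.

```python
from peewee import (DateTimeField, IntegerField, Model,
                    SqliteDatabase, TextField)

db = SqliteDatabase(":memory:")  # assumption: stand-in for the real database

class Page(Model):
    # trimmed to the columns the new indexes and queries reference
    url = TextField(unique=True, index=True)
    fetchable_url = TextField(null=True)
    domain = TextField(null=True, index=True)
    content_type = TextField(null=True)
    change_frequency = IntegerField(null=True)
    last_crawl_at = DateTimeField(null=True)
    last_crawl_success_at = DateTimeField(null=True)
    last_success_status = IntegerField(null=True)
    indexed_at = DateTimeField(null=True)
    first_seen_at = DateTimeField(null=True)

    class Meta:
        database = db
        indexes = (
            # (columns, unique) -- False means a plain, non-unique index
            (('last_success_status', 'first_seen_at', 'indexed_at', 'domain',
              'url', 'content_type', 'fetchable_url'), False),
            (('last_crawl_at', 'last_crawl_success_at'), False),
        )

db.create_tables([Page])
# show every CREATE INDEX statement peewee emitted
for (ddl,) in db.execute_sql(
        "SELECT sql FROM sqlite_master WHERE type = 'index' AND sql IS NOT NULL"):
    print(ddl)
```

The wide index lists exactly the columns the serve/models.py queries filter on and return, so SQLite can presumably answer them from the index alone instead of reading table rows, which fits the stated goal of faster queries.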
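
One caveat with the rewritten load_expired_urls: timedelta(hours=...) needs a plain number, so handing it a Page.change_frequency column expression would fail at runtime, whereas the old raw SQL let SQLite compute the interval per row. Below is a sketch of the same per-row arithmetic expressed through peewee operators; the concat/fn.datetime formulation is my assumption, not code from this repository.

```python
from peewee import fn

def load_expired_urls():
    # Build the SQLite modifier "<change_frequency> hours" per row, mirroring
    # the old REPLACE('fnord hours', 'fnord', change_frequency) trick.
    interval = Page.change_frequency.concat(' hours')
    expired = Page.select(Page.url).where(
        (fn.datetime(Page.last_crawl_at, interval) < fn.datetime('now'))
        | Page.last_crawl_at.is_null(True)
    )
    return [page.url for page in expired]
```

Because the comparison runs in SQL, every page is checked against its own change_frequency, and pages that were never crawled (last_crawl_at IS NULL) are still included.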