geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit d4093761e14f5f105f1b51f53cdeac814640b19b
parent 123895e2f0dfff0c35a023e037b98e3c22f12d35
Author: René Wagner <rwa@clttr.info>
Date:   Fri,  9 Jul 2021 22:05:55 +0200

improve indexing speed via optimized backlinks query

The query that calculates backlinks caused massive delays during indexing.
An unused join on `indexable_crawl` was responsible for the slowdown.

With the join removed, indexing is fast again.

Diffstat:
M gus/build_index.py | 8 ++------
M gus/crawl.py       | 5 -----
M serve/models.py    | 8 ++------
3 files changed, 4 insertions(+), 17 deletions(-)

diff --git a/gus/build_index.py b/gus/build_index.py @@ -44,12 +44,8 @@ def index_page(index, page, indexed_urls): external_backlinks = Page.raw( """SELECT p_from.url FROM page AS p_from -JOIN indexable_crawl AS ic -ON ic.page_id == p_from.id -JOIN link as l -ON l.from_page_id == p_from.id -JOIN page as p_to -ON p_to.id == l.to_page_id +JOIN link as l ON l.from_page_id == p_from.id +JOIN page as p_to ON p_to.id == l.to_page_id WHERE p_to.url IN (?, ?) AND l.is_cross_host_like == 1 GROUP BY p_from.normalized_url""", diff --git a/gus/crawl.py b/gus/crawl.py @@ -109,11 +109,6 @@ def index_redirect(resource): def index_error(resource, is_temporary): - logging.debug( - "Indexing error for: %s", - gus.lib.logging.strip_control_chars(resource.indexable_url), - ) - category = "temp_error" if is_temporary else "perm_error" default_change_frequency = resource.get_default_change_frequency(category) doc = { diff --git a/serve/models.py b/serve/models.py @@ -56,12 +56,8 @@ class GUS: backlinks_query = Page.raw( """SELECT p_from.url, l.is_cross_host_like FROM page AS p_from -JOIN indexable_crawl AS ic -ON ic.page_id == p_from.id -JOIN link as l -ON l.from_page_id == p_from.id -JOIN page as p_to -ON p_to.id == l.to_page_id +JOIN link as l ON l.from_page_id == p_from.id +JOIN page as p_to ON p_to.id == l.to_page_id WHERE p_to.url IN (?, ?) AND p_from.normalized_url != ? GROUP BY p_from.normalized_url