commit d4093761e14f5f105f1b51f53cdeac814640b19b
parent 123895e2f0dfff0c35a023e037b98e3c22f12d35
Author: René Wagner <rwa@clttr.info>
Date: Fri, 9 Jul 2021 22:05:55 +0200
improve indexing speed via optimized backlinks query
The query that calculates backlinks caused massive delays during indexing.
An unused join on `indexable_crawl` caused this behavior.
After removing the join, indexing is fast again.
Diffstat:
3 files changed, 4 insertions(+), 17 deletions(-)
diff --git a/gus/build_index.py b/gus/build_index.py
@@ -44,12 +44,8 @@ def index_page(index, page, indexed_urls):
external_backlinks = Page.raw(
"""SELECT p_from.url
FROM page AS p_from
-JOIN indexable_crawl AS ic
-ON ic.page_id == p_from.id
-JOIN link as l
-ON l.from_page_id == p_from.id
-JOIN page as p_to
-ON p_to.id == l.to_page_id
+JOIN link as l ON l.from_page_id == p_from.id
+JOIN page as p_to ON p_to.id == l.to_page_id
WHERE p_to.url IN (?, ?)
AND l.is_cross_host_like == 1
GROUP BY p_from.normalized_url""",
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -109,11 +109,6 @@ def index_redirect(resource):
def index_error(resource, is_temporary):
- logging.debug(
- "Indexing error for: %s",
- gus.lib.logging.strip_control_chars(resource.indexable_url),
- )
-
category = "temp_error" if is_temporary else "perm_error"
default_change_frequency = resource.get_default_change_frequency(category)
doc = {
diff --git a/serve/models.py b/serve/models.py
@@ -56,12 +56,8 @@ class GUS:
backlinks_query = Page.raw(
"""SELECT p_from.url, l.is_cross_host_like
FROM page AS p_from
-JOIN indexable_crawl AS ic
-ON ic.page_id == p_from.id
-JOIN link as l
-ON l.from_page_id == p_from.id
-JOIN page as p_to
-ON p_to.id == l.to_page_id
+JOIN link as l ON l.from_page_id == p_from.id
+JOIN page as p_to ON p_to.id == l.to_page_id
WHERE p_to.url IN (?, ?)
AND p_from.normalized_url != ?
GROUP BY p_from.normalized_url