commit a9e9cf27d5460a956444d03cef784da70fb856bd
parent 87ef15df2ebeb3bd17e3a69fb296f0ed657c4814
Author: René Wagner <rwa@clttr.info>
Date: Mon, 12 Jul 2021 14:58:33 +0200
some tweaks to indexing
- simplify backlinks counter query
- only count successfully crawled domains as known domains
- increase default root recrawl time
Diffstat:
3 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/gus/build_index.py b/gus/build_index.py
@@ -40,10 +40,9 @@ def index_page(index, page):
FROM page AS p_from
JOIN link as l ON l.from_page_id == p_from.id
JOIN page as p_to ON p_to.id == l.to_page_id
-WHERE p_to.url IN (?, ?)
+WHERE p_to.url == ?
AND l.is_cross_host_like == 1""",
- u,
- f"{u}/",
+ u
)
logging.debug("Calculating backlinks for %s", u)
diff --git a/gus/constants.py b/gus/constants.py
@@ -6,10 +6,10 @@ DB_FILENAME = "gus.sqlite"
FEED_FILE = "feeds.txt"
MAXIMUM_REDIRECT_CHAIN_LENGTH = 5
MAXIMUM_FAILED_REQUEST_COUNT = 5
-MAXIMUM_TEXT_PAGE_SIZE = 1000000 # 1000KB, in bytes
+MAXIMUM_TEXT_PAGE_SIZE = 1024000 # 1000KB, in bytes
# default change frequencies (in hours)
-ROOT_CHANGE_FREQUENCY_DEFAULT = 12
+ROOT_CHANGE_FREQUENCY_DEFAULT = 24
ROOT_CHANGE_FREQUENCY_INCREMENT = 1
NON_ROOT_CHANGE_FREQUENCY_DEFAULT = 24 * 7
diff --git a/gus/lib/index_statistics.py b/gus/lib/index_statistics.py
@@ -9,7 +9,8 @@ from gus.lib.db_model import Page
def compute_index_statistics(db):
page_count = len(Page.raw("""SELECT DISTINCT p.id
-FROM page AS p""").dicts())
+FROM page AS p
+WHERE last_crawl_success_at IS NOT NULL""").dicts())
domains_query = Page.raw("""SELECT DISTINCT p.domain, p.port
FROM page AS p