geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit a9e9cf27d5460a956444d03cef784da70fb856bd
parent 87ef15df2ebeb3bd17e3a69fb296f0ed657c4814
Author: René Wagner <rwa@clttr.info>
Date:   Mon, 12 Jul 2021 14:58:33 +0200

some tweaks to indexing

- simplify backlinks counter query
- only count successfully crawled domains as known domains
- increase default root recrawl time

Diffstat:
M gus/build_index.py | 5 ++---
M gus/constants.py | 4 ++--
M gus/lib/index_statistics.py | 3 ++-
3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/gus/build_index.py b/gus/build_index.py
@@ -40,10 +40,9 @@ def index_page(index, page):
 FROM page AS p_from
 JOIN link as l ON l.from_page_id == p_from.id
 JOIN page as p_to ON p_to.id == l.to_page_id
-WHERE p_to.url IN (?, ?)
+WHERE p_to.url == ?
 AND l.is_cross_host_like == 1""",
-        u,
-        f"{u}/",
+        u
     )
 
     logging.debug("Calculating backlinks for %s", u)
diff --git a/gus/constants.py b/gus/constants.py
@@ -6,10 +6,10 @@ DB_FILENAME = "gus.sqlite"
 FEED_FILE = "feeds.txt"
 MAXIMUM_REDIRECT_CHAIN_LENGTH = 5
 MAXIMUM_FAILED_REQUEST_COUNT = 5
-MAXIMUM_TEXT_PAGE_SIZE = 1000000 # 1000KB, in bytes
+MAXIMUM_TEXT_PAGE_SIZE = 1024000 # 1000KB, in bytes
 
 # default change frequencies (in hours)
-ROOT_CHANGE_FREQUENCY_DEFAULT = 12
+ROOT_CHANGE_FREQUENCY_DEFAULT = 24
 ROOT_CHANGE_FREQUENCY_INCREMENT = 1
 
 NON_ROOT_CHANGE_FREQUENCY_DEFAULT = 24 * 7
diff --git a/gus/lib/index_statistics.py b/gus/lib/index_statistics.py
@@ -9,7 +9,8 @@ from gus.lib.db_model import Page
 
 def compute_index_statistics(db):
     page_count = len(Page.raw("""SELECT DISTINCT p.id
-FROM page AS p""").dicts())
+FROM page AS p
+WHERE last_crawl_success_at IS NOT NULL""").dicts())
 
     domains_query = Page.raw("""SELECT DISTINCT p.domain, p.port
 FROM page AS p