geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit 93722e67599abd7a2372f2324cd6e8925f159112
parent 632a4cb16c0d114b83b105cf00d6743649f2d40d
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Tue, 28 Jul 2020 07:04:06 -0400

[crawl] Add change_frequency backoff

Diffstat:
M gus/crawl.py | 17 +++++++++++++----
1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/gus/crawl.py b/gus/crawl.py
@@ -158,6 +158,8 @@ def index_binary(resource, response):
     existing_page = Page.get_or_none(url=resource.indexable_url)
     if existing_page:
         doc["id"] = existing_page.id
+        if existing_page.change_frequency:
+            doc["change_frequency"] = existing_page.change_frequency + 24 * 7
         doc["change_frequency"] = existing_page.change_frequency or doc["change_frequency"]
     page = Page(**doc)
     page.save()
@@ -177,7 +179,8 @@ def index_redirect(resource):
     existing_page = Page.get_or_none(url=resource.indexable_url)
     if existing_page:
         doc["id"] = existing_page.id
-        doc["change_frequency"] = existing_page.change_frequency or doc["change_frequency"]
+        if existing_page.change_frequency:
+            doc["change_frequency"] = existing_page.change_frequency + 24 * 2
     page = Page(**doc)
     page.save()
     return page
@@ -196,7 +199,11 @@ def index_error(resource, is_temporary):
     existing_page = Page.get_or_none(url=resource.indexable_url)
     if existing_page:
         doc["id"] = existing_page.id
-        doc["change_frequency"] = existing_page.change_frequency or doc["change_frequency"]
+        if existing_page.change_frequency:
+            if is_temporary:
+                doc["change_frequency"] = existing_page.change_frequency + 12
+            else:
+                doc["change_frequency"] = existing_page.change_frequency + 24 * 30
     page = Page(**doc)
     page.save()
     return page
@@ -220,7 +227,8 @@ def index_prompt(resource, response):
     existing_page = Page.get_or_none(url=resource.indexable_url)
     if existing_page:
         doc["id"] = existing_page.id
-        doc["change_frequency"] = existing_page.change_frequency or doc["change_frequency"]
+        if existing_page.change_frequency:
+            doc["change_frequency"] = existing_page.change_frequency + 24 * 7
     page = Page(**doc)
     page.save()
     return page
@@ -247,8 +255,9 @@ def index_content(resource, response):
     is_different = False
     if existing_page:
         doc["id"] = existing_page.id
-        doc["change_frequency"] = existing_page.change_frequency or doc["change_frequency"]
         is_different = existing_page.content is not None and doc["content"] != existing_page.content
+        if existing_page.change_frequency and is_different and not (resource.is_root_like or resource.is_log_root_like):
+            doc["change_frequency"] = existing_page.change_frequency + 24 * 3
     page = Page(**doc)
     page.save()
     return page, is_different