commit 93722e67599abd7a2372f2324cd6e8925f159112
parent 632a4cb16c0d114b83b105cf00d6743649f2d40d
Author: Natalie Pendragon <natpen@natpen.net>
Date: Tue, 28 Jul 2020 07:04:06 -0400
[crawl] Add change_frequency backoff
Diffstat:
1 file changed, 13 insertions(+), 4 deletions(-)
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -158,6 +158,8 @@ def index_binary(resource, response):
existing_page = Page.get_or_none(url=resource.indexable_url)
if existing_page:
doc["id"] = existing_page.id
+ if existing_page.change_frequency:
+ doc["change_frequency"] = existing_page.change_frequency + 24 * 7
doc["change_frequency"] = existing_page.change_frequency or doc["change_frequency"]
page = Page(**doc)
page.save()
@@ -177,7 +179,8 @@ def index_redirect(resource):
existing_page = Page.get_or_none(url=resource.indexable_url)
if existing_page:
doc["id"] = existing_page.id
- doc["change_frequency"] = existing_page.change_frequency or doc["change_frequency"]
+ if existing_page.change_frequency:
+ doc["change_frequency"] = existing_page.change_frequency + 24 * 2
page = Page(**doc)
page.save()
return page
@@ -196,7 +199,11 @@ def index_error(resource, is_temporary):
existing_page = Page.get_or_none(url=resource.indexable_url)
if existing_page:
doc["id"] = existing_page.id
- doc["change_frequency"] = existing_page.change_frequency or doc["change_frequency"]
+ if existing_page.change_frequency:
+ if is_temporary:
+ doc["change_frequency"] = existing_page.change_frequency + 12
+ else:
+ doc["change_frequency"] = existing_page.change_frequency + 24 * 30
page = Page(**doc)
page.save()
return page
@@ -220,7 +227,8 @@ def index_prompt(resource, response):
existing_page = Page.get_or_none(url=resource.indexable_url)
if existing_page:
doc["id"] = existing_page.id
- doc["change_frequency"] = existing_page.change_frequency or doc["change_frequency"]
+ if existing_page.change_frequency:
+ doc["change_frequency"] = existing_page.change_frequency + 24 * 7
page = Page(**doc)
page.save()
return page
@@ -247,8 +255,9 @@ def index_content(resource, response):
is_different = False
if existing_page:
doc["id"] = existing_page.id
- doc["change_frequency"] = existing_page.change_frequency or doc["change_frequency"]
is_different = existing_page.content is not None and doc["content"] != existing_page.content
+ if existing_page.change_frequency and is_different and not (resource.is_root_like or resource.is_log_root_like):
+ doc["change_frequency"] = existing_page.change_frequency + 24 * 3
page = Page(**doc)
page.save()
return page, is_different