commit 16fe7694d947472b0a0b60b2bdb57cbed61287ca
parent a8879dd5d3b05f96e5fdae17a3551d5bccd5aa76
Author: Rene Wagner <rwa@clttr.info>
Date: Thu, 9 Mar 2023 11:32:05 +0000
fix storing of page content
Diffstat:
1 file changed, 19 insertions(+), 18 deletions(-)
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -207,7 +207,7 @@ def index_prompt(resource, response):
def index_content(resource, response):
logging.debug(
- "Indexing content for: %s",
+ "Storing content for: %s",
strip_control_chars(resource.normalized_url),
)
@@ -235,20 +235,20 @@ def index_content(resource, response):
doc["id"] = existing_page.id
if not (existing_page.first_seen_at is None):
doc["first_seen_at"] = existing_page.first_seen_at
- if existing_page.content:
- is_different = doc["content"] != existing_page.content
- if is_different:
- doc["change_frequency"] = resource.get_default_change_frequency(
- "content"
- )
- else:
- existing_change_frequency = (
- existing_page.change_frequency
- or resource.get_default_change_frequency("content")
- )
- doc["change_frequency"] = resource.increment_change_frequency(
- existing_change_frequency, "content"
- )
+
+ existing_pagecontent = PageContent.get_or_none(page_id=existing_page.id)
+ is_different = existing_pagecontent is None or doc["content"] != existing_pagecontent.content
+ if is_different:
+ doc["change_frequency"] = resource.get_default_change_frequency("content")
+ else:
+ existing_change_frequency = (
+ existing_page.change_frequency
+ or resource.get_default_change_frequency("content")
+ )
+ doc["change_frequency"] = resource.increment_change_frequency(
+ existing_change_frequency, "content"
+ )
+
page = Page(**doc)
try:
page.save()
@@ -258,14 +258,15 @@ def index_content(resource, response):
"prompt": None,
"content": response.content
}
+
existing_pagecontent = PageContent.get_or_none(page_id=page.id)
if existing_pagecontent:
content["id"] = existing_pagecontent.id
-
+
pagecontent = PageContent(**content)
pagecontent.save()
- except:
- logging.error("Error adding page: %s", strip_control_chars(resource.normalized_url))
+ except Exception as e:
+ logging.error("Error adding page %s: %s", strip_control_chars(resource.normalized_url), e)
return page, is_different