geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit 16fe7694d947472b0a0b60b2bdb57cbed61287ca
parent a8879dd5d3b05f96e5fdae17a3551d5bccd5aa76
Author: Rene Wagner <rwa@clttr.info>
Date:   Thu,  9 Mar 2023 11:32:05 +0000

fix storing of page content

Diffstat:
M gus/crawl.py | 37 +++++++++++++++++++------------------
1 file changed, 19 insertions(+), 18 deletions(-)

diff --git a/gus/crawl.py b/gus/crawl.py
@@ -207,7 +207,7 @@ def index_prompt(resource, response):

 def index_content(resource, response):
     logging.debug(
-        "Indexing content for: %s",
+        "Storing content for: %s",
         strip_control_chars(resource.normalized_url),
     )

@@ -235,20 +235,20 @@ def index_content(resource, response):
         doc["id"] = existing_page.id
         if not (existing_page.first_seen_at is None):
             doc["first_seen_at"] = existing_page.first_seen_at
-        if existing_page.content:
-            is_different = doc["content"] != existing_page.content
-        if is_different:
-            doc["change_frequency"] = resource.get_default_change_frequency(
-                "content"
-            )
-        else:
-            existing_change_frequency = (
-                existing_page.change_frequency
-                or resource.get_default_change_frequency("content")
-            )
-            doc["change_frequency"] = resource.increment_change_frequency(
-                existing_change_frequency, "content"
-            )
+
+        existing_pagecontent = PageContent.get_or_none(page_id=existing_page.id)
+        is_different = existing_pagecontent is None or doc["content"] != existing_pagecontent.content
+        if is_different:
+            doc["change_frequency"] = resource.get_default_change_frequency("content")
+        else:
+            existing_change_frequency = (
+                existing_page.change_frequency
+                or resource.get_default_change_frequency("content")
+            )
+            doc["change_frequency"] = resource.increment_change_frequency(
+                existing_change_frequency, "content"
+            )
+
     page = Page(**doc)
     try:
         page.save()
@@ -258,14 +258,15 @@ def index_content(resource, response):
             "prompt": None,
             "content": response.content
         }
+
         existing_pagecontent = PageContent.get_or_none(page_id=page.id)
         if existing_pagecontent:
             content["id"] = existing_pagecontent.id
-
+
         pagecontent = PageContent(**content)
         pagecontent.save()
-    except:
-        logging.error("Error adding page: %s", strip_control_chars(resource.normalized_url))
+    except Exception as e:
+        logging.error("Error adding page %s: %s", strip_control_chars(resource.normalized_url), e)

     return page, is_different