commit 05c5bd7b5d7c749c4fa495f21f9069f61c4f772a
parent acd728e7c407f72a0436f3a35a2658bfea3ce3f2
Author: René Wagner <rwa@clttr.info>
Date: Mon, 14 Jun 2021 09:13:51 +0200
error handling on page crawl save
Diffstat:
1 file changed, 29 insertions(+), 7 deletions(-)
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -387,7 +387,11 @@ def crawl_page(
page_crawl = Crawl(
page=page, status=0, is_different=False, timestamp=datetime.utcnow()
)
- page_crawl.save()
+ try:
+ page_crawl.save()
+ except:
+ logging.error("Error adding page_crawl: %s", gus.lib.logging.strip_control_chars(page.url))
+
failure_count[gr.normalized_host] = failure_count[gr.normalized_host] + 1 if gr.normalized_host in failure_count else 1
logging.debug("Failed request count for host %s is %d", gr.normalized_host, failure_count[gr.normalized_host])
return
@@ -409,7 +413,10 @@ def crawl_page(
error_message=response.error_message,
timestamp=datetime.utcnow(),
)
- page_crawl.save()
+ try:
+ page_crawl.save()
+ except:
+ logging.error("Error adding page_crawl: %s", gus.lib.logging.strip_control_chars(page.url))
elif response.status.startswith("5"):
# permanent error status
@@ -427,7 +434,10 @@ def crawl_page(
error_message=response.error_message,
timestamp=datetime.utcnow(),
)
- page_crawl.save()
+ try:
+ page_crawl.save()
+ except:
+ logging.error("Error adding page_crawl: %s", gus.lib.logging.strip_control_chars(page.url))
elif response.status.startswith("3"):
# redirect status
@@ -459,7 +469,10 @@ def crawl_page(
is_different=False,
timestamp=datetime.utcnow(),
)
- page_crawl.save()
+ try:
+ page_crawl.save()
+ except:
+ logging.error("Error adding page_crawl: %s", gus.lib.logging.strip_control_chars(page.url))
index_links(gr, [redirect_resource])
crawl_page(
redirect_resource,
@@ -483,7 +496,10 @@ def crawl_page(
is_different=False,
timestamp=datetime.utcnow(),
)
- page_crawl.save()
+ try:
+ page_crawl.save()
+ except:
+ logging.error("Error adding page_crawl: %s", gus.lib.logging.strip_control_chars(page.url))
elif response.status.startswith("2"):
# success status
logging.debug(
@@ -500,7 +516,10 @@ def crawl_page(
is_different=is_different,
timestamp=datetime.utcnow(),
)
- page_crawl.save()
+ try:
+ page_crawl.save()
+ except:
+ logging.error("Error adding page_crawl: %s", gus.lib.logging.strip_control_chars(page.url))
if response.content_type != "text/gemini":
logging.debug(
"Content is not gemini text: %s: %s",
@@ -526,7 +545,10 @@ def crawl_page(
is_different=False,
timestamp=datetime.utcnow(),
)
- page_crawl.save()
+ try:
+ page_crawl.save()
+ except:
+ logging.error("Error adding page_crawl: %s", gus.lib.logging.strip_control_chars(page.url))
else:
logging.warn(
"Got unhandled status: %s: %s",