geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git

commit 05c5bd7b5d7c749c4fa495f21f9069f61c4f772a
parent acd728e7c407f72a0436f3a35a2658bfea3ce3f2
Author: René Wagner <rwa@clttr.info>
Date:   Mon, 14 Jun 2021 09:13:51 +0200

error handling on page crawl save

Diffstat:
M gus/crawl.py | 36 +++++++++++++++++++++++++++++-------
1 file changed, 29 insertions(+), 7 deletions(-)

diff --git a/gus/crawl.py b/gus/crawl.py
@@ -387,7 +387,11 @@ def crawl_page(
         page_crawl = Crawl(
             page=page, status=0, is_different=False, timestamp=datetime.utcnow()
         )
-        page_crawl.save()
+        try:
+            page_crawl.save()
+        except:
+            logging.error("Error adding page_crawl: %s", gus.lib.logging.strip_control_chars(page.url))
+
         failure_count[gr.normalized_host] = failure_count[gr.normalized_host] + 1 if gr.normalized_host in failure_count else 1
         logging.debug("Failed request count for host %s is %d", gr.normalized_host, failure_count[gr.normalized_host])
         return
@@ -409,7 +413,10 @@ def crawl_page(
             error_message=response.error_message,
             timestamp=datetime.utcnow(),
         )
-        page_crawl.save()
+        try:
+            page_crawl.save()
+        except:
+            logging.error("Error adding page_crawl: %s", gus.lib.logging.strip_control_chars(page.url))
 
     elif response.status.startswith("5"):
         # permanent error status
@@ -427,7 +434,10 @@ def crawl_page(
             error_message=response.error_message,
             timestamp=datetime.utcnow(),
         )
-        page_crawl.save()
+        try:
+            page_crawl.save()
+        except:
+            logging.error("Error adding page_crawl: %s", gus.lib.logging.strip_control_chars(page.url))
 
     elif response.status.startswith("3"):
         # redirect status
@@ -459,7 +469,10 @@ def crawl_page(
             is_different=False,
             timestamp=datetime.utcnow(),
         )
-        page_crawl.save()
+        try:
+            page_crawl.save()
+        except:
+            logging.error("Error adding page_crawl: %s", gus.lib.logging.strip_control_chars(page.url))
         index_links(gr, [redirect_resource])
         crawl_page(
             redirect_resource,
@@ -483,7 +496,10 @@ def crawl_page(
             is_different=False,
             timestamp=datetime.utcnow(),
         )
-        page_crawl.save()
+        try:
+            page_crawl.save()
+        except:
+            logging.error("Error adding page_crawl: %s", gus.lib.logging.strip_control_chars(page.url))
     elif response.status.startswith("2"):
         # success status
         logging.debug(
@@ -500,7 +516,10 @@ def crawl_page(
             is_different=is_different,
             timestamp=datetime.utcnow(),
         )
-        page_crawl.save()
+        try:
+            page_crawl.save()
+        except:
+            logging.error("Error adding page_crawl: %s", gus.lib.logging.strip_control_chars(page.url))
         if response.content_type != "text/gemini":
             logging.debug(
                 "Content is not gemini text: %s: %s",
@@ -526,7 +545,10 @@ def crawl_page(
             is_different=False,
             timestamp=datetime.utcnow(),
         )
-        page_crawl.save()
+        try:
+            page_crawl.save()
+        except:
+            logging.error("Error adding page_crawl: %s", gus.lib.logging.strip_control_chars(page.url))
     else:
         logging.warn(
             "Got unhandled status: %s: %s",
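
The change wraps every page_crawl.save() call in a try/except so that a failed database write is logged (with control characters stripped from the offending URL) instead of aborting the crawl. Below is a minimal sketch of that pattern, assuming only an object with a save() method; the helper name save_page_crawl and the simplified strip_control_chars are illustrative stand-ins (the commit itself repeats the try/except inline at each call site and uses the project's gus.lib.logging.strip_control_chars), and the sketch catches Exception rather than the bare except: used in the diff.

import logging

def strip_control_chars(text):
    # Simplified stand-in for gus.lib.logging.strip_control_chars: drop
    # non-printable characters so a hostile URL cannot mangle the log output.
    return "".join(ch for ch in text if ch.isprintable())

def save_page_crawl(page_crawl, url):
    # Persist the crawl record; on any error, log and carry on so a single
    # bad write does not stop the rest of the crawl.
    try:
        page_crawl.save()
    except Exception:
        logging.error("Error adding page_crawl: %s", strip_control_chars(url))

# Usage: a record whose save() raises is logged instead of crashing the caller.
class FailingRecord:
    def save(self):
        raise RuntimeError("database is locked")

save_page_crawl(FailingRecord(), "gemini://example.org/\x1b[2Jpage")

Catching Exception (or a narrower class such as peewee's DatabaseError) instead of a bare except: keeps KeyboardInterrupt and SystemExit able to terminate the crawler; the log-and-continue behaviour is otherwise the same as in the diff above.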