geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git

commit d03aa6a5969f1cade1e91aa3c9a0384bc0664981
parent cd9e7c853e636cef6532026cf6a3bd8b84c75cd7
Author: René Wagner <rwa@clttr.info>
Date:   Fri,  4 Feb 2022 09:41:08 +0100

generic exception handling for page crawling

closes #40, #41

Diffstat:
M gus/crawl.py | 46 ++++++++++++++++++++++++++--------------------
1 file changed, 26 insertions(+), 20 deletions(-)

diff --git a/gus/crawl.py b/gus/crawl.py
@@ -322,9 +322,10 @@ def crawl_page(
 ):
     gr = gemini_resource
     url = gr.fetchable_url
-    if gr.normalized_host in failure_count and failure_count[gr.normalized_host] > constants.MAXIMUM_FAILED_REQUEST_COUNT:
-        logging.debug(
-            "Too many failed requests for host, skipping: %s", gus.lib.logging.strip_control_chars(url)
+    if not gemini_resource.is_valid:
+        logging.warn(
+            "Not a valid gemini resource, skipping: %s",
+            gus.lib.logging.strip_control_chars(gemini_resource.url),
         )
         return
     if max_crawl_depth >= 0 and current_depth > max_crawl_depth:
@@ -332,18 +333,17 @@ def crawl_page(
             "Going too deep, skipping: %s", gus.lib.logging.strip_control_chars(url)
         )
         return
-    if not gemini_resource.is_valid:
-        logging.warn(
-            "Not a valid gemini resource, skipping: %s",
-            gus.lib.logging.strip_control_chars(url),
-        )
-        return
     if should_skip(gr):
         logging.debug(
             "URL is excluded, skipping: %s",
             gus.lib.logging.strip_control_chars(url),
         )
         return
+    if gr.normalized_host in failure_count and failure_count[gr.normalized_host] > constants.MAXIMUM_FAILED_REQUEST_COUNT:
+        logging.debug(
+            "Too many failed requests for host, skipping: %s", gus.lib.logging.strip_control_chars(url)
+        )
+        return
     if should_check_if_expired:
         existing_page = Page.get_or_none(url=gr.normalized_url)
         if existing_page and existing_page.change_frequency is not None:
@@ -458,12 +458,11 @@ def crawl_page(
             return
         page = index_redirect(gr, response)
         index_links(gr, [redirect_resource])
-        crawl_page(
-            redirect_resource,
-            current_depth,
-            should_check_if_expired=True,
-            redirect_chain=redirect_chain + [gr.fetchable_url],
-        )
+        try:
+            crawl_page(redirect_resource, current_depth,
+                should_check_if_expired=True, redirect_chain=redirect_chain + [gr.fetchable_url],)
+        except Exception as e:
+            logging.error("Failed to crawl outdated resource %s with error: %s", redirect_resource.fetchable_url, e)
     elif response.status.startswith("1"):
         # input status

@@ -498,9 +497,10 @@ def crawl_page(
             contained_resources = gr.extract_contained_resources(response.content)
             index_links(gr, contained_resources)
             for resource in contained_resources:
-                crawl_page(
-                    resource, current_depth + 1, should_check_if_expired=True
-                )
+                try:
+                    crawl_page(resource, current_depth + 1, should_check_if_expired=True)
+                except Exception as e:
+                    logging.error("Failed to crawl outdated resource %s with error: %s", resource.fetchable_url, e)
         else:
             page = index_binary(gr, response)
     else:
@@ -562,10 +562,16 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
     failure_count = {}
     expired_resources = [GeminiResource(url) for url in load_expired_urls()]
     for resource in expired_resources:
-        crawl_page(resource, 0, should_check_if_expired=False)
+        try:
+            crawl_page(resource, 0, should_check_if_expired=False)
+        except Exception as e:
+            logging.error("Failed to crawl outdated resource %s with error: %s", resource.fetchable_url, e)

     submitted_resources = [GeminiResource(url) for url in load_seed_request_urls()]
     for resource in submitted_resources:
-        crawl_page(resource, 0, should_check_if_expired=True)
+        try:
+            crawl_page(resource, 0, should_check_if_expired=True)
+        except Exception as e:
+            logging.error("Failed to crawl outdated resource %s with error: %s", resource.fetchable_url, e)

     logging.info("Finished!")
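
The pattern this commit introduces is simply that every crawl_page call made from a crawl loop or during redirect handling is wrapped in a broad try/except, so one misbehaving resource is logged and skipped instead of aborting the whole run. A minimal standalone sketch of that idea follows; crawl_page, run_crawl and the example URLs here are illustrative placeholders, not the actual GUS API.

import logging

def crawl_page(url):
    # Placeholder for the real crawler logic; any exception raised here
    # (network error, TLS failure, parser error, ...) must not be allowed
    # to terminate the surrounding crawl loop.
    raise RuntimeError("simulated crawl failure")

def run_crawl(urls):
    for url in urls:
        try:
            crawl_page(url)
        except Exception as e:
            # Log the failure and continue with the next resource.
            logging.error("Failed to crawl resource %s with error: %s", url, e)
    logging.info("Finished!")

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    run_crawl(["gemini://example.org/", "gemini://example.com/feed"])

The trade-off of catching bare Exception is that programming errors inside crawl_page are also swallowed; the error log is the only place they surface, which is why the handler records both the resource URL and the exception.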