avoid crash when normalized_url is not set - geminispace.info

commit b484a4dadc7b58eac2f13188a748f64a83d58a26
parent f928815d49189a5fcfa7753ed578ac36532ffc82
Author: René Wagner <rwa@clttr.info>
Date:   Mon, 11 Oct 2021 20:03:08 +0200

avoid crash when normalized_url is not set

fixes #34

Diffstat:
M gus/crawl.py  | 27 ++++++++++++++++-----------

1 file changed, 16 insertions(+), 11 deletions(-)
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -244,18 +244,23 @@ def index_content(resource, response):
 
 def should_skip(resource):
     should_skip = False
-    for excluded_prefix in EXCLUDED_URL_PREFIXES:
-        if resource.normalized_url.startswith(excluded_prefix):
-            should_skip = True
-            break
-    for excluded_path in EXCLUDED_URL_PATHS:
-        if resource.urlsplit.path.lower().endswith(excluded_path):
+    try:
+        for excluded_prefix in EXCLUDED_URL_PREFIXES:
+            if resource.normalized_url.startswith(excluded_prefix):
+                should_skip = True
+                break
+        for excluded_path in EXCLUDED_URL_PATHS:
+            if resource.urlsplit.path.lower().endswith(excluded_path):
+                should_skip = True
+                break
+        m = EXCLUDED_URL_PATTERN.match(resource.normalized_url)
+        if m:
             should_skip = True
-            break
-    m = EXCLUDED_URL_PATTERN.match(resource.normalized_url)
-    if m:
-        should_skip = True
-    return should_skip
+    except Exception:  # e.g. normalized_url is not set; not a bare except, so KeyboardInterrupt/SystemExit still propagate
+        logging.error("Error checking for exclude of url: %s", gus.lib.logging.strip_control_chars(resource.normalized_url))
+        should_skip = True  # fail closed: a resource whose exclusion checks failed is skipped; NOTE(review): the log call above re-reads normalized_url - confirm it cannot raise again
+
+    return should_skip
 
 
 def index_links(from_resource, contained_resources):

	geminispace.info gemini search engine
	git clone https://git.clttr.info/geminispace.info.git
	Log (Feed) \| Files \| Refs (Tags) \| README \| LICENSE