commit b484a4dadc7b58eac2f13188a748f64a83d58a26 parent f928815d49189a5fcfa7753ed578ac36532ffc82 Author: René Wagner <rwa@clttr.info> Date: Mon, 11 Oct 2021 20:03:08 +0200 avoid crash when normalized_url is not set (fixes #34) Diffstat:
M | gus/crawl.py | | | 27 | ++++++++++++++++----------- |
1 file changed, 16 insertions(+), 11 deletions(-)
diff --git a/gus/crawl.py b/gus/crawl.py @@ -244,18 +244,23 @@ def index_content(resource, response): def should_skip(resource): should_skip = False - for excluded_prefix in EXCLUDED_URL_PREFIXES: - if resource.normalized_url.startswith(excluded_prefix): - should_skip = True - break - for excluded_path in EXCLUDED_URL_PATHS: - if resource.urlsplit.path.lower().endswith(excluded_path): + try: + for excluded_prefix in EXCLUDED_URL_PREFIXES: + if resource.normalized_url.startswith(excluded_prefix): + should_skip = True + break + for excluded_path in EXCLUDED_URL_PATHS: + if resource.urlsplit.path.lower().endswith(excluded_path): + should_skip = True + break + m = EXCLUDED_URL_PATTERN.match(resource.normalized_url) + if m: should_skip = True - break - m = EXCLUDED_URL_PATTERN.match(resource.normalized_url) - if m: - should_skip = True - return should_skip + except: + logging.error("Error checking for exclude of url: %s", gus.lib.logging.strip_control_chars(resource.normalized_url)) + should_skip = True + + return should_skip def index_links(from_resource, contained_resources):