geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit b484a4dadc7b58eac2f13188a748f64a83d58a26
parent f928815d49189a5fcfa7753ed578ac36532ffc82
Author: René Wagner <rwa@clttr.info>
Date:   Mon, 11 Oct 2021 20:03:08 +0200

avoid crash when normalized_url is not set

fixes #34

Diffstat:
M gus/crawl.py | 27 ++++++++++++++++-----------
1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/gus/crawl.py b/gus/crawl.py
@@ -244,18 +244,23 @@ def index_content(resource, response):
 
 def should_skip(resource):
     should_skip = False
-    for excluded_prefix in EXCLUDED_URL_PREFIXES:
-        if resource.normalized_url.startswith(excluded_prefix):
-            should_skip = True
-            break
-    for excluded_path in EXCLUDED_URL_PATHS:
-        if resource.urlsplit.path.lower().endswith(excluded_path):
+    try:
+        for excluded_prefix in EXCLUDED_URL_PREFIXES:
+            if resource.normalized_url.startswith(excluded_prefix):
+                should_skip = True
+                break
+        for excluded_path in EXCLUDED_URL_PATHS:
+            if resource.urlsplit.path.lower().endswith(excluded_path):
+                should_skip = True
+                break
+        m = EXCLUDED_URL_PATTERN.match(resource.normalized_url)
+        if m:
             should_skip = True
-            break
-    m = EXCLUDED_URL_PATTERN.match(resource.normalized_url)
-    if m:
-        should_skip = True
-    return should_skip
+    except:
+        logging.error("Error checking for exclude of url: %s", gus.lib.logging.strip_control_chars(resource.normalized_url))
+        should_skip = True
+
+    return should_skip
 
 
 def index_links(from_resource, contained_resources):