geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git

commit d03aa6a5969f1cade1e91aa3c9a0384bc0664981
parent cd9e7c853e636cef6532026cf6a3bd8b84c75cd7
Author: René Wagner <rwa@clttr.info>
Date:   Fri,  4 Feb 2022 09:41:08 +0100

generic exception handling for page crawling

closes #40, #41

Diffstat:
M gus/crawl.py | 46 ++++++++++++++++++++++++++--------------------
1 file changed, 26 insertions(+), 20 deletions(-)

diff --git a/gus/crawl.py b/gus/crawl.py
@@ -322,9 +322,10 @@ def crawl_page(
 ):
     gr = gemini_resource
     url = gr.fetchable_url
-    if gr.normalized_host in failure_count and failure_count[gr.normalized_host] > constants.MAXIMUM_FAILED_REQUEST_COUNT:
-        logging.debug(
-            "Too many failed requests for host, skipping: %s", gus.lib.logging.strip_control_chars(url)
+    if not gemini_resource.is_valid:
+        logging.warn(
+            "Not a valid gemini resource, skipping: %s",
+            gus.lib.logging.strip_control_chars(gemini_resource.url),
         )
         return
     if max_crawl_depth >= 0 and current_depth > max_crawl_depth:
@@ -332,18 +333,17 @@ def crawl_page(
             "Going too deep, skipping: %s", gus.lib.logging.strip_control_chars(url)
         )
         return
-    if not gemini_resource.is_valid:
-        logging.warn(
-            "Not a valid gemini resource, skipping: %s",
-            gus.lib.logging.strip_control_chars(url),
-        )
-        return
     if should_skip(gr):
         logging.debug(
             "URL is excluded, skipping: %s",
             gus.lib.logging.strip_control_chars(url),
         )
         return
+    if gr.normalized_host in failure_count and failure_count[gr.normalized_host] > constants.MAXIMUM_FAILED_REQUEST_COUNT:
+        logging.debug(
+            "Too many failed requests for host, skipping: %s", gus.lib.logging.strip_control_chars(url)
+        )
+        return
     if should_check_if_expired:
         existing_page = Page.get_or_none(url=gr.normalized_url)
         if existing_page and existing_page.change_frequency is not None:
@@ -458,12 +458,11 @@ def crawl_page(
             return
         page = index_redirect(gr, response)
         index_links(gr, [redirect_resource])
-        crawl_page(
-            redirect_resource,
-            current_depth,
-            should_check_if_expired=True,
-            redirect_chain=redirect_chain + [gr.fetchable_url],
-        )
+        try:
+            crawl_page(redirect_resource, current_depth,
+                should_check_if_expired=True, redirect_chain=redirect_chain + [gr.fetchable_url],)
+        except Exception as e:
+            logging.error("Failed to crawl outdated resource %s with error: %s", redirect_resource.fetchable_url, e)
     elif response.status.startswith("1"):
         # input status

@@ -498,9 +497,10 @@ def crawl_page(
             contained_resources = gr.extract_contained_resources(response.content)
             index_links(gr, contained_resources)
             for resource in contained_resources:
-                crawl_page(
-                    resource, current_depth + 1, should_check_if_expired=True
-                )
+                try:
+                    crawl_page(resource, current_depth + 1, should_check_if_expired=True)
+                except Exception as e:
+                    logging.error("Failed to crawl outdated resource %s with error: %s", resource.fetchable_url, e)
         else:
             page = index_binary(gr, response)
     else:
@@ -562,10 +562,16 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
     failure_count = {}
     expired_resources = [GeminiResource(url) for url in load_expired_urls()]
     for resource in expired_resources:
-        crawl_page(resource, 0, should_check_if_expired=False)
+        try:
+            crawl_page(resource, 0, should_check_if_expired=False)
+        except Exception as e:
+            logging.error("Failed to crawl outdated resource %s with error: %s", resource.fetchable_url, e)

     submitted_resources = [GeminiResource(url) for url in load_seed_request_urls()]
     for resource in submitted_resources:
-        crawl_page(resource, 0, should_check_if_expired=True)
+        try:
+            crawl_page(resource, 0, should_check_if_expired=True)
+        except Exception as e:
+            logging.error("Failed to crawl outdated resource %s with error: %s", resource.fetchable_url, e)

     logging.info("Finished!")
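
The pattern this commit introduces is simply that every crawl_page call made from a crawl loop or during redirect handling is wrapped in a broad try/except, so one misbehaving resource is logged and skipped instead of aborting the whole run. A minimal standalone sketch of that idea follows; crawl_page, run_crawl and the example URLs here are illustrative placeholders, not the actual GUS API.

import logging

def crawl_page(url):
    # Placeholder for the real crawler logic; any exception raised here
    # (network error, TLS failure, parser error, ...) must not be allowed
    # to terminate the surrounding crawl loop.
    raise RuntimeError("simulated crawl failure")

def run_crawl(urls):
    for url in urls:
        try:
            crawl_page(url)
        except Exception as e:
            # Log the failure and continue with the next resource.
            logging.error("Failed to crawl resource %s with error: %s", url, e)
    logging.info("Finished!")

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    run_crawl(["gemini://example.org/", "gemini://example.com/feed"])

The trade-off of catching bare Exception is that programming errors inside crawl_page are also swallowed; the error log is the only place they surface, which is why the handler records both the resource URL and the exception.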