commit d03aa6a5969f1cade1e91aa3c9a0384bc0664981
parent cd9e7c853e636cef6532026cf6a3bd8b84c75cd7
Author: René Wagner <rwa@clttr.info>
Date: Fri, 4 Feb 2022 09:41:08 +0100
generic exception handling for page crawling
closes #40, #41
Diffstat:
M gus/crawl.py | 46 ++++++++++++++++++++++++++--------------------
1 file changed, 26 insertions(+), 20 deletions(-)
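The change is mechanical: each call into crawl_page (for expired, submitted, redirected, and contained resources) is wrapped in a try/except that logs the failure and moves on to the next resource, and the is_valid check is moved ahead of the per-host failure-count check. A minimal sketch of that pattern, using illustrative names rather than the actual GUS helpers:

    # Sketch only: crawl_page / crawl_all here are placeholders, not the real GUS API.
    import logging

    def crawl_page(resource):
        # stands in for the real crawler, which may raise on network or parse errors
        ...

    def crawl_all(resources):
        for resource in resources:
            try:
                crawl_page(resource)
            except Exception as e:
                # one bad resource no longer aborts the whole crawl run (see #40, #41)
                logging.error("Failed to crawl resource %s with error: %s", resource, e)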
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -322,9 +322,10 @@ def crawl_page(
):
gr = gemini_resource
url = gr.fetchable_url
- if gr.normalized_host in failure_count and failure_count[gr.normalized_host] > constants.MAXIMUM_FAILED_REQUEST_COUNT:
- logging.debug(
- "Too many failed requests for host, skipping: %s", gus.lib.logging.strip_control_chars(url)
+ if not gemini_resource.is_valid:
+ logging.warn(
+ "Not a valid gemini resource, skipping: %s",
+ gus.lib.logging.strip_control_chars(gemini_resource.url),
)
return
if max_crawl_depth >= 0 and current_depth > max_crawl_depth:
@@ -332,18 +333,17 @@ def crawl_page(
"Going too deep, skipping: %s", gus.lib.logging.strip_control_chars(url)
)
return
- if not gemini_resource.is_valid:
- logging.warn(
- "Not a valid gemini resource, skipping: %s",
- gus.lib.logging.strip_control_chars(url),
- )
- return
if should_skip(gr):
logging.debug(
"URL is excluded, skipping: %s",
gus.lib.logging.strip_control_chars(url),
)
return
+ if gr.normalized_host in failure_count and failure_count[gr.normalized_host] > constants.MAXIMUM_FAILED_REQUEST_COUNT:
+ logging.debug(
+ "Too many failed requests for host, skipping: %s", gus.lib.logging.strip_control_chars(url)
+ )
+ return
if should_check_if_expired:
existing_page = Page.get_or_none(url=gr.normalized_url)
if existing_page and existing_page.change_frequency is not None:
@@ -458,12 +458,11 @@ def crawl_page(
return
page = index_redirect(gr, response)
index_links(gr, [redirect_resource])
- crawl_page(
- redirect_resource,
- current_depth,
- should_check_if_expired=True,
- redirect_chain=redirect_chain + [gr.fetchable_url],
- )
+ try:
+ crawl_page(redirect_resource, current_depth,
+ should_check_if_expired=True, redirect_chain=redirect_chain + [gr.fetchable_url],)
+ except Exception as e:
+ logging.error("Failed to crawl outdated resource %s with error: %s", redirect_resource.fetchable_url, e)
elif response.status.startswith("1"):
# input status
@@ -498,9 +497,10 @@ def crawl_page(
contained_resources = gr.extract_contained_resources(response.content)
index_links(gr, contained_resources)
for resource in contained_resources:
- crawl_page(
- resource, current_depth + 1, should_check_if_expired=True
- )
+ try:
+ crawl_page(resource, current_depth + 1, should_check_if_expired=True)
+ except Exception as e:
+ logging.error("Failed to crawl outdated resource %s with error: %s", resource.fetchable_url, e)
else:
page = index_binary(gr, response)
else:
@@ -562,10 +562,16 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
failure_count = {}
expired_resources = [GeminiResource(url) for url in load_expired_urls()]
for resource in expired_resources:
- crawl_page(resource, 0, should_check_if_expired=False)
+ try:
+ crawl_page(resource, 0, should_check_if_expired=False)
+ except Exception as e:
+ logging.error("Failed to crawl outdated resource %s with error: %s", resource.fetchable_url, e)
submitted_resources = [GeminiResource(url) for url in load_seed_request_urls()]
for resource in submitted_resources:
- crawl_page(resource, 0, should_check_if_expired=True)
+ try:
+ crawl_page(resource, 0, should_check_if_expired=True)
+ except Exception as e:
+ logging.error("Failed to crawl outdated resource %s with error: %s", resource.fetchable_url, e)
logging.info("Finished!")