commit 5eebbbfc00555da619054e8129ad70bf3de99fd5
parent cdba245e15f25857003c2aef03946f30b0b02a1f
Author: Natalie Pendragon <natpen@natpen.net>
Date: Fri, 6 Nov 2020 07:22:02 -0500
[crawl] Strip control chars from URLs in crawl logging
Diffstat:
M gus/crawl.py | 75 ++++++++++++++++++++++++++++++++++++++++++++++-----------------------------
1 file changed, 46 insertions(+), 29 deletions(-)
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -155,7 +155,7 @@ CRAWL_DELAYS = {
def index_binary(resource, response):
- logging.debug('Indexing binary for: %s', resource.indexable_url)
+ logging.debug('Indexing binary for: %s', gus.lib.logging.strip_control_chars(resource.indexable_url))
doc = {
"url": resource.indexable_url,
@@ -179,7 +179,7 @@ def index_binary(resource, response):
def index_redirect(resource):
- logging.debug('Indexing redirect for: %s', resource.indexable_url)
+ logging.debug('Indexing redirect for: %s', gus.lib.logging.strip_control_chars(resource.indexable_url))
doc = {
"url": resource.indexable_url,
@@ -200,7 +200,7 @@ def index_redirect(resource):
def index_error(resource, is_temporary):
- logging.debug('Indexing error for: %s', resource.indexable_url)
+ logging.debug('Indexing error for: %s', gus.lib.logging.strip_control_chars(resource.indexable_url))
category = "temp_error" if is_temporary else "perm_error"
default_change_frequency = resource.get_default_change_frequency(category)
@@ -224,7 +224,7 @@ def index_error(resource, is_temporary):
def index_prompt(resource, response):
- logging.debug('Indexing prompt for: %s', resource.indexable_url)
+ logging.debug('Indexing prompt for: %s', gus.lib.logging.strip_control_chars(resource.indexable_url))
doc = {
"url": resource.indexable_url,
@@ -249,7 +249,7 @@ def index_prompt(resource, response):
def index_content(resource, response):
- logging.debug('Indexing content for: %s', resource.indexable_url)
+ logging.debug('Indexing content for: %s', gus.lib.logging.strip_control_chars(resource.indexable_url))
doc = {
"url": resource.indexable_url,
@@ -311,7 +311,7 @@ def index_links(from_resource, contained_resources):
def fetch_robots_file(robot_host):
robot_url = urljoin("gemini://{}".format(robot_host), "/robots.txt")
- logging.info('Fetching robots file: %s', robot_url)
+ logging.info('Fetching robots file: %s', gus.lib.logging.strip_control_chars(robot_url))
rp = GeminiRobotFileParser(robot_url)
rp.read()
@@ -326,18 +326,18 @@ def crawl_page(gemini_resource, current_depth, should_check_if_expired=True, red
gr = gemini_resource
url = gr.fetchable_url
if max_crawl_depth >= 0 and current_depth > max_crawl_depth:
- logging.warn('Going too deep, skipping: %s', url)
+ logging.warn('Going too deep, skipping: %s', gus.lib.logging.strip_control_chars(url))
return
if not gemini_resource.is_valid:
- logging.warn('Not a valid gemini resource, skipping: %s', url)
+ logging.warn('Not a valid gemini resource, skipping: %s', gus.lib.logging.strip_control_chars(url))
return
for excluded_prefix in EXCLUDED_URL_PREFIXES:
if gr.normalized_url.startswith(excluded_prefix):
- logging.info('URL prefix matches exclusion list, skipping: %s', url)
+ logging.info('URL prefix matches exclusion list, skipping: %s', gus.lib.logging.strip_control_chars(url))
return
for excluded_path in EXCLUDED_URL_PATHS:
if gr.urlsplit.path.lower().endswith(excluded_path):
- logging.info('URL on exclusion list, skipping: %s', url)
+ logging.info('URL on exclusion list, skipping: %s', gus.lib.logging.strip_control_chars(url))
return
if should_check_if_expired:
@@ -345,7 +345,7 @@ def crawl_page(gemini_resource, current_depth, should_check_if_expired=True, red
if existing_page and existing_page.change_frequency is not None:
most_recent_crawl = Crawl.select(peewee.fn.MAX(Crawl.timestamp)).where(Crawl.page == existing_page).scalar()
if most_recent_crawl and datetime.now() < most_recent_crawl + timedelta(hours=existing_page.change_frequency):
- logging.debug('Recrawling too soon, skipping: %s', gr.fetchable_url)
+ logging.debug('Recrawling too soon, skipping: %s', gus.lib.logging.strip_control_chars(gr.fetchable_url))
return
# ROBOTS
@@ -365,7 +365,7 @@ def crawl_page(gemini_resource, current_depth, should_check_if_expired=True, red
crawl_delay = robots_file.crawl_delay("gus")
if not can_fetch:
- logging.debug('Blocked by robots files, skipping: %s', url)
+ logging.debug('Blocked by robots files, skipping: %s', gus.lib.logging.strip_control_chars(url))
return
# Crawl delay
@@ -381,14 +381,14 @@ def crawl_page(gemini_resource, current_depth, should_check_if_expired=True, red
domain_hit_timings[gr.normalized_host] = datetime.now()
# Actually fetch!
- logging.info('Fetching resource: %s', url)
+ logging.info('Fetching resource: %s', gus.lib.logging.strip_control_chars(url))
if gr.fully_qualified_parent_url is not None:
- logging.debug('with parent: %s', gr.fully_qualified_parent_url)
+ logging.debug('with parent: %s', gus.lib.logging.strip_control_chars(gr.fully_qualified_parent_url))
response = gr.fetch()
if response is None:
# problem before getting a response
- logging.warn('Failed to fetch: %s', url)
+ logging.warn('Failed to fetch: %s', gus.lib.logging.strip_control_chars(url))
page = index_error(gr, True)
page_crawl = Crawl(page=page,
status=0,
@@ -398,7 +398,10 @@ def crawl_page(gemini_resource, current_depth, should_check_if_expired=True, red
elif response.status.startswith("4"):
# temporary error status
- logging.debug('Got temporary error: %s: %s %s', url, response.status, response.error_message)
+ logging.debug('Got temporary error: %s: %s %s',
+ gus.lib.logging.strip_control_chars(url),
+ response.status,
+ response.error_message)
page = index_error(gr, True)
page_crawl = Crawl(page=page,
status=response.status,
@@ -409,7 +412,10 @@ def crawl_page(gemini_resource, current_depth, should_check_if_expired=True, red
elif response.status.startswith("5"):
# permanent error status
- logging.debug('Got permanent error: %s: %s %s', url, response.status, response.error_message)
+ logging.debug('Got permanent error: %s: %s %s',
+ gus.lib.logging.strip_control_chars(url),
+ response.status,
+ response.error_message)
page = index_error(gr, False)
page_crawl = Crawl(page=page,
status=response.status,
@@ -420,13 +426,16 @@ def crawl_page(gemini_resource, current_depth, should_check_if_expired=True, red
elif response.status.startswith("3"):
# redirect status
- logging.debug('Got redirected: %s: %s %s', url, response.status, response.url)
+ logging.debug('Got redirected: %s: %s %s',
+ gus.lib.logging.strip_control_chars(url),
+ response.status,
+ response.url)
if len(redirect_chain) > constants.MAXIMUM_REDIRECT_CHAIN_LENGTH:
- logging.info('Aborting, maximum redirect chain length reached: %s', url)
+ logging.info('Aborting, maximum redirect chain length reached: %s', gus.lib.logging.strip_control_chars(url))
return
redirect_resource = GeminiResource(response.url, gr.normalized_url, gr.normalized_host)
if redirect_resource.fetchable_url == gr.fetchable_url:
- logging.info('Aborting, redirecting to self: %s', url)
+ logging.info('Aborting, redirecting to self: %s', gus.lib.logging.strip_control_chars(url))
return
page = index_redirect(gr)
page_crawl = Crawl(page=page, status=response.status, is_different=False, timestamp=datetime.utcnow())
@@ -436,13 +445,13 @@ def crawl_page(gemini_resource, current_depth, should_check_if_expired=True, red
elif response.status.startswith("1"):
# input status
- logging.debug('Input requested at: %s: %s %s', url, response.status, response.prompt)
+ logging.debug('Input requested at: %s: %s %s', gus.lib.logging.strip_control_chars(url), response.status, response.prompt)
page = index_prompt(gr, response)
page_crawl = Crawl(page=page, status=response.status, is_different=False, timestamp=datetime.utcnow())
page_crawl.save()
elif response.status.startswith("2"):
# success status
- logging.debug('Successful request: %s: %s %s', url, response.status, response.content_type)
+ logging.debug('Successful request: %s: %s %s', gus.lib.logging.strip_control_chars(url), response.status, response.content_type)
if response.content_type.startswith("text/"):
page, is_different = index_content(gr, response)
page_crawl = Crawl(
@@ -453,9 +462,11 @@ def crawl_page(gemini_resource, current_depth, should_check_if_expired=True, red
)
page_crawl.save()
if response.content_type != "text/gemini":
- logging.debug('Content is not gemini text: %s: %s', url, response.content_type)
+ logging.debug('Content is not gemini text: %s: %s',
+ gus.lib.logging.strip_control_chars(url), response.content_type)
else:
- logging.debug('Got gemini text, extracting and crawling links: %s', url)
+ logging.debug('Got gemini text, extracting and crawling links: %s',
+ gus.lib.logging.strip_control_chars(url))
contained_resources = gr.extract_contained_resources(response.content)
index_links(gr, contained_resources)
for resource in contained_resources:
@@ -465,7 +476,9 @@ def crawl_page(gemini_resource, current_depth, should_check_if_expired=True, red
page_crawl = Crawl(page=page, status=response.status, is_different=False, timestamp=datetime.utcnow())
page_crawl.save()
else:
- logging.warn('Got unhandled status: %s: %s', url, response.status)
+ logging.warn('Got unhandled status: %s: %s',
+ gus.lib.logging.strip_control_chars(url),
+ response.status)
def pickle_robot_file_map(robot_file_map, index_dir):
@@ -537,7 +550,9 @@ def resolve_feed_content_urls(feed_file=constants.FEED_FILE):
now = time.time()
interval = int(now - last)
if interval < 5:
- logging.warn('Declining to hit %s again after only %d seconds', feed_resource.normalized_host, interval)
+ logging.warn('Declining to hit %s again after only %d seconds',
+ gus.lib.logging.strip_control_chars(feed_resource.normalized_host),
+ interval)
feed_urls.insert(0, feed_url)
skips += 1
if skips == len(feed_urls):
@@ -548,11 +563,13 @@ def resolve_feed_content_urls(feed_file=constants.FEED_FILE):
skips = 0
# Good to go
- logging.info('Fetching feed: %s', feed_url)
+ logging.info('Fetching feed: %s',
+ gus.lib.logging.strip_control_chars(feed_url))
try:
resp = feed_resource.fetch()
except:
- logging.info('Error fetching feed, skipping: %s', feed_url)
+ logging.info('Error fetching feed, skipping: %s',
+ gus.lib.logging.strip_control_chars(feed_url))
continue
if resp and resp.status == "20":
last_accessed[feed_resource.normalized_host] = time.time()
@@ -578,7 +595,7 @@ def recrawl_feeds():
crawl_page(resource, 0)
pickle_robot_file_map(robot_file_map, index_dir)
- logging.debug('Recrawled feeds: %s', content_urls)
+ logging.debug('Recrawled feeds: %s', gus.lib.logging.strip_control_chars(content_urls))
logging.info('Finished!')
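
Note: gus.lib.logging.strip_control_chars is called throughout this patch but its definition lives in gus/lib/logging.py, which is not part of this diff. A minimal sketch of what such a helper could look like follows; the function body below is an assumption for illustration, not the project's actual implementation.

# Hypothetical sketch of a strip_control_chars helper (assumed, not from this patch).
import unicodedata

def strip_control_chars(value):
    """Return value as a string with Unicode control characters removed,
    so crafted URLs cannot inject newlines or escape sequences into crawl logs."""
    text = str(value)  # str() also covers non-string inputs such as lists of URLs
    return "".join(ch for ch in text if unicodedata.category(ch) != "Cc")

Filtering on the Unicode category "Cc" drops ASCII control bytes (including CR, LF, and ESC) while leaving ordinary printable characters, including non-ASCII ones, intact.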