geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit 5eebbbfc00555da619054e8129ad70bf3de99fd5
parent cdba245e15f25857003c2aef03946f30b0b02a1f
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Fri,  6 Nov 2020 07:22:02 -0500

[crawl] Strip control chars from URLs in crawl logging

Diffstat:
M gus/crawl.py | 75 ++++++++++++++++++++++++++++++++++++++++++++++++-----------------------------
1 file changed, 46 insertions(+), 29 deletions(-)

diff --git a/gus/crawl.py b/gus/crawl.py
@@ -155,7 +155,7 @@ CRAWL_DELAYS = {

 def index_binary(resource, response):
-    logging.debug('Indexing binary for: %s', resource.indexable_url)
+    logging.debug('Indexing binary for: %s', gus.lib.logging.strip_control_chars(resource.indexable_url))

     doc = {
         "url": resource.indexable_url,
@@ -179,7 +179,7 @@ def index_binary(resource, response):

 def index_redirect(resource):
-    logging.debug('Indexing redirect for: %s', resource.indexable_url)
+    logging.debug('Indexing redirect for: %s', gus.lib.logging.strip_control_chars(resource.indexable_url))

     doc = {
         "url": resource.indexable_url,
@@ -200,7 +200,7 @@ def index_redirect(resource):

 def index_error(resource, is_temporary):
-    logging.debug('Indexing error for: %s', resource.indexable_url)
+    logging.debug('Indexing error for: %s', gus.lib.logging.strip_control_chars(resource.indexable_url))

     category = "temp_error" if is_temporary else "perm_error"
     default_change_frequency = resource.get_default_change_frequency(category)
@@ -224,7 +224,7 @@ def index_error(resource, is_temporary):

 def index_prompt(resource, response):
-    logging.debug('Indexing prompt for: %s', resource.indexable_url)
+    logging.debug('Indexing prompt for: %s', gus.lib.logging.strip_control_chars(resource.indexable_url))

     doc = {
         "url": resource.indexable_url,
@@ -249,7 +249,7 @@ def index_prompt(resource, response):

 def index_content(resource, response):
-    logging.debug('Indexing content for: %s', resource.indexable_url)
+    logging.debug('Indexing content for: %s', gus.lib.logging.strip_control_chars(resource.indexable_url))

     doc = {
         "url": resource.indexable_url,
@@ -311,7 +311,7 @@ def index_links(from_resource, contained_resources):

 def fetch_robots_file(robot_host):
     robot_url = urljoin("gemini://{}".format(robot_host), "/robots.txt")
-    logging.info('Fetching robots file: %s', robot_url)
+    logging.info('Fetching robots file: %s', gus.lib.logging.strip_control_chars(robot_url))
     rp = GeminiRobotFileParser(robot_url)
     rp.read()
@@ -326,18 +326,18 @@ def crawl_page(gemini_resource, current_depth, should_check_if_expired=True, red
     gr = gemini_resource
     url = gr.fetchable_url
     if max_crawl_depth >= 0 and current_depth > max_crawl_depth:
-        logging.warn('Going too deep, skipping: %s', url)
+        logging.warn('Going too deep, skipping: %s', gus.lib.logging.strip_control_chars(url))
         return
     if not gemini_resource.is_valid:
-        logging.warn('Not a valid gemini resource, skipping: %s', url)
+        logging.warn('Not a valid gemini resource, skipping: %s', gus.lib.logging.strip_control_chars(url))
         return
     for excluded_prefix in EXCLUDED_URL_PREFIXES:
         if gr.normalized_url.startswith(excluded_prefix):
-            logging.info('URL prefix matches exclusion list, skipping: %s', url)
+            logging.info('URL prefix matches exclusion list, skipping: %s', gus.lib.logging.strip_control_chars(url))
             return
     for excluded_path in EXCLUDED_URL_PATHS:
         if gr.urlsplit.path.lower().endswith(excluded_path):
-            logging.info('URL on exclusion list, skipping: %s', url)
+            logging.info('URL on exclusion list, skipping: %s', gus.lib.logging.strip_control_chars(url))
             return

     if should_check_if_expired:
@@ -345,7 +345,7 @@ def crawl_page(gemini_resource, current_depth, should_check_if_expired=True, red
         if existing_page and existing_page.change_frequency is not None:
             most_recent_crawl = Crawl.select(peewee.fn.MAX(Crawl.timestamp)).where(Crawl.page == existing_page).scalar()
             if most_recent_crawl and datetime.now() < most_recent_crawl + timedelta(hours=existing_page.change_frequency):
-                logging.debug('Recrawling too soon, skipping: %s', gr.fetchable_url)
+                logging.debug('Recrawling too soon, skipping: %s', gus.lib.logging.strip_control_chars(gr.fetchable_url))
                 return

     # ROBOTS
@@ -365,7 +365,7 @@ def crawl_page(gemini_resource, current_depth, should_check_if_expired=True, red
             crawl_delay = robots_file.crawl_delay("gus")

         if not can_fetch:
-            logging.debug('Blocked by robots files, skipping: %s', url)
+            logging.debug('Blocked by robots files, skipping: %s', gus.lib.logging.strip_control_chars(url))
             return

     # Crawl delay
@@ -381,14 +381,14 @@ def crawl_page(gemini_resource, current_depth, should_check_if_expired=True, red
     domain_hit_timings[gr.normalized_host] = datetime.now()

     # Actually fetch!
-    logging.info('Fetching resource: %s', url)
+    logging.info('Fetching resource: %s', gus.lib.logging.strip_control_chars(url))
     if gr.fully_qualified_parent_url is not None:
-        logging.debug('with parent: %s', gr.fully_qualified_parent_url)
+        logging.debug('with parent: %s', gus.lib.logging.strip_control_chars(gr.fully_qualified_parent_url))
     response = gr.fetch()

     if response is None:
         # problem before getting a response
-        logging.warn('Failed to fetch: %s', url)
+        logging.warn('Failed to fetch: %s', gus.lib.logging.strip_control_chars(url))
         page = index_error(gr, True)
         page_crawl = Crawl(page=page,
                            status=0,
@@ -398,7 +398,10 @@ def crawl_page(gemini_resource, current_depth, should_check_if_expired=True, red
     elif response.status.startswith("4"):
         # temporary error status
-        logging.debug('Got temporary error: %s: %s %s', url, response.status, response.error_message)
+        logging.debug('Got temporary error: %s: %s %s',
+                      gus.lib.logging.strip_control_chars(url),
+                      response.status,
+                      response.error_message)
         page = index_error(gr, True)
         page_crawl = Crawl(page=page,
                            status=response.status,
@@ -409,7 +412,10 @@ def crawl_page(gemini_resource, current_depth, should_check_if_expired=True, red
     elif response.status.startswith("5"):
         # permanent error status
-        logging.debug('Got permanent error: %s: %s %s', url, response.status, response.error_message)
+        logging.debug('Got permanent error: %s: %s %s',
+                      gus.lib.logging.strip_control_chars(url),
+                      response.status,
+                      response.error_message)
         page = index_error(gr, False)
         page_crawl = Crawl(page=page,
                            status=response.status,
@@ -420,13 +426,16 @@ def crawl_page(gemini_resource, current_depth, should_check_if_expired=True, red
     elif response.status.startswith("3"):
         # redirect status
-        logging.debug('Got redirected: %s: %s %s', url, response.status, response.url)
+        logging.debug('Got redirected: %s: %s %s',
+                      gus.lib.logging.strip_control_chars(url),
+                      response.status,
+                      response.url)
         if len(redirect_chain) > constants.MAXIMUM_REDIRECT_CHAIN_LENGTH:
-            logging.info('Aborting, maximum redirect chain length reached: %s', url)
+            logging.info('Aborting, maximum redirect chain length reached: %s', gus.lib.logging.strip_control_chars(url))
             return
         redirect_resource = GeminiResource(response.url, gr.normalized_url, gr.normalized_host)
         if redirect_resource.fetchable_url == gr.fetchable_url:
-            logging.info('Aborting, redirecting to self: %s', url)
+            logging.info('Aborting, redirecting to self: %s', gus.lib.logging.strip_control_chars(url))
             return
         page = index_redirect(gr)
         page_crawl = Crawl(page=page, status=response.status, is_different=False, timestamp=datetime.utcnow())
@@ -436,13 +445,13 @@ def crawl_page(gemini_resource, current_depth, should_check_if_expired=True, red
     elif response.status.startswith("1"):
         # input status
-        logging.debug('Input requested at: %s: %s %s', url, response.status, response.prompt)
+        logging.debug('Input requested at: %s: %s %s', gus.lib.logging.strip_control_chars(url), response.status, response.prompt)
         page = index_prompt(gr, response)
         page_crawl = Crawl(page=page, status=response.status, is_different=False, timestamp=datetime.utcnow())
         page_crawl.save()
     elif response.status.startswith("2"):
         # success status
-        logging.debug('Successful request: %s: %s %s', url, response.status, response.content_type)
+        logging.debug('Successful request: %s: %s %s', gus.lib.logging.strip_control_chars(url), response.status, response.content_type)
         if response.content_type.startswith("text/"):
             page, is_different = index_content(gr, response)
             page_crawl = Crawl(
@@ -453,9 +462,11 @@ def crawl_page(gemini_resource, current_depth, should_check_if_expired=True, red
             )
             page_crawl.save()
             if response.content_type != "text/gemini":
-                logging.debug('Content is not gemini text: %s: %s', url, response.content_type)
+                logging.debug('Content is not gemini text: %s: %s',
+                              gus.lib.logging.strip_control_chars(url), response.content_type)
             else:
-                logging.debug('Got gemini text, extracting and crawling links: %s', url)
+                logging.debug('Got gemini text, extracting and crawling links: %s',
+                              gus.lib.logging.strip_control_chars(url))
                 contained_resources = gr.extract_contained_resources(response.content)
                 index_links(gr, contained_resources)
                 for resource in contained_resources:
@@ -465,7 +476,9 @@ def crawl_page(gemini_resource, current_depth, should_check_if_expired=True, red
             page_crawl = Crawl(page=page, status=response.status, is_different=False, timestamp=datetime.utcnow())
             page_crawl.save()
     else:
-        logging.warn('Got unhandled status: %s: %s', url, response.status)
+        logging.warn('Got unhandled status: %s: %s',
+                     gus.lib.logging.strip_control_chars(url),
+                     response.status)


 def pickle_robot_file_map(robot_file_map, index_dir):
@@ -537,7 +550,9 @@ def resolve_feed_content_urls(feed_file=constants.FEED_FILE):
         now = time.time()
         interval = int(now - last)
         if interval < 5:
-            logging.warn('Declining to hit %s again after only %d seconds', feed_resource.normalized_host, interval)
+            logging.warn('Declining to hit %s again after only %d seconds',
+                         gus.lib.logging.strip_control_chars(feed_resource.normalized_host),
+                         interval)
             feed_urls.insert(0, feed_url)
             skips += 1
             if skips == len(feed_urls):
@@ -548,11 +563,13 @@
         skips = 0

         # Good to go
-        logging.info('Fetching feed: %s', feed_url)
+        logging.info('Fetching feed: %s',
+                     gus.lib.logging.strip_control_chars(feed_url))
         try:
             resp = feed_resource.fetch()
         except:
-            logging.info('Error fetching feed, skipping: %s', feed_url)
+            logging.info('Error fetching feed, skipping: %s',
+                         gus.lib.logging.strip_control_chars(feed_url))
             continue
         if resp and resp.status == "20":
             last_accessed[feed_resource.normalized_host] = time.time()
@@ -578,7 +595,7 @@ def recrawl_feeds():
         crawl_page(resource, 0)
     pickle_robot_file_map(robot_file_map, index_dir)

-    logging.debug('Recrawled feeds: %s', content_urls)
+    logging.debug('Recrawled feeds: %s', gus.lib.logging.strip_control_chars(content_urls))
     logging.info('Finished!')
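Note: the helper gus.lib.logging.strip_control_chars used throughout this commit is defined elsewhere in the gus.lib.logging module and its body is not part of this diff. As a rough, hypothetical sketch only (assumed behavior, not the repository's actual implementation), such a helper might look like this:

    # Hypothetical sketch -- not the real gus.lib.logging implementation.
    import unicodedata

    def strip_control_chars(value):
        """Return value as a string with Unicode control characters removed."""
        # Call sites in crawl.py pass both strings (URLs) and lists of URLs
        # (content_urls), so coerce non-strings to str first.
        text = value if isinstance(value, str) else str(value)
        # Unicode categories starting with "C" (Cc, Cf, ...) cover control and
        # format characters such as \n, \r and \x1b that can mangle log output.
        return "".join(ch for ch in text if not unicodedata.category(ch).startswith("C"))

Sanitizing the value before it is interpolated into the log line appears to be the point of the change: a crafted gemini:// URL can otherwise inject newlines or terminal escape sequences into the crawl logs.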