geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git

commit 2774ab2b0ec0d9c6b0f444919cd053507d6f4a86
parent f3f9d41aa7652d1239792652eaf972985e36e84d
Author: Remco van 't Veer <remco@remworks.net>
Date:   Sat, 31 Oct 2020 16:53:41 +0100

Logging

Replace all print statements in the crawler and indexer with log
statements.  Use logging categories to distinguish between debug
information (level "debug"), progress (level "info"), and things that
might need attention at some point (level "warn").

Signed-off-by: Natalie Pendragon <natpen@natpen.net>
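
As a rough illustration of the level scheme described above (a sketch, not part of the commit itself), the new call sites use the standard library logging module with lazy %-style arguments; the URL below is a made-up placeholder:

    import logging

    url = 'gemini://example.org/'  # placeholder value for illustration only

    # level "debug": detail that only matters when tracing the crawler or indexer
    logging.debug('Indexing content for: %s', url)

    # level "info": normal progress reporting
    logging.info('Fetching resource: %s', url)

    # level "warn": things that might need attention at some point
    # (the diff calls logging.warn, an alias of logging.warning)
    logging.warning('Failed to fetch: %s', url)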

Diffstat:
M gus/build_index.py          |  19 ++++++++++++++-----
M gus/crawl.py                | 120 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------------------------------------------
M gus/lib/index_statistics.py |  41 ++++++++++++++++++++++++-----------------
A gus/lib/logging.py          |  29 +++++++++++++++++++++++++++++
A logging.ini                 |  21 +++++++++++++++++++++
5 files changed, 144 insertions(+), 86 deletions(-)

diff --git a/gus/build_index.py b/gus/build_index.py
@@ -1,4 +1,6 @@
 import argparse
+import logging
+
 from datetime import datetime, timedelta
 import os
 import pathlib
@@ -11,8 +13,9 @@ from whoosh.index import open_dir
 
 from gus.crawl import EXCLUDED_URL_PREFIXES
 from gus.lib.db_model import init_db, Page
-from gus.lib.index_statistics import compute_index_statistics, persist_statistics, print_index_statistics
+from gus.lib.index_statistics import compute_index_statistics, persist_statistics, log_index_statistics
 from gus.lib.whoosh_extensions import UrlAnalyzer
+import gus.lib.logging
 
 # hack: the built-in methods in urllib need to know the
 # Gemini protocol exists
@@ -102,7 +105,9 @@ def index_page(page, indexed_urls):
         return
     if page.fetchable_url in indexed_urls:
         return
-    print(page.url)
+
+    logging.info("Indexing page: %s", page.url)
+
     u = page.url.rstrip("/")
     external_backlinks = Page.raw("""SELECT p_from.url
 FROM page AS p_from
@@ -181,19 +186,24 @@ GROUP BY p.normalized_url""")
     for page in pages.execute():
         index_page(page, indexed_urls)
+
+    logging.debug('Committing index.')
     index_writer.commit()
 
     index_statistics = compute_index_statistics(db)
-    print_index_statistics(index_statistics)
+    log_index_statistics(index_statistics)
     persist_statistics(index_statistics, None, should_run_destructive, "statistics.csv")
 
     # if should_run_destructive:
     #     # replace current index with new index
     #     shutil.rmtree(INDEX_DIR_CURRENT, ignore_errors=True)
     #     shutil.move(INDEX_DIR_NEW, INDEX_DIR_CURRENT)
+    logging.info('Finished!')
+
 
 
 def main():
     args = parse_args()
+    gus.lib.logging.handle_arguments(args)
     build_index(args.should_run_destructive, args.invalidation_window)
 
 
@@ -215,8 +225,7 @@ def parse_args():
         default=0,
         help="a recency window, in hours, for recently crawled pages that should be forcefully reindexed",
     )
-    parser.add_argument('-o', '--output', dest='output', type=str,
-                        default="index.gmi", help='output filename')
+    gus.lib.logging.add_arguments(parser)
 
     args = parser.parse_args()
     return args
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -1,4 +1,6 @@
 import argparse
+import logging
+
 from datetime import datetime, timedelta
 import os
 import pathlib
@@ -12,6 +14,7 @@ import peewee
 from . import constants
 from gus.lib.db_model import init_db, Page, Link, Crawl
 from gus.lib.gemini import GeminiResource, GeminiRobotFileParser
+import gus.lib.logging
 
 # hack: the built-in methods in urllib need to know the
 # Gemini protocol exists
@@ -146,7 +149,8 @@ CRAWL_DELAYS = {
 
 
 def index_binary(resource, response):
-    print("INDEXING BINARY...")
+    logging.debug('Indexing binary for: %s', resource.indexable_url)
+
     doc = {
         "url": resource.indexable_url,
         "fetchable_url": resource.fetchable_url,
@@ -169,7 +173,8 @@ def index_binary(resource, response):
 
 
 def index_redirect(resource):
-    print("INDEXING REDIRECT...")
+    logging.debug('Indexing redirect for: %s', resource.indexable_url)
+
     doc = {
         "url": resource.indexable_url,
         "fetchable_url": resource.fetchable_url,
@@ -189,7 +194,8 @@ def index_redirect(resource):
 
 
 def index_error(resource, is_temporary):
-    print("INDEXING ERROR...")
+    logging.debug('Indexing error for: %s', resource.indexable_url)
+
     category = "temp_error" if is_temporary else "perm_error"
     default_change_frequency = resource.get_default_change_frequency(category)
     doc = {
@@ -212,7 +218,8 @@ def index_error(resource, is_temporary):
 
 
 def index_prompt(resource, response):
-    print("INDEXING PROMPT...")
+    logging.debug('Indexing prompt for: %s', resource.indexable_url)
+
     doc = {
         "url": resource.indexable_url,
         "fetchable_url": resource.fetchable_url,
@@ -236,7 +243,8 @@ def index_prompt(resource, response):
 
 
 def index_content(resource, response):
-    print("INDEXING CONTENT...")
+    logging.debug('Indexing content for: %s', resource.indexable_url)
+
     doc = {
         "url": resource.indexable_url,
         "fetchable_url": resource.fetchable_url,
@@ -295,35 +303,35 @@ def index_links(from_resource, contained_resources):
     Link.insert_many(data).execute()
 
 
+def fetch_robots_file(robot_host):
+    robot_url = urljoin("gemini://{}".format(robot_host), "/robots.txt")
+    logging.info('Fetching robots file: %s', robot_url)
+    rp = GeminiRobotFileParser(robot_url)
+    rp.read()
+
+
 def get_robots_file(robot_host):
     if robot_host not in robot_file_map:
-        print("Requesting robots.txt for {}".format(robot_host))
-        robot_url = urljoin("gemini://{}".format(robot_host), "/robots.txt")
-        rp = GeminiRobotFileParser(robot_url)
-        rp.read()
-        robot_file_map[robot_host] = rp
+        robot_file_map[robot_host] = fetch_robots_file(robot_host)
     return robot_file_map[robot_host]
 
 
 def crawl_page(gemini_resource, current_depth, should_check_if_expired=True, redirect_chain=[]):
     gr = gemini_resource
+    url = gr.fetchable_url
     if max_crawl_depth >= 0 and current_depth > max_crawl_depth:
-        print("DEPTH SKIP : %s" % gr.fetchable_url)
-        print("--------------------------")
+        logging.warn('Going too deep, skipping: %s', url)
         return
     if not gemini_resource.is_valid:
-        print("INVALID RSCRC: %s" % gr.fetchable_url)
-        print("--------------------------")
+        logging.warn('Not a valid gemini resource, skipping: %s', url)
        return
     for excluded_prefix in EXCLUDED_URL_PREFIXES:
         if gr.normalized_url.startswith(excluded_prefix):
-            print("MANUAL EXCLUSION SKIP : %s" % gr.fetchable_url)
-            print("--------------------------")
+            logging.info('URL prefix matches exclusion list, skipping: %s', url)
            return
     for excluded_path in EXCLUDED_URL_PATHS:
         if gr.urlsplit.path.lower().endswith(excluded_path):
-            print("MANUAL EXCLUSION SKIP : %s" % gr.fetchable_url)
-            print("--------------------------")
+            logging.info('URL on exclusion list, skipping: %s', url)
             return
 
     if should_check_if_expired:
@@ -331,11 +339,8 @@ def crawl_page(gemini_resource, current_depth, should_check_if_expired=True, red
         if existing_page and existing_page.change_frequency is not None:
             most_recent_crawl = Crawl.select(peewee.fn.MAX(Crawl.timestamp)).where(Crawl.page == existing_page).scalar()
             if most_recent_crawl and datetime.now() < most_recent_crawl + timedelta(hours=existing_page.change_frequency):
-                print("TOO SOON : %s" % gr.fetchable_url)
-                print("--------------------------")
+                logging.debug('Recrawling too soon, skipping: %s', gr.fetchable_url)
                 return
-            else:
-                print("Recrawling after %s hours" % existing_page.change_frequency)
 
     # ROBOTS
     robots_file = get_robots_file(gr.normalized_host)
@@ -354,8 +359,7 @@ def crawl_page(gemini_resource, current_depth, should_check_if_expired=True, red
             crawl_delay = robots_file.crawl_delay("gus")
 
     if not can_fetch:
-        print("ROBOTS SKIP : %s" % gr.fetchable_url)
-        print("--------------------------")
+        logging.debug('Blocked by robots files, skipping: %s', url)
         return
 
     # Crawl delay
@@ -371,26 +375,24 @@ def crawl_page(gemini_resource, current_depth, should_check_if_expired=True, red
     domain_hit_timings[gr.normalized_host] = datetime.now()
 
     # Actually fetch!
-    print("Fetching {}".format(gr.fetchable_url))
+    logging.info('Fetching resource: %s', url)
     if gr.fully_qualified_parent_url is not None:
-        print("With parent {}".format(gr.fully_qualified_parent_url))
+        logging.debug('with parent: %s', gr.fully_qualified_parent_url)
 
     response = gr.fetch()
 
     if response is None:
         # problem before getting a response
-        print("ERROR : %s" % gr.fetchable_url)
+        logging.warn('Failed to fetch: %s', url)
         page = index_error(gr, True)
         page_crawl = Crawl(page=page, status=0, is_different=False, timestamp=datetime.utcnow())
         page_crawl.save()
-        print("--------------------------")
+
     elif response.status.startswith("4"):
         # temporary error status
-        print("TEMP ERROR : %s" % response.url)
-        print("STATUS : %s" % response.status)
-        print("ERROR MSG : %s" % response.error_message)
+        logging.debug('Got temporary error: %s: %s %s', url, response.status, response.error_message)
         page = index_error(gr, True)
         page_crawl = Crawl(page=page,
                            status=response.status,
@@ -398,12 +400,10 @@ def crawl_page(gemini_resource, current_depth, should_check_if_expired=True, red
                           error_message=response.error_message,
                           timestamp=datetime.utcnow())
         page_crawl.save()
-        print("--------------------------")
+
     elif response.status.startswith("5"):
         # permanent error status
-        print("PERM ERROR : %s" % response.url)
-        print("STATUS : %s" % response.status)
-        print("ERROR MSG : %s" % response.error_message)
+        logging.debug('Got permanent error: %s: %s %s', url, response.status, response.error_message)
         page = index_error(gr, False)
         page_crawl = Crawl(page=page,
                            status=response.status,
@@ -411,39 +411,32 @@ def crawl_page(gemini_resource, current_depth, should_check_if_expired=True, red
                           error_message=response.error_message,
                           timestamp=datetime.utcnow())
         page_crawl.save()
-        print("--------------------------")
+
     elif response.status.startswith("3"):
         # redirect status
-        print("REDIRECT : %s -> %s" % (gr.fetchable_url, response.url))
+        logging.debug('Got redirected: %s: %s %s', url, response.status, response.url)
         if len(redirect_chain) > constants.MAXIMUM_REDIRECT_CHAIN_LENGTH:
-            print("MAXIUMUM REDIRECT CHAIN LENGTH EXCEEDED. ABORTING.")
-            print("--------------------------")
+            logging.info('Aborting, maximum redirect chain length reached: %s', url)
             return
         redirect_resource = GeminiResource(response.url, gr.normalized_url, gr.normalized_host)
         if redirect_resource.fetchable_url == gr.fetchable_url:
-            print("REDIRECT TO SELF. ABORTING.")
-            print("--------------------------")
+            logging.info('Aborting, redirecting to self: %s', url)
             return
         page = index_redirect(gr)
         page_crawl = Crawl(page=page, status=response.status, is_different=False, timestamp=datetime.utcnow())
         page_crawl.save()
         index_links(gr, [redirect_resource])
-        print("--------------------------")
         crawl_page(redirect_resource, current_depth, should_check_if_expired=True, redirect_chain=redirect_chain + [gr.fetchable_url])
+
     elif response.status.startswith("1"):
         # input status
-        print("URL : %s" % response.url)
-        print("STATUS : %s" % response.status)
-        print("PROMPT : %s" % response.prompt)
+        logging.debug('Input requested at: %s: %s %s', url, response.status, response.prompt)
         page = index_prompt(gr, response)
         page_crawl = Crawl(page=page, status=response.status, is_different=False, timestamp=datetime.utcnow())
         page_crawl.save()
-        print("--------------------------")
     elif response.status.startswith("2"):
         # success status
-        print("URL : %s" % response.url)
-        print("STATUS : %s" % response.status)
-        print("CONTENT TYPE : %s" % response.content_type)
+        logging.debug('Successful request: %s: %s %s', url, response.status, response.content_type)
         if response.content_type.startswith("text/"):
             page, is_different = index_content(gr, response)
             page_crawl = Crawl(
@@ -454,10 +447,9 @@ def crawl_page(gemini_resource, current_depth, should_check_if_expired=True, red
             )
             page_crawl.save()
             if response.content_type != "text/gemini":
-                print("--------------------------")
+                logging.debug('Content is not gemini text: %s: %s', url, response.content_type)
             else:
-                print("Extracting contained resources...")
-                print("--------------------------")
+                logging.debug('Got gemini text, extracting and crawling links: %s', url)
                 contained_resources = gr.extract_contained_resources(response.content)
                 index_links(gr, contained_resources)
                 for resource in contained_resources:
@@ -466,11 +458,8 @@ def crawl_page(gemini_resource, current_depth, should_check_if_expired=True, red
            page = index_binary(gr, response)
            page_crawl = Crawl(page=page, status=response.status, is_different=False, timestamp=datetime.utcnow())
            page_crawl.save()
-            print("--------------------------")
     else:
-        # any other unhandled statuses
-        print("UNHANDLED : %s" % gr.fetchable_url)
-        print("--------------------------")
+        logging.warn('Got unhandled status: %s: %s', url, response.status)
 
 
 def pickle_robot_file_map(robot_file_map, index_dir):
@@ -479,7 +468,7 @@ def pickle_robot_file_map(robot_file_map, index_dir):
 
 
 def unpickle_robot_file_map(index_dir):
     if not os.path.isfile(index_dir + "/robot_file_map.p"):
-        print("Unable to find robot_file_map.p file to unpickle")
+        logging.debug('Robot file cache missing')
         return {}
     return pickle.load(open(index_dir + "/robot_file_map.p", "rb"))
@@ -542,22 +531,22 @@ def resolve_feed_content_urls(feed_file=constants.FEED_FILE):
         now = time.time()
         interval = int(now - last)
         if interval < 5:
-            print("Declining to hit {} again after only {} seconds".format(feed_resource.normalized_host, interval))
+            logging.warn('Declining to hit %s again after only %d seconds', feed_resource.normalized_host, interval)
             feed_urls.insert(0, feed_url)
             skips += 1
             if skips == len(feed_urls):
                 # We've hammered every server in the queue! Sleep a bit...
-                print("Sleeping to give all servers a rest!")
+                logging.warn('Sleeping to give all servers a rest!')
                 time.sleep(5)
             continue
         skips = 0
 
         # Good to go
-        print("Fetching ", feed_url)
+        logging.info('Fetching feed: %s', feed_url)
         try:
             resp = feed_resource.fetch()
         except:
-            print("Error on {}, skipping...".format(feed_url))
+            logging.info('Error fetching feed, skipping: %s', feed_url)
             continue
         if resp and resp.status == "20":
             last_accessed[feed_resource.normalized_host] = time.time()
@@ -583,8 +572,8 @@ def recrawl_feeds():
         crawl_page(resource, 0)
     pickle_robot_file_map(robot_file_map, index_dir)
 
-    print(content_urls)
-    print("Finished!")
+    logging.debug('Recrawled feeds: %s', content_urls)
+    logging.info('Finished!')
 
 
 def run_crawl(should_run_destructive=False, seed_urls=[]):
@@ -611,11 +600,13 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
         crawl_page(resource, 0, should_check_if_expired=True)
     pickle_robot_file_map(robot_file_map, index_dir)
 
-    print("Finished!")
+    logging.info('Finished!')
 
 
 def main():
     args = parse_args()
+    gus.lib.logging.handle_arguments(args)
+
     if args.should_recrawl_feeds:
         recrawl_feeds()
     else:
@@ -649,6 +640,7 @@
         default=[],
         help="one or more URLs with which to extend the seeds of the crawl",
     )
+    gus.lib.logging.add_arguments(parser)
 
     args = parser.parse_args()
     return args
diff --git a/gus/lib/index_statistics.py b/gus/lib/index_statistics.py
@@ -1,3 +1,4 @@
+import logging
 from datetime import datetime
 
 from peewee import fn, SQL
@@ -52,31 +53,37 @@ GROUP BY p.normalized_url""")
     }
 
 
-def print_index_statistics(index_statistics, crawl_statistics=None):
-    print("Index generation date : {:%Y-%m-%d}".format(index_statistics["index_modification_time"]))
-    print("Page Count : {:>6}".format(index_statistics["page_count"]))
-    print("Domain Count : {:>6}".format(index_statistics["domain_count"]))
+def log_index_statistics(index_statistics, crawl_statistics=None):
+    logging.info('Index generated on: %s',
+                 '{:%Y-%m-%d}'.format(index_statistics['index_modification_time']))
+    logging.info('Number of pages indexed: %d',
+                 index_statistics['page_count'])
+    logging.info('Number of domains indexed: %d',
+                 index_statistics['domain_count'])
+
     if crawl_statistics:
-        print("Redirect count : {:>6}".format(crawl_statistics["redirect_count"]))
-        print("Nontrivial redirect count : {:>6}".format(crawl_statistics["redirect_nontrivial_count"]))
-        print("Broken URL count : {:>6}".format(crawl_statistics["broken_url_count"]))
+        logging.info('Number of redirects crawled: %d',
+                     crawl_statistics['redirect_count'])
+        logging.info('Number of redirects crawled (nontrivial): %d',
+                     crawl_statistics['redirect_nontrivial_count'])
+        logging.info('Number of broken URLs encountered while crawling: %d',
+                     crawl_statistics['broken_url_count'])
 
-    print("Domains : {}".format(index_statistics["domains"]))
-    # for domain in index_statistics["domains"]:
-    #     print("- {}".format(domain))
+    logging.debug('Domains indexed: %s',
+                  index_statistics['domains'])
 
-    print("\nContent Types:")
-    for entry in index_statistics["content_type_frequencies"]:
-        print("{:>5} - {}".format(entry["count"], entry["content_type"]))
+    for entry in index_statistics['content_type_frequencies']:
+        logging.info('Number of type "%s" resources indexed: %s',
+                     entry['content_type'], entry['count'])
 
-    print("\nCharsets:")
-    for entry in index_statistics["charset_frequencies"]:
-        print("{:>5} - {}".format(entry["count"], entry["charset"]))
+    for entry in index_statistics['charset_frequencies']:
+        logging.info('Number of type "%s" charsets indexed: %s',
+                     entry['charset'], entry['count'])
 
 
 def run_index_statistics():
     index_statistics = compute_index_statistics("index")
-    print_index_statistics(index_statistics, None)
+    log_index_statistics(index_statistics, None)
     # persist_index_statistics(index_statistics, "index-statistics.csv")
diff --git a/gus/lib/logging.py b/gus/lib/logging.py
@@ -0,0 +1,29 @@
+import os
+import sys
+import logging.config
+
+
+def add_arguments(parser):
+    """Add arguments to the given argument argparse parser."""
+
+    parser.add_argument(
+        '--logging-config',
+        '-c',
+        dest='logging_ini_fname',
+        default=False,
+        help='Location of logging configuration file'
+    )
+
+
+def handle_arguments(args):
+    """Take arguments from argparse arguments and configure logging."""
+
+    if args.logging_ini_fname:
+        if os.path.isfile(args.logging_ini_fname):
+            logging.config.fileConfig(args.logging_ini_fname)
+        else:
+            sys.exit('Can not find logging ini file: %s' %
+                     args.logging_ini_fname)
+
+    elif os.path.isfile('logging.ini'):
+        logging.config.fileConfig('logging.ini')
diff --git a/logging.ini b/logging.ini
@@ -0,0 +1,21 @@
+[loggers]
+keys=root
+
+[handlers]
+keys=dev
+
+[formatters]
+keys=plain
+
+[logger_root]
+level=DEBUG
+handlers=dev
+
+[handler_dev]
+class=StreamHandler
+level=DEBUG
+formatter=plain
+args=(sys.stdout,)
+
+[formatter_plain]
+format=%(asctime)s %(module)-8s %(levelname)-8s %(message)s