index_statistics.py (5575B)
import logging
from datetime import datetime

from peewee import fn

from gus.excludes import EXCLUDED_URL_PREFIXES
from gus.lib.db_model import Page


def compute_index_statistics(db):
    # Only pages whose most recent crawl succeeded with Gemini status 20
    # count toward the index.
    page_count = len(Page.raw("""SELECT DISTINCT p.id
        FROM page AS p
        WHERE last_crawl_success_at IS NOT NULL AND last_success_status = 20""").dicts())

    domains_query = Page.raw("""SELECT DISTINCT p.domain, p.port
        FROM page AS p
        WHERE last_crawl_success_at IS NOT NULL AND last_success_status = 20""")
    domains = []
    for d in domains_query.execute():
        # Only include the port when it differs from the Gemini default (1965).
        s = d.domain
        if d.port != 1965:
            s = f"{d.domain}:{d.port}"
        should_skip = False
        for excluded_prefix in EXCLUDED_URL_PREFIXES:
            if f"gemini://{s}".startswith(excluded_prefix):
                should_skip = True
                break
        if should_skip:
            continue
        domains.append(s)
    domain_count = len(domains)

    content_type_frequencies = (Page.raw("""SELECT p.content_type, count(p.content_type) AS 'count'
        FROM page AS p
        WHERE last_crawl_success_at IS NOT NULL AND last_success_status = 20
        GROUP BY p.content_type
        ORDER BY 2 DESC""").dicts())
    # Alias the upper-cased charset so downstream consumers can read
    # entry["charset"]; without the alias the dict key would be the raw
    # expression "upper(p.charset)".
    charset_frequencies = (Page.raw("""SELECT upper(p.charset) AS charset, count(p.id) AS 'count'
        FROM page AS p
        WHERE last_crawl_success_at IS NOT NULL AND last_success_status = 20 AND p.charset IS NOT NULL
        GROUP BY upper(p.charset)
        ORDER BY 2 DESC""").dicts())
    index_modification_time = Page.select(fn.Max(Page.last_crawl_at)).scalar()

    return {
        "index_modification_time": index_modification_time,
        "page_count": page_count,
        "domain_count": domain_count,
        "content_type_frequencies": content_type_frequencies,
        "charset_frequencies": charset_frequencies,
        "domains": "",
    }


def log_index_statistics(index_statistics, crawl_statistics=None):
    logging.info('Index generated on: %s',
                 '{:%Y-%m-%d}'.format(index_statistics['index_modification_time']))
    logging.info('Number of pages indexed: %d',
                 index_statistics['page_count'])
    logging.info('Number of domains indexed: %d',
                 index_statistics['domain_count'])

    if crawl_statistics:
        logging.info('Number of redirects crawled: %d',
                     crawl_statistics['redirect_count'])
        logging.info('Number of redirects crawled (nontrivial): %d',
                     crawl_statistics['redirect_nontrivial_count'])
        logging.info('Number of broken URLs encountered while crawling: %d',
                     crawl_statistics['broken_url_count'])

    for entry in index_statistics['content_type_frequencies']:
        logging.info('Number of type "%s" resources indexed: %s',
                     entry['content_type'], entry['count'])

    for entry in index_statistics['charset_frequencies']:
        logging.info('Number of charset "%s" resources indexed: %s',
                     entry['charset'], entry['count'])


def persist_statistics(index_statistics, crawl_statistics, was_destructive, filename):
    with open(filename, "a") as f:
        f.write(serialize_statistics_line(index_statistics, crawl_statistics, was_destructive))


def serialize_statistics_line(index_statistics, crawl_statistics, was_destructive):
    # CSV layout: date, was_destructive, page_count, domain_count,
    # redirect_count, redirect_nontrivial_count, broken_url_count,
    # domains (currently unused), content_type_frequencies, charset_frequencies.
    # The two frequency fields are pipe-delimited "name:count" pairs.
    return "{:%Y-%m-%d},{},{},{},{},{},{},{},{},{}\n".format(
        index_statistics["index_modification_time"],
        was_destructive,
        index_statistics["page_count"],
        index_statistics["domain_count"],
        crawl_statistics["redirect_count"] if crawl_statistics else 0,
        crawl_statistics["redirect_nontrivial_count"] if crawl_statistics else 0,
        crawl_statistics["broken_url_count"] if crawl_statistics else 0,
        "",
        "|".join("{}:{}".format(entry["content_type"], entry["count"])
                 for entry in index_statistics["content_type_frequencies"]),
        "|".join("{}:{}".format(entry["charset"], entry["count"])
                 for entry in index_statistics["charset_frequencies"]),
    )


def load_last_statistics_from_file(filename):
    with open(filename) as f:
        data = f.readlines()
    lastline = data[-1].strip()
    statistics = deserialize_statistics_line(lastline)
    return statistics


def load_all_statistics_from_file(filename):
    with open(filename) as f:
        data = f.readlines()
    # The file's first line is skipped; only the date and the page/domain
    # counts are kept for each remaining line.
    return [{
        "date": datetime.strptime(line.split(",")[0], "%Y-%m-%d"),
        "page_count": int(line.split(",")[2]),
        "domain_count": int(line.split(",")[3]),
    } for line in data[1:]]


def deserialize_statistics_line(line):
    line_parts = line.split(",")
    index_modification_time = datetime.strptime(line_parts[0], "%Y-%m-%d")
    # discard line_parts[1], which is `was_destructive`
    page_count = int(line_parts[2])
    domain_count = int(line_parts[3])
    redirect_count = int(line_parts[4])
    redirect_nontrivial_count = int(line_parts[5])
    broken_url_count = int(line_parts[6])
    # line_parts[7] is the (empty) domains field. Split each frequency entry
    # on its last colon so a stray colon in a name cannot corrupt the count.
    content_type_frequencies = [tuple(entry.rsplit(":", 1)) for entry in line_parts[8].split("|")]
    charset_frequencies = [tuple(entry.rsplit(":", 1)) for entry in line_parts[9].split("|")]

    return {
        "index_modification_time": index_modification_time,
        "page_count": page_count,
        "domain_count": domain_count,
        "redirect_count": redirect_count,
        "redirect_nontrivial_count": redirect_nontrivial_count,
        "broken_url_count": broken_url_count,
        "domains": "",
        "content_type_frequencies": content_type_frequencies,
        "charset_frequencies": charset_frequencies,
    }
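

if __name__ == "__main__":
    # Usage sketch, not part of the original module: compute, log, and
    # persist one statistics line, then read it back. Assumes the peewee
    # models in gus.lib.db_model are already bound to a database by the
    # caller (the db argument is unused above); "statistics.csv" and the
    # zeroed crawl_statistics values are illustrative placeholders.
    logging.basicConfig(level=logging.INFO)
    index_statistics = compute_index_statistics(None)
    crawl_statistics = {
        "redirect_count": 0,
        "redirect_nontrivial_count": 0,
        "broken_url_count": 0,
    }
    log_index_statistics(index_statistics, crawl_statistics)
    persist_statistics(index_statistics, crawl_statistics, False, "statistics.csv")
    print(load_last_statistics_from_file("statistics.csv"))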