index_statistics.py (5575B)
import logging
from datetime import datetime

from peewee import fn

from gus.excludes import EXCLUDED_URL_PREFIXES
from gus.lib.db_model import Page


def compute_index_statistics(db):
    # Only pages whose most recent crawl succeeded with Gemini status 20
    # count toward the index.
    page_count = len(Page.raw("""SELECT DISTINCT p.id
        FROM page AS p
        WHERE last_crawl_success_at IS NOT NULL AND last_success_status = 20""").dicts())

    domains_query = Page.raw("""SELECT DISTINCT p.domain, p.port
        FROM page AS p
        WHERE last_crawl_success_at IS NOT NULL AND last_success_status = 20""")
    domains = []
    for d in domains_query.execute():
        # Only include the port when it differs from the Gemini default (1965).
        s = d.domain
        if d.port != 1965:
            s = f"{d.domain}:{d.port}"
        should_skip = False
        for excluded_prefix in EXCLUDED_URL_PREFIXES:
            if f"gemini://{s}".startswith(excluded_prefix):
                should_skip = True
                break
        if should_skip:
            continue
        domains.append(s)
    domain_count = len(domains)

    content_type_frequencies = (Page.raw("""SELECT p.content_type, count(p.content_type) AS 'count'
        FROM page AS p
        WHERE last_crawl_success_at IS NOT NULL AND last_success_status = 20
        GROUP BY p.content_type
        ORDER BY 2 DESC""").dicts())
    # Alias the upper-cased charset so downstream consumers can read
    # entry["charset"]; without the alias the dict key would be the raw
    # expression "upper(p.charset)".
    charset_frequencies = (Page.raw("""SELECT upper(p.charset) AS charset, count(p.id) AS 'count'
        FROM page AS p
        WHERE last_crawl_success_at IS NOT NULL AND last_success_status = 20 AND p.charset IS NOT NULL
        GROUP BY upper(p.charset)
        ORDER BY 2 DESC""").dicts())
    index_modification_time = Page.select(fn.Max(Page.last_crawl_at)).scalar()

    return {
        "index_modification_time": index_modification_time,
        "page_count": page_count,
        "domain_count": domain_count,
        "content_type_frequencies": content_type_frequencies,
        "charset_frequencies": charset_frequencies,
        "domains": "",
    }


def log_index_statistics(index_statistics, crawl_statistics=None):
    logging.info('Index generated on: %s',
                 '{:%Y-%m-%d}'.format(index_statistics['index_modification_time']))
    logging.info('Number of pages indexed: %d',
                 index_statistics['page_count'])
    logging.info('Number of domains indexed: %d',
                 index_statistics['domain_count'])

    if crawl_statistics:
        logging.info('Number of redirects crawled: %d',
                     crawl_statistics['redirect_count'])
        logging.info('Number of redirects crawled (nontrivial): %d',
                     crawl_statistics['redirect_nontrivial_count'])
        logging.info('Number of broken URLs encountered while crawling: %d',
                     crawl_statistics['broken_url_count'])

    for entry in index_statistics['content_type_frequencies']:
        logging.info('Number of type "%s" resources indexed: %s',
                     entry['content_type'], entry['count'])

    for entry in index_statistics['charset_frequencies']:
        logging.info('Number of charset "%s" resources indexed: %s',
                     entry['charset'], entry['count'])


def persist_statistics(index_statistics, crawl_statistics, was_destructive, filename):
    with open(filename, "a") as f:
        f.write(serialize_statistics_line(index_statistics, crawl_statistics, was_destructive))


def serialize_statistics_line(index_statistics, crawl_statistics, was_destructive):
    # CSV layout: date, was_destructive, page_count, domain_count,
    # redirect_count, redirect_nontrivial_count, broken_url_count,
    # domains (currently unused), content_type_frequencies, charset_frequencies.
    # The two frequency fields are pipe-delimited "name:count" pairs.
    return "{:%Y-%m-%d},{},{},{},{},{},{},{},{},{}\n".format(
        index_statistics["index_modification_time"],
        was_destructive,
        index_statistics["page_count"],
        index_statistics["domain_count"],
        crawl_statistics["redirect_count"] if crawl_statistics else 0,
        crawl_statistics["redirect_nontrivial_count"] if crawl_statistics else 0,
        crawl_statistics["broken_url_count"] if crawl_statistics else 0,
        "",
        "|".join("{}:{}".format(entry["content_type"], entry["count"])
                 for entry in index_statistics["content_type_frequencies"]),
        "|".join("{}:{}".format(entry["charset"], entry["count"])
                 for entry in index_statistics["charset_frequencies"]),
    )


def load_last_statistics_from_file(filename):
    with open(filename) as f:
        data = f.readlines()
    lastline = data[-1].strip()
    statistics = deserialize_statistics_line(lastline)
    return statistics


def load_all_statistics_from_file(filename):
    with open(filename) as f:
        data = f.readlines()
    # The file's first line is skipped; only the date and the page/domain
    # counts are kept for each remaining line.
    return [{
        "date": datetime.strptime(line.split(",")[0], "%Y-%m-%d"),
        "page_count": int(line.split(",")[2]),
        "domain_count": int(line.split(",")[3]),
    } for line in data[1:]]


def deserialize_statistics_line(line):
    line_parts = line.split(",")
    index_modification_time = datetime.strptime(line_parts[0], "%Y-%m-%d")
    # discard line_parts[1], which is `was_destructive`
    page_count = int(line_parts[2])
    domain_count = int(line_parts[3])
    redirect_count = int(line_parts[4])
    redirect_nontrivial_count = int(line_parts[5])
    broken_url_count = int(line_parts[6])
    # line_parts[7] is the (empty) domains field. Split each frequency entry
    # on its last colon so a stray colon in a name cannot corrupt the count.
    content_type_frequencies = [tuple(entry.rsplit(":", 1)) for entry in line_parts[8].split("|")]
    charset_frequencies = [tuple(entry.rsplit(":", 1)) for entry in line_parts[9].split("|")]

    return {
        "index_modification_time": index_modification_time,
        "page_count": page_count,
        "domain_count": domain_count,
        "redirect_count": redirect_count,
        "redirect_nontrivial_count": redirect_nontrivial_count,
        "broken_url_count": broken_url_count,
        "domains": "",
        "content_type_frequencies": content_type_frequencies,
        "charset_frequencies": charset_frequencies,
    }
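

if __name__ == "__main__":
    # Usage sketch, not part of the original module: compute, log, and
    # persist one statistics line, then read it back. Assumes the peewee
    # models in gus.lib.db_model are already bound to a database by the
    # caller (the db argument is unused above); "statistics.csv" and the
    # zeroed crawl_statistics values are illustrative placeholders.
    logging.basicConfig(level=logging.INFO)
    index_statistics = compute_index_statistics(None)
    crawl_statistics = {
        "redirect_count": 0,
        "redirect_nontrivial_count": 0,
        "broken_url_count": 0,
    }
    log_index_statistics(index_statistics, crawl_statistics)
    persist_statistics(index_statistics, crawl_statistics, False, "statistics.csv")
    print(load_last_statistics_from_file("statistics.csv"))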