geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git

build_index.py (7051B)


import argparse
import logging

from datetime import datetime, timedelta
from urllib.parse import uses_relative, uses_netloc
from peewee import fn

from . import constants
from gus.crawl import should_skip
from gus.excludes import EXCLUDED_URL_PREFIXES
from gus.lib.db_model import init_db, Page, PageContent
from gus.lib.gemini import GeminiResource
from gus.lib.index_statistics import (
    compute_index_statistics,
    persist_statistics,
    log_index_statistics,
)
import gus.lib.logging
from gus.lib.logging import strip_control_chars
import gus.lib.search as search

# hack: the built-in methods in urllib need to know the
# Gemini protocol exists
uses_relative.append("gemini")
uses_netloc.append("gemini")


def index_page(index, page):
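    """Add a single page to the search index.

    Returns True when the document was added, False when the URL is
    excluded or indexing fails.
    """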
    if should_skip(GeminiResource(page.url)):
        logging.debug(
            "URL is excluded, skipping: %s",
            strip_control_chars(page.url),
        )
        return False

    logging.info("Indexing page: %s", strip_control_chars(page.url))

    u = page.url.rstrip("/")
    external_backlinks = Page.raw(
        """SELECT p_from.url
FROM page AS p_from
JOIN link as l ON l.from_page_id == p_from.id
JOIN page as p_to ON p_to.id == l.to_page_id
WHERE p_to.url == ?
AND l.is_cross_host_like == 1""",
        u
    )

    logging.debug("Calculating backlinks for %s", u)
    backlink_urls = [b.url for b in external_backlinks.execute()]
    backlink_count = len(backlink_urls)

    document = {
        "url_id": page.url,
        "url": page.url,
        "domain": page.domain,
        "port": page.port,
        "content_type": page.content_type,
        "charset": page.charset or "none",
        "lang": page.lang,
        "size": page.size,
        "indexed_at": datetime.utcnow(),
        "backlink_count": backlink_count,
    }

    pagecontent = PageContent.get_or_none(page_id=page.id)
    if pagecontent is not None:
        document["prompt"] = pagecontent.prompt
        document["content"] = pagecontent.content

    try:
        logging.debug("Adding document to index: %s", page.url)
        index.add_document(document)
        logging.debug("Document done")

        return True
    except Exception as e:
        logging.exception(
            "Failed to index page: %s: %s",
            strip_control_chars(page.url),
            e
        )
        return False


def build_index(should_run_destructive=False):
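    """Build or update the search index from the crawl database.

    Stale pages and domains are pruned first; afterwards pages with a
    successful last crawl are (re)indexed, skipping text pages above the
    size limit. With should_run_destructive=True a fresh index is created
    instead of an incremental update.
    """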
    index_dir = constants.INDEX_DIR_NEW if should_run_destructive else constants.INDEX_DIR

    db = init_db(f"{index_dir}/{constants.DB_FILENAME}")
    index = search.Index(index_dir, should_run_destructive)

    # delete pages that were crawled but never crawled successfully
    count = 0
    q = Page.select().where(Page.last_crawl_success_at.is_null(True) & Page.last_crawl_at.is_null(False))
    for page in q.iterator():
        try:
            index.delete_by_term("url_id", page.url)
            count += page.delete_instance()
        except Exception as e:
            logging.error("Failed to delete row %s without successful crawl: %s", page.url, e)
    logging.warning("Deleted %d rows without successful crawl", count)

    # delete pages whose last successful crawl is older than 30 days and which have been
    # recrawled since then; this avoids deleting files whose change_frequency is longer
    # than our timeout
    count = 0
    q = Page.select().where((Page.last_crawl_at > Page.last_crawl_success_at) & (Page.last_crawl_success_at < (datetime.now() - timedelta(days=30))))
    for page in q.iterator():
        try:
            index.delete_by_term("url_id", page.url)
            count += page.delete_instance()
        except Exception as e:
            logging.error("Failed to delete row %s with outdated successful crawl: %s", page.id, e)
    logging.warning("Deleted %d rows with outdated successful crawl", count)

    # delete all pages of domains that have no page with a recent successful crawl
    last_valid_timestamp = datetime.now() - timedelta(days=30)
    outdated_domains_query = Page.select(
        Page.domain,
        fn.MAX(Page.last_crawl_at).alias("last_crawl_at"),
        fn.MAX(Page.last_crawl_success_at).alias("last_crawl_success_at"),
    ).where(Page.last_crawl_at.is_null(False) & Page.last_crawl_success_at.is_null(False)).group_by(Page.domain)
    domains = outdated_domains_query.execute()
    for del_domain in domains:
        try:
            if del_domain.last_crawl_success_at < last_valid_timestamp and del_domain.last_crawl_at > del_domain.last_crawl_success_at:
                logging.warning("Deleting pages for domain: %s, last crawl: %s, last crawl success: %s", del_domain.domain, del_domain.last_crawl_at, del_domain.last_crawl_success_at)
                outdated_pages_query = Page.select(Page.url).where(Page.domain == del_domain.domain)
                for outdated_page in outdated_pages_query.iterator():
                    # every page has to be deleted individually, because "delete_by_term"
                    # does not work on fields that are run through a stemmer, like the
                    # "domain" text field
                    index.delete_by_term("url_id", outdated_page.url)
                    outdated_page.delete_instance()
        except Exception as e:
            logging.error("Failed to delete domain %s (last crawl: %s, last crawl success: %s) with outdated successful crawl: %s", del_domain.domain, del_domain.last_crawl_at, del_domain.last_crawl_success_at, e)

    # select pages to (re)index: on a destructive run every page with a successful last
    # crawl, otherwise only pages not yet indexed or recrawled since they were last
    # indexed; text pages above the size limit are skipped either way
    if should_run_destructive:
        pages = Page.raw(
            """SELECT p.* FROM page AS p
WHERE p.last_success_status == 20
AND (p.content_type NOT LIKE 'text/%'
OR (p.content_type LIKE 'text/%' AND p.size <= ?))""", constants.MAXIMUM_TEXT_PAGE_SIZE
        )
    else:
        pages = Page.raw(
            """SELECT p.* FROM page AS p
WHERE p.last_success_status == 20
AND (p.indexed_at IS NULL OR
p.indexed_at < p.last_crawl_success_at)
AND (p.content_type NOT LIKE 'text/%'
OR (p.content_type LIKE 'text/%' AND p.size <= ?))""", constants.MAXIMUM_TEXT_PAGE_SIZE
        )

    for page in pages:
        index_page(index, page)
        page.indexed_at = datetime.utcnow()
        page.save()
    try:
        logging.info("Committing search index...")
        index.close()
    except Exception as e:
        logging.error("Closing of index failed: %s", e)

    logging.debug("Updating statistics...")
    index_statistics = compute_index_statistics(db)
    log_index_statistics(index_statistics)
    persist_statistics(index_statistics, None, should_run_destructive, "statistics.csv")

    logging.info("Finished!")


def main():
    args = parse_args()
    gus.lib.logging.handle_arguments(args)
    build_index(args.should_run_destructive)


def parse_args():
    parser = argparse.ArgumentParser(description="Build the Geminispace search index.")
    parser.add_argument(
        "--destructive",
        "-d",
        dest="should_run_destructive",
        action="store_true",
        default=False,
        help="create a fresh index",
    )
    gus.lib.logging.add_arguments(parser)
    args = parser.parse_args()
    return args


if __name__ == "__main__":
    main()
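
For orientation, a minimal sketch of how this module can be driven from Python, assuming it is importable as gus.build_index (the module path is inferred from the imports above, not confirmed by this page):

    from gus.build_index import build_index

    # incremental run: only pages successfully recrawled since the last pass are reindexed
    build_index(should_run_destructive=False)

    # destructive run: builds a fresh index in constants.INDEX_DIR_NEW
    build_index(should_run_destructive=True)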