build_index.py (7051B)
import argparse
import logging

from datetime import datetime, timedelta
from urllib.parse import uses_relative, uses_netloc
from peewee import fn

from . import constants
from gus.crawl import should_skip
from gus.excludes import EXCLUDED_URL_PREFIXES
from gus.lib.db_model import init_db, Page, PageContent
from gus.lib.gemini import GeminiResource
from gus.lib.index_statistics import (
    compute_index_statistics,
    persist_statistics,
    log_index_statistics,
)
import gus.lib.logging
from gus.lib.logging import strip_control_chars
import gus.lib.search as search

# hack: the built-in methods in urllib need to know the
# Gemini protocol exists
uses_relative.append("gemini")
uses_netloc.append("gemini")


def index_page(index, page):
    if should_skip(GeminiResource(page.url)):
        logging.debug(
            "URL is excluded, skipping: %s",
            strip_control_chars(page.url),
        )
        return False

    logging.info("Indexing page: %s", strip_control_chars(page.url))

    u = page.url.rstrip("/")
    external_backlinks = Page.raw(
        """SELECT p_from.url
           FROM page AS p_from
           JOIN link AS l ON l.from_page_id == p_from.id
           JOIN page AS p_to ON p_to.id == l.to_page_id
           WHERE p_to.url == ?
           AND l.is_cross_host_like == 1""",
        u,
    )

    logging.debug("Calculating backlinks for %s", u)
    backlink_urls = [b.url for b in external_backlinks.execute()]
    backlink_count = len(backlink_urls)

    document = {
        "url_id": page.url,
        "url": page.url,
        "domain": page.domain,
        "port": page.port,
        "content_type": page.content_type,
        "charset": page.charset or "none",
        "lang": page.lang,
        "size": page.size,
        "indexed_at": datetime.utcnow(),
        "backlink_count": backlink_count,
    }

    pagecontent = PageContent.get_or_none(page_id=page.id)
    if pagecontent is not None:
        document["prompt"] = pagecontent.prompt
        document["content"] = pagecontent.content

    try:
        logging.debug("Adding document to index: %s", page.url)
        index.add_document(document)
        logging.debug("Document done")

        return True
    except Exception as e:
        logging.exception(
            "Failed to index page: %s: %s",
            strip_control_chars(page.url),
            e,
        )
        return False


def build_index(should_run_destructive=False):
    index_dir = constants.INDEX_DIR_NEW if should_run_destructive else constants.INDEX_DIR

    db = init_db(f"{index_dir}/{constants.DB_FILENAME}")
    index = search.Index(index_dir, should_run_destructive)

    # delete pages that were never successfully crawled
    count = 0
    q = Page.select().where(
        Page.last_crawl_success_at.is_null(True) & Page.last_crawl_at.is_null(False)
    )
    for page in q.iterator():
        try:
            index.delete_by_term("url_id", page.url)
            count += page.delete_instance()
        except Exception as e:
            logging.error("Failed to delete row %s without successful crawl: %s", page.url, e)
    logging.warning("Deleted %d rows without successful crawl", count)

    # delete pages whose last successful crawl is older than 30 days and which
    # have been recrawled since then; this avoids deleting files that have a
    # change_frequency longer than our timeout
    count = 0
    q = Page.select().where(
        (Page.last_crawl_at > Page.last_crawl_success_at)
        & (Page.last_crawl_success_at < (datetime.now() - timedelta(days=30)))
    )
    for page in q.iterator():
        try:
            index.delete_by_term("url_id", page.url)
            count += page.delete_instance()
        except Exception as e:
            logging.error("Failed to delete row %s with outdated successful crawl: %s", page.id, e)
    logging.warning("Deleted %d rows with outdated successful crawl", count)

    # delete entire domains that have no page with a recent successful crawl
    last_valid_timestamp = datetime.now() - timedelta(days=30)
    outdated_domains_query = (
        Page.select(
            Page.domain,
            fn.MAX(Page.last_crawl_at).alias("last_crawl_at"),
            fn.MAX(Page.last_crawl_success_at).alias("last_crawl_success_at"),
        )
        .where(Page.last_crawl_at.is_null(False) & Page.last_crawl_success_at.is_null(False))
        .group_by(Page.domain)
    )
    domains = outdated_domains_query.execute()
    for del_domain in domains:
        try:
            if (
                del_domain.last_crawl_success_at < last_valid_timestamp
                and del_domain.last_crawl_at > del_domain.last_crawl_success_at
            ):
                logging.warning(
                    "Deleting pages for domain: %s, last crawl: %s, last crawl success: %s",
                    del_domain.domain,
                    del_domain.last_crawl_at,
                    del_domain.last_crawl_success_at,
                )
                outdated_pages_query = Page.select(Page.url).where(Page.domain == del_domain.domain)
                for outdated_page in outdated_pages_query.iterator():
                    # we need to delete every single page by URL, as "delete_by_term"
                    # does not work on fields that are run through a stemmer, as the
                    # "domain" text field is
                    index.delete_by_term("url_id", outdated_page.url)
                    outdated_page.delete_instance()
        except Exception as e:
            logging.error(
                "Failed to delete domain %s (last crawl: %s, last crawl success: %s) with outdated successful crawl: %s",
                del_domain.domain,
                del_domain.last_crawl_at,
                del_domain.last_crawl_success_at,
                e,
            )

    if should_run_destructive:
        pages = Page.raw(
            """SELECT p.* FROM page AS p
               WHERE p.last_success_status == 20
               AND (p.content_type NOT LIKE 'text/%'
               OR (p.content_type LIKE 'text/%' AND p.size <= ?))""",
            constants.MAXIMUM_TEXT_PAGE_SIZE,
        )
    else:
        pages = Page.raw(
            """SELECT p.* FROM page AS p
               WHERE p.last_success_status == 20
               AND (p.indexed_at IS NULL OR
                    p.indexed_at < p.last_crawl_success_at)
               AND (p.content_type NOT LIKE 'text/%'
               OR (p.content_type LIKE 'text/%' AND p.size <= ?))""",
            constants.MAXIMUM_TEXT_PAGE_SIZE,
        )

    for page in pages:
        index_page(index, page)
        page.indexed_at = datetime.utcnow()
        page.save()

    try:
        logging.info("Committing search index...")
        index.close()
    except Exception as e:
        logging.error("Closing of index failed: %s", e)

    logging.debug("Updating statistics...")
    index_statistics = compute_index_statistics(db)
    log_index_statistics(index_statistics)
    persist_statistics(index_statistics, None, should_run_destructive, "statistics.csv")

    logging.info("Finished!")


def main():
    args = parse_args()
    gus.lib.logging.handle_arguments(args)
    build_index(args.should_run_destructive)


def parse_args():
    parser = argparse.ArgumentParser(description="Crawl Geminispace.")
    parser.add_argument(
        "--destructive",
        "-d",
        dest="should_run_destructive",
        action="store_true",
        default=False,
        help="create a fresh index",
    )
    gus.lib.logging.add_arguments(parser)
    args = parser.parse_args()
    return args


if __name__ == "__main__":
    main()