gemini.py (13950B)
import re
from urllib.parse import (
    quote,
    unquote,
    urljoin,
    urlparse,
    urlsplit,
    urlunparse,
    urlunsplit,
    uses_relative,
    uses_netloc,
)
from urllib.robotparser import RobotFileParser

import gusmobile
from gus import constants
from gus.lib.domain import is_domain

# hack: the built-in methods in urllib need to know the
# Gemini protocol exists
uses_relative.append("gemini")
uses_netloc.append("gemini")

LOG_ROOT_LIKE_PATTERN = re.compile(
    r".*/(gemlog|glog|journal|twinlog|posts|post|tangents|phlog|starlog|pikkulog|blog|log)/?$",
    flags=re.IGNORECASE,
)
LOG_POST_LIKE_PATTERN = re.compile(
    r".*/((gemlog|glog|journal|twinlog|posts|post|tangents|phlog|starlog|pikkulog|blog|log)/.+$|.*/\d{4}[-_]\d{2}[-_]\d{2}.*$|.*/(19|20)\d{6}.*$)",
    flags=re.IGNORECASE,
)
LOG_POST_LIKE_EXCLUSION_PATTERN = re.compile(
    r".*/(games|archive|archives|rss|handlers|diagnostics)/.*|.*atom.xml$|.*gemlog.gmi$|.*index.gmi$|.*index.gemini$",
    flags=re.IGNORECASE,
)
LOG_POST_GEMLOGBLUE_LIKE_PATTERN = re.compile(
    r"^/users/[a-z][-a-z0-9]*/\d+\.gmi?", flags=re.IGNORECASE
)
LOG_POST_BOSTON_LIKE_PATTERN = re.compile(
    r"^/boston/\d{4}/\d{2}/\d+\.\d+", flags=re.IGNORECASE
)

ROOT_LIKE_ONLY_PATTERN = re.compile(
    r"^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)/?$", flags=re.IGNORECASE
)
ROOT_LIKE_PATTERN = re.compile(
    r"^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)/?", flags=re.IGNORECASE
)

AUTHOR_URL_PATTERN = re.compile(
    r"^/~([a-z][-a-z0-9]*)/|^/users/~?([a-z][-a-z0-9]*)", flags=re.IGNORECASE
)
AUTHOR_CONTENT_PATTERN = re.compile(
    r".*(by|author): ([\w\s\d]+)", flags=re.IGNORECASE | re.MULTILINE
)

TITLE_CONTENT_PATTERN = re.compile(r"^#\s(.*)$", flags=re.IGNORECASE | re.MULTILINE)
TITLE_URL_PATTERN = re.compile(
    r".*/(\d{8}[-_]|\d{4}[-_]\d{2}[-_]\d{2}[-_])?([a-z0-9-_]+)(\.[a-z0-9]+)$",
    flags=re.IGNORECASE,
)


class GeminiRobotFileParser(RobotFileParser):
    """A `RobotFileParser` that retrieves robots.txt via `GeminiResource.fetch`."""

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        # NOTE: urlsplit_featureful returns (None, None) for URLs it rejects,
        # in which case the slicing below raises TypeError.
        u, _ = GeminiResource.urlsplit_featureful(url)
        # elements 1 and 2 of a split result are netloc and path
        self.host, self.path = u[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser.

        Everything is allowed when the fetch yields no response, when the
        status does not start with "2", or when the content type is not
        "text/*"; otherwise the response content is parsed line by line.
        """
        gr = GeminiResource(self.url)
        response = gr.fetch()
        if response is None:
            self.allow_all = True
            return
        if not response.status.startswith("2") or not response.content_type.startswith("text/"):
            self.allow_all = True
        else:
            self.parse(response.content.splitlines())

    def read_from_string(self, robots_txt):
        """A utility method for writing tests: parse robots.txt text directly."""
        self.parse(robots_txt.splitlines())

    def can_fetch_prioritized(self, useragents, url):
        """Given a url and prioritized list of user-agents, is fetching allowed?

        Priority is with the highest priority first; eg. ["ThisIndexerBot", "generic-indexer", "generic-bot", "*"].

        Returns the allowance of the first user-agent (in the given order)
        that has an applicable entry, then falls back to the default entry,
        and finally to True when nothing matches.
        """
        if self.allow_all:
            return True
        if self.disallow_all:
            return False

        # nothing has been recorded as checked yet, so refuse
        if not self.last_checked:
            return False

        # reduce the URL to its quoted path/params/query/fragment, which is
        # the form passed to the entries' allowance checks
        parsed_url = urlparse(unquote(url))
        url = urlunparse(
            ("", "", parsed_url.path, parsed_url.params, parsed_url.query, parsed_url.fragment)
        )
        url = quote(url) or "/"

        def useragent_allowed(useragent):
            # allowance of the first entry applying to this agent, or None
            for entry in self.entries:
                if entry.applies_to(useragent):
                    return entry.allowance(url)
            return None

        # map user-agents to allowances; the first non-None will be the prioritized allowance
        for ua in useragents:
            allowed = useragent_allowed(ua)
            if allowed is not None:
                return allowed

        # if none of the user-agents match, check default entry
        if self.default_entry:
            return self.default_entry.allowance(url)

        # if nothing matches, crawling is allowed
        return True


class GeminiResource:
    """A Gemini URL plus lazily computed, cached facts derived from it.

    `is_valid` is False when the URL could not be interpreted as a Gemini
    URL; in that case `urlsplit` is None.
    """

    def __init__(self, url, fully_qualified_parent_url=None, parent_hostname=None):
        """Split `url`, resolving relative links against the given parent.

        `parent_hostname` is used for links starting with "/", and
        `fully_qualified_parent_url` for all other relative links.
        """
        self.raw_url = url
        self.urlsplit, self.is_relative = GeminiResource.urlsplit_featureful(
            url,
            fully_qualified_parent_url=fully_qualified_parent_url,
            parent_hostname=parent_hostname,
        )
        self.is_valid = self.urlsplit is not None
        self.fully_qualified_parent_url = fully_qualified_parent_url
        # lazily filled caches backing the properties at the end of the class
        self._normalized_host = None
        self._normalized_host_like = None
        self._fetchable_url = None
        self._is_root_like = None
        self._is_log_root_like = None
        self._is_log_post_like = None
        # category -> default change frequency; keyed by category so that a
        # lookup for one category never answers a lookup for another
        self._default_change_frequency = {}
        self.contained_resources = None

    @staticmethod
    def urlsplit_featureful(url, fully_qualified_parent_url=None, parent_hostname=None):
        """Split `url` into a `(split_result, is_relative)` pair.

        Returns `(None, None)` when the scheme is not gemini, when the URL
        is schemeless but starts with something `is_domain` accepts, or when
        a relative link cannot be resolved because the needed parent
        information is missing.
        """
        # the point of this relatively complex function is to allow for protocol-less,
        # double-slash-prepended-less URLs that still get treated as absolute (i.e.,
        # non-relative) URLs and thus get their hosts parsed correctly by `urlsplit`.
        # This is important because I want to be able to use the host for a number of
        # things behind the scenes.

        is_relative = False
        u = urlsplit(url, "gemini")
        if u.scheme != "gemini":
            return None, None
        if u.hostname is None:
            if url.startswith("/"):
                # process relative link
                if parent_hostname is None:
                    return None, None
                joined = urljoin("gemini://{}".format(parent_hostname), url)
                u = urlsplit(joined, "gemini")
                is_relative = True
            else:  # url does not start with /
                # could be: blah.com/test
                # could be: test
                url_split = url.split("/")
                if is_domain(url_split[0]):
                    # treat schemeless uris as non-gemini as announced in
                    # https://lists.orbitalfox.eu/archives/gemini/2020/003646.html
                    return None, None
                else:
                    # process relative link
                    if fully_qualified_parent_url is None:
                        return None, None
                    joined = urljoin(fully_qualified_parent_url, url)
                    u = urlsplit(joined, "gemini")
                    is_relative = True
        return u, is_relative

    def _get_normalized_host(self):
        """Return the hostname of the split URL, or None for invalid resources."""
        if not self.is_valid:
            return None
        if self._normalized_host is None:
            self._normalized_host = self.urlsplit.hostname
        return self._normalized_host

    def _get_normalized_host_like(self):
        """Return the host, extended by a leading user directory if the path has one.

        The extension is whatever `ROOT_LIKE_PATTERN` matches at the start
        of the path, with trailing slashes stripped. Returns None for
        invalid resources.
        """
        if not self.is_valid:
            return None
        if self._normalized_host_like is None:
            normalized_host_like = self.normalized_host
            m = ROOT_LIKE_PATTERN.match(self.urlsplit.path)
            if m:
                normalized_host_like += m[0].rstrip("/")
            self._normalized_host_like = normalized_host_like
        return self._normalized_host_like

    def _get_fetchable_url(self):
        """Return the URL to request, rebuilt from the split parts.

        The port is omitted when absent or 1965, an empty path becomes "/",
        and the query is appended only when non-empty. Returns None for
        invalid resources.
        """
        if not self.is_valid:
            return None
        if self._fetchable_url is None:
            parts = self.urlsplit
            port = parts.port
            # we deliberately do not work with the fragment part
            self._fetchable_url = "{}://{}{}{}{}".format(
                parts.scheme,
                parts.hostname,
                "" if port is None or port == 1965 else ":{}".format(port),
                "/" if parts.path == "" else parts.path,
                "" if parts.query == "" else "?{}".format(parts.query),
            )
        return self._fetchable_url

    def _get_is_root_like(self):
        """Return True if the path is empty, "/", or matches `ROOT_LIKE_ONLY_PATTERN`."""
        if self._is_root_like is None:
            is_root_like = False
            if (
                self.urlsplit.path == ""
                or self.urlsplit.path == "/"
                or ROOT_LIKE_ONLY_PATTERN.match(self.urlsplit.path)
            ):
                is_root_like = True
            self._is_root_like = is_root_like
        return self._is_root_like

    def _get_is_log_root_like(self):
        """Return True if the path is empty, "/", or matches `LOG_ROOT_LIKE_PATTERN`."""
        if self._is_log_root_like is None:
            is_log_root_like = False
            if (
                self.urlsplit.path == ""
                or self.urlsplit.path == "/"
                or LOG_ROOT_LIKE_PATTERN.match(self.urlsplit.path)
            ):
                is_log_root_like = True
            self._is_log_root_like = is_log_root_like
        return self._is_log_root_like

    def _get_is_log_post_like(self):
        """Return True if the path looks like an individual log post.

        That is: the generic post pattern matches and the exclusion pattern
        does not, or one of the two host-specific patterns matches on its
        host (gemlog.blue, gemini.conman.org).
        """
        if self._is_log_post_like is None:
            is_log_post_like = False
            post_like_match = LOG_POST_LIKE_PATTERN.match(self.urlsplit.path)
            post_like_exclusion_match = LOG_POST_LIKE_EXCLUSION_PATTERN.match(
                self.urlsplit.path
            )
            post_gemlogblue_match = LOG_POST_GEMLOGBLUE_LIKE_PATTERN.match(
                self.urlsplit.path
            )
            post_boston_match = LOG_POST_BOSTON_LIKE_PATTERN.match(self.urlsplit.path)

            if (
                (post_like_match and not post_like_exclusion_match)
                or (self.normalized_host == "gemlog.blue" and post_gemlogblue_match)
                or (self.normalized_host == "gemini.conman.org" and post_boston_match)
            ):
                is_log_post_like = True
            self._is_log_post_like = is_log_post_like
        return self._is_log_post_like

    def get_default_change_frequency(self, category):
        """Return the default change frequency constant for `category`.

        Recognized categories are "content", "binary", "redirect",
        "temp_error", "perm_error" and "prompt". For "content" the value
        depends on whether the resource is root-like or log-root-like.
        Results are cached per category. Returns None for invalid resources.

        Raises:
            ValueError: if `category` is not one of the recognized names.
        """
        if not self.is_valid:
            return None
        if category not in self._default_change_frequency:
            if category == "content":
                if self.is_root_like or self.is_log_root_like:
                    change_frequency = constants.ROOT_CHANGE_FREQUENCY_DEFAULT
                else:
                    change_frequency = constants.NON_ROOT_CHANGE_FREQUENCY_DEFAULT
            elif category == "binary":
                change_frequency = constants.BINARY_CHANGE_FREQUENCY_DEFAULT
            elif category == "redirect":
                change_frequency = constants.REDIRECT_CHANGE_FREQUENCY_DEFAULT
            elif category == "temp_error":
                change_frequency = constants.TEMP_ERROR_CHANGE_FREQUENCY_DEFAULT
            elif category == "perm_error":
                change_frequency = constants.PERM_ERROR_CHANGE_FREQUENCY_DEFAULT
            elif category == "prompt":
                change_frequency = constants.PROMPT_CHANGE_FREQUENCY_DEFAULT
            else:
                raise ValueError("Unrecognized resource category")

            self._default_change_frequency[category] = change_frequency
        return self._default_change_frequency[category]

    def increment_change_frequency(self, existing_change_frequency, category):
        """Return `existing_change_frequency` plus the increment for `category`.

        Accepts the same category names as `get_default_change_frequency`;
        for "content" the increment depends on whether the resource is
        root-like or log-root-like.

        Raises:
            ValueError: if `category` is not one of the recognized names.
        """
        if category == "content":
            if self.is_root_like or self.is_log_root_like:
                return existing_change_frequency + constants.ROOT_CHANGE_FREQUENCY_INCREMENT
            else:
                return existing_change_frequency + constants.NON_ROOT_CHANGE_FREQUENCY_INCREMENT
        elif category == "binary":
            return existing_change_frequency + constants.BINARY_CHANGE_FREQUENCY_INCREMENT
        elif category == "redirect":
            return existing_change_frequency + constants.REDIRECT_CHANGE_FREQUENCY_INCREMENT
        elif category == "temp_error":
            return existing_change_frequency + constants.TEMP_ERROR_CHANGE_FREQUENCY_INCREMENT
        elif category == "perm_error":
            return existing_change_frequency + constants.PERM_ERROR_CHANGE_FREQUENCY_INCREMENT
        elif category == "prompt":
            return existing_change_frequency + constants.PROMPT_CHANGE_FREQUENCY_INCREMENT
        else:
            raise ValueError("Unrecognized resource category")

    def fetch(self):
        """Fetch this resource's `fetchable_url` through `gusmobile`."""
        # NB: this intentionally does NOT fetch the normalized URL, because that could
        # cause an infinite loop with, e.g., normalization stripping a trailing slash
        # and a server redirecting to the same URL _with_ a trailing slash.
        return gusmobile.fetch(self.fetchable_url)

    def extract_contained_resources(self, content):
        """Return the valid `GeminiResource`s linked from `content`.

        Links are lines starting with "=>"; anything between ``` fences is
        ignored. The result is computed once and cached on the instance, so
        later calls return the first result regardless of `content`.
        """
        # this finds all gemini URLs within the content of a given GeminiResource and
        # returns them as a list of new GeminiResources
        if self.contained_resources is not None:
            return self.contained_resources

        link_pattern = r"^=>\s*(\S+)"
        preformat_pattern = r"^```.*?^```"
        content_without_preformat = re.sub(
            preformat_pattern, "", content, flags=re.DOTALL | re.MULTILINE
        )
        probable_urls = re.findall(
            link_pattern, content_without_preformat, re.MULTILINE
        )
        resources = []
        for url in probable_urls:
            resource = GeminiResource(
                url,
                fully_qualified_parent_url=self.fetchable_url,
                parent_hostname=self.urlsplit.hostname.lower(),
            )
            if resource.is_valid:
                resources.append(resource)
        self.contained_resources = resources

        return self.contained_resources

    # hostname taken from the split URL
    normalized_host = property(_get_normalized_host)
    # constructed from urlsplit, should be quoted.
    fetchable_url = property(_get_fetchable_url)
    # derived from the path of the split URL
    is_root_like = property(_get_is_root_like)
    is_log_root_like = property(_get_is_log_root_like)
    is_log_post_like = property(_get_is_log_post_like)
    # pubnix-aware host version, means that the user-specific dir is appended
    normalized_host_like = property(_get_normalized_host_like)