geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

gemini.py (13950B)


      1 import re
      2 from urllib.parse import (
      3     quote,
      4     unquote,
      5     urljoin,
      6     urlparse,
      7     urlsplit,
      8     urlunparse,
      9     urlunsplit,
     10     uses_relative,
     11     uses_netloc,
     12 )
     13 from urllib.robotparser import RobotFileParser
     14 
     15 import gusmobile
     16 from gus import constants
     17 from gus.lib.domain import is_domain
     18 
     19 # hack: the built-in methods in urllib need to know the
     20 # Gemini protocol exists
     21 uses_relative.append("gemini")
     22 uses_netloc.append("gemini")
     23 
     24 LOG_ROOT_LIKE_PATTERN = re.compile(
     25     r".*/(gemlog|glog|journal|twinlog|posts|post|tangents|phlog|starlog|pikkulog|blog|log)/?$",
     26     flags=re.IGNORECASE,
     27 )
     28 LOG_POST_LIKE_PATTERN = re.compile(
     29     r".*/((gemlog|glog|journal|twinlog|posts|post|tangents|phlog|starlog|pikkulog|blog|log)/.+$|.*/\d{4}[-_]\d{2}[-_]\d{2}.*$|.*/(19|20)\d{6}.*$)",
     30     flags=re.IGNORECASE,
     31 )
     32 LOG_POST_LIKE_EXCLUSION_PATTERN = re.compile(
     33     r".*/(games|archive|archives|rss|handlers|diagnostics)/.*|.*atom.xml$|.*gemlog.gmi$|.*index.gmi$|.*index.gemini$",
     34     flags=re.IGNORECASE,
     35 )
     36 LOG_POST_GEMLOGBLUE_LIKE_PATTERN = re.compile(
     37     r"^/users/[a-z][-a-z0-9]*/\d+\.gmi?", flags=re.IGNORECASE
     38 )
     39 LOG_POST_BOSTON_LIKE_PATTERN = re.compile(
     40     r"^/boston/\d{4}/\d{2}/\d+\.\d+", flags=re.IGNORECASE
     41 )
     42 
     43 ROOT_LIKE_ONLY_PATTERN = re.compile(
     44     r"^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)/?$", flags=re.IGNORECASE
     45 )
     46 ROOT_LIKE_PATTERN = re.compile(
     47     r"^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)/?", flags=re.IGNORECASE
     48 )
     49 
     50 AUTHOR_URL_PATTERN = re.compile(
     51     r"^/~([a-z][-a-z0-9]*)/|^/users/~?([a-z][-a-z0-9]*)", flags=re.IGNORECASE
     52 )
     53 AUTHOR_CONTENT_PATTERN = re.compile(
     54     r".*(by|author): ([\w\s\d]+)", flags=re.IGNORECASE | re.MULTILINE
     55 )
     56 
     57 TITLE_CONTENT_PATTERN = re.compile(r"^#\s(.*)$", flags=re.IGNORECASE | re.MULTILINE)
     58 TITLE_URL_PATTERN = re.compile(
     59     r".*/(\d{8}[-_]|\d{4}[-_]\d{2}[-_]\d{2}[-_])?([a-z0-9-_]+)(\.[a-z0-9]+)$",
     60     flags=re.IGNORECASE,
     61 )
     62 
     63 
     64 class GeminiRobotFileParser(RobotFileParser):
     65     def set_url(self, url):
     66         """Sets the URL referring to a robots.txt file."""
     67         self.url = url
     68         u, _ = GeminiResource.urlsplit_featureful(url)
     69         self.host, self.path = u[1:3]
     70 
     71     def read(self):
     72         """Reads the robots.txt URL and feeds it to the parser."""
     73         gr = GeminiResource(self.url)
     74         response = gr.fetch()
     75         if response is None:
     76             self.allow_all = True
     77             return
     78         if not response.status.startswith("2") or not response.content_type.startswith("text/"):
     79             self.allow_all = True
     80         else:
     81             self.parse(response.content.splitlines())
     82 
     83     def read_from_string(self, robots_txt):
     84         """An utility method for writing tests"""
     85         self.parse(robots_txt.splitlines())
     86 
     87     def can_fetch_prioritized(self, useragents, url):
     88         """Given a url and prioritized list of user-agents, is fetching allowed?
     89 
     90         Priority is with the highest priority first; eg. ["ThisIndexerBot", "generic-indexer", "generic-bot", "*"].
     91         """
     92         if self.allow_all:
     93             return True
     94         if self.disallow_all:
     95             return False
     96 
     97         if not self.last_checked:
     98             return False
     99 
    100         parsed_url = urlparse(unquote(url))
    101         url = urlunparse(('','',parsed_url.path, parsed_url.params,parsed_url.query, parsed_url.fragment))
    102         url = quote(url) or "/"
    103 
    104         def useragent_allowed(useragent):
    105             for entry in self.entries:
    106                 if entry.applies_to(useragent):
    107                     return entry.allowance(url)
    108             return None
    109 
    110         # map user-agents to allowances; the first non-None will be the prioritized allowance
    111         for ua in useragents:
    112             allowed = useragent_allowed(ua)
    113             if allowed is not None:
    114                 return allowed
    115 
    116         # if none of the user-agents match, check default entry
    117         if self.default_entry:
    118             return self.default_entry.allowance(url)
    119 
    120         # if nothing matches, crawling is allowed
    121         return True
    122 
    123 class GeminiResource:
    124     def __init__(self, url, fully_qualified_parent_url=None, parent_hostname=None):
    125         self.raw_url = url
    126         self.urlsplit, self.is_relative = GeminiResource.urlsplit_featureful(
    127             url,
    128             fully_qualified_parent_url=fully_qualified_parent_url,
    129             parent_hostname=parent_hostname,
    130         )
    131         self.is_valid = self.urlsplit is not None
    132         self.fully_qualified_parent_url = fully_qualified_parent_url
    133         self._normalized_host = None
    134         self._normalized_host_like = None
    135         self._fetchable_url = None
    136         self._is_root_like = None
    137         self._is_log_root_like = None
    138         self._is_log_post_like = None
    139         self._default_change_frequency = None
    140         self.contained_resources = None
    141 
    142     def urlsplit_featureful(url, fully_qualified_parent_url=None, parent_hostname=None):
    143         # the point of this relatively complex function is to allow for protocol-less,
    144         # double-slash-prepended-less URLs that still get treated as absolute (i.e.,
    145         # non-relative) URLs and thus get their hosts parsed correctly by `urlsplit`.
    146         # This is important because I want to be able to use the host for a number of
    147         # things behind the scenes.
    148 
    149         is_relative = False
    150         u = urlsplit(url, "gemini")
    151         if u.scheme != "gemini":
    152             return None, None
    153         if u.hostname is None:
    154             if url.startswith("/"):
    155                 # process relative link
    156                 if parent_hostname is None:
    157                     return None, None
    158                 joined = urljoin("gemini://{}".format(parent_hostname), url)
    159                 u = urlsplit(joined, "gemini")
    160                 is_relative = True
    161             else:  # url does not start with /
    162                 # could be: blah.com/test
    163                 # could be: test
    164                 url_split = url.split("/")
    165                 if is_domain(url_split[0]):
    166                     # treat schemeless uris as non-gemini as announced in
    167                     # https://lists.orbitalfox.eu/archives/gemini/2020/003646.html
    168                     return None, None
    169                 else:
    170                     # process relative link
    171                     if fully_qualified_parent_url is None:
    172                         return None, None
    173                     joined = urljoin(fully_qualified_parent_url, url)
    174                     u = urlsplit(joined, "gemini")
    175                     is_relative = True
    176         return u, is_relative
    177 
    178     def _get_normalized_host(self):
    179         if not self.is_valid:
    180             return None
    181         if self._normalized_host is None:
    182             self._normalized_host = self.urlsplit.hostname
    183         return self._normalized_host
    184 
    185     def _get_normalized_host_like(self):
    186         if not self.is_valid:
    187             return None
    188         if self._normalized_host_like is None:
    189             normalized_host_like = self.normalized_host
    190             m = ROOT_LIKE_PATTERN.match(self.urlsplit.path)
    191             if m:
    192                 normalized_host_like += m[0].rstrip("/")
    193             self._normalized_host_like = normalized_host_like
    194         return self._normalized_host_like
    195 
    196     def _get_fetchable_url(self):
    197         if not self.is_valid:
    198             return None
    199         if self._fetchable_url is None:
    200             # we deliberately do not work with the fragment part
    201             self._fetchable_url = "{}://{}{}{}{}".format(
    202                 self.urlsplit.scheme, 
    203                 self.urlsplit.hostname, 
    204                 "" if self.urlsplit.port is None or self.urlsplit.port == 1965 else ":{}".format(self.urlsplit.port),
    205                 "/" if self.urlsplit.path == "" else self.urlsplit.path,
    206                 "" if self.urlsplit.query == "" else "?{}".format(self.urlsplit.query))
    207         return self._fetchable_url
    208 
    209     def _get_is_root_like(self):
    210         if self._is_root_like is None:
    211             is_root_like = False
    212             if (
    213                 self.urlsplit.path == ""
    214                 or self.urlsplit.path == "/"
    215                 or ROOT_LIKE_ONLY_PATTERN.match(self.urlsplit.path)
    216             ):
    217                 is_root_like = True
    218             self._is_root_like = is_root_like
    219         return self._is_root_like
    220 
    221     def _get_is_log_root_like(self):
    222         if self._is_log_root_like is None:
    223             is_log_root_like = False
    224             if (
    225                 self.urlsplit.path == ""
    226                 or self.urlsplit.path == "/"
    227                 or LOG_ROOT_LIKE_PATTERN.match(self.urlsplit.path)
    228             ):
    229                 is_log_root_like = True
    230             self._is_log_root_like = is_log_root_like
    231         return self._is_log_root_like
    232 
    233     def _get_is_log_post_like(self):
    234         if self._is_log_post_like is None:
    235             is_log_post_like = False
    236             post_like_match = LOG_POST_LIKE_PATTERN.match(self.urlsplit.path)
    237             post_like_exclusion_match = LOG_POST_LIKE_EXCLUSION_PATTERN.match(
    238                 self.urlsplit.path
    239             )
    240             post_gemlogblue_match = LOG_POST_GEMLOGBLUE_LIKE_PATTERN.match(
    241                 self.urlsplit.path
    242             )
    243             post_boston_match = LOG_POST_BOSTON_LIKE_PATTERN.match(self.urlsplit.path)
    244 
    245             if (
    246                 (post_like_match and not post_like_exclusion_match)
    247                 or (self.normalized_host == "gemlog.blue" and post_gemlogblue_match)
    248                 or (self.normalized_host == "gemini.conman.org" and post_boston_match)
    249             ):
    250                 is_log_post_like = True
    251             self._is_log_post_like = is_log_post_like
    252         return self._is_log_post_like
    253 
    254     def get_default_change_frequency(self, category):
    255         if not self.is_valid:
    256             return None
    257         if self._default_change_frequency is None:
    258             if category == "content":
    259                 if self.is_root_like or self.is_log_root_like:
    260                     change_frequency = constants.ROOT_CHANGE_FREQUENCY_DEFAULT
    261                 else:
    262                     change_frequency = constants.NON_ROOT_CHANGE_FREQUENCY_DEFAULT
    263             elif category == "binary":
    264                 change_frequency = constants.BINARY_CHANGE_FREQUENCY_DEFAULT
    265             elif category == "redirect":
    266                 change_frequency = constants.REDIRECT_CHANGE_FREQUENCY_DEFAULT
    267             elif category == "temp_error":
    268                 change_frequency = constants.TEMP_ERROR_CHANGE_FREQUENCY_DEFAULT
    269             elif category == "perm_error":
    270                 change_frequency = constants.PERM_ERROR_CHANGE_FREQUENCY_DEFAULT
    271             elif category == "prompt":
    272                 change_frequency = constants.PROMPT_CHANGE_FREQUENCY_DEFAULT
    273             else:
    274                 raise Exception.NameError("Unrecognized resource category")
    275 
    276             self._default_change_frequency = change_frequency
    277         return self._default_change_frequency
    278 
    279     def increment_change_frequency(self, existing_change_frequency, category):
    280         if category == "content":
    281             if self.is_root_like or self.is_log_root_like:
    282                 return existing_change_frequency + constants.ROOT_CHANGE_FREQUENCY_INCREMENT
    283             else:
    284                 return existing_change_frequency + constants.NON_ROOT_CHANGE_FREQUENCY_INCREMENT
    285         elif category == "binary":
    286             return existing_change_frequency + constants.BINARY_CHANGE_FREQUENCY_INCREMENT
    287         elif category == "redirect":
    288             return existing_change_frequency + constants.REDIRECT_CHANGE_FREQUENCY_INCREMENT
    289         elif category == "temp_error":
    290             return existing_change_frequency + constants.TEMP_ERROR_CHANGE_FREQUENCY_INCREMENT
    291         elif category == "perm_error":
    292             return existing_change_frequency + constants.PERM_ERROR_CHANGE_FREQUENCY_INCREMENT
    293         elif category == "prompt":
    294             return existing_change_frequency + constants.PROMPT_CHANGE_FREQUENCY_INCREMENT
    295         else:
    296             raise Exception.NameError("Unrecognized resource category")
    297 
    298 
    299     def fetch(self):
    300         # NB: this intentionally does NOT fetch the normalized URL, because that could
    301         # cause an infinite loop with, e.g., normalization stripping a trailing slash
    302         # and a server redirecting to the same URL _with_ a trailing slash.
    303         return gusmobile.fetch(self.fetchable_url)
    304 
    305 
    306     def extract_contained_resources(self, content):
    307         # this finds all gemini URLs within the content of a given GeminiResource and
    308         # returns them as a list of new GeminiResources
    309         if self.contained_resources:
    310             return self.contained_resources
    311 
    312         link_pattern = r"^=>\s*(\S+)"
    313         preformat_pattern = r"^```.*?^```"
    314         content_without_preformat = re.sub(
    315             preformat_pattern, "", content, flags=re.DOTALL | re.MULTILINE
    316         )
    317         probable_urls = re.findall(
    318             link_pattern, content_without_preformat, re.MULTILINE
    319         )
    320         resources = []
    321         for url in probable_urls:
    322             resource = GeminiResource(
    323                 url,
    324                 fully_qualified_parent_url=self.fetchable_url,
    325                 parent_hostname=self.urlsplit.hostname.lower(),
    326             )
    327             if resource.is_valid:
    328                 resources.append(resource)
    329         self.contained_resources = resources
    330 
    331         return self.contained_resources
    332 
    333     # constructed from fetchable_url
    334     # does not matter if quoted or unquoted so I choose arbitrarily to
    335     # standardize on unquoting it.
    336     normalized_host = property(_get_normalized_host)
    337     # constructed from urlsplit, should be quoted.
    338     fetchable_url = property(_get_fetchable_url)
    339     # constructed from fetchable_url, should be unquoted.
    340     is_root_like = property(_get_is_root_like)
    341     is_log_root_like = property(_get_is_log_root_like)
    342     is_log_post_like = property(_get_is_log_post_like)
    343     # pubnix-aware host version, means that the user-specific dir is appended
    344     normalized_host_like = property(_get_normalized_host_like)
    345