[crawl] Implement GeminiResource - geminispace.info

commit 53ce6aa50539d49837f9a9ef3f3ed9f9e50dfd2c
parent 4b123933cff8b0fe8203838c87451654439d2924
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Sun, 17 May 2020 10:20:11 -0400

[crawl] Implement GeminiResource

This commit should actually be somewhat close to a no-op, but it brings a
substantial refactoring of the code, consolidating both the functionality
related to Gemini URLs and the source of truth for crawler information
about them (including relevant metadata) into a new class called
`GeminiResource`.

Diffstat:
M gus/crawl.py  | 104 +++++++++++++++++++++++++++++++++----------------------------------------------
A gus/lib/gemini.py  | 120 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M gus/lib/index_statistics.py  | 6 +++---
D gus/lib/url_helpers.py  | 21 ---------------------
M gus/serve.py  | 2 +-

5 files changed, 167 insertions(+), 86 deletions(-)
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -19,7 +19,12 @@ from whoosh.qparser import QueryParser
 
 from gus.lib.index_statistics import compute_index_statistics, persist_statistics
 from gus.lib.whoosh_extensions import UrlAnalyzer
-from gus.lib.url_helpers import normalize_gemini_url
+from gus.lib.gemini import GeminiResource
+
+# hack(natpen): the built-in methods in urllib need to know the
+# Gemini protocol exists
+uses_relative.append("gemini")
+uses_netloc.append("gemini")
 
 INDEX_DIR = "index"
 INDEX_STORAGE = FileStorage(INDEX_DIR)
@@ -120,32 +125,6 @@ def create_index(index_dir):
     INDEX_STORAGE.create_index(schema)
 
 
-def clean_links(links, current_url):
-    clean_links = []
-    for link in links:
-        clean_link = link
-        u = urlparse(link)
-        if u.scheme != '' and u.scheme != "gemini":
-            continue
-        if u.netloc == '':
-            # relative link
-            clean_link = urljoin(current_url, clean_link)
-            u = urlparse(clean_link)
-        if u.port == 1965:
-            clean_link = clean_link.replace(u.hostname+":1965", u.hostname, 1)
-        if u.scheme is None:
-            clean_link = clean_link.replace(u.hostname, "gemini://"+u.hostname, 1)
-        clean_links.append(clean_link)
-    return clean_links
-
-
-def extract_gemini_links(content, current_url):
-    link_pattern = "^=>\s(\S+)"
-    links = re.findall(link_pattern, content, re.MULTILINE)
-    gemini_links = clean_links(links, current_url)
-    return gemini_links
-
-
 def index_binary(response):
     print("INDEXING BINARY...")
     index_writer = INDEX_STORAGE.open_index().writer()
@@ -192,8 +171,8 @@ def index_content(response):
 
 def get_robots_file(robot_host):
     if robot_host not in robot_file_map:
-        print(f"Requesting robots.txt for {robot_host}")
-        robot_url = robot_host + "/robots.txt"
+        print("Requesting robots.txt for {}".format(robot_host))
+        robot_url = urljoin("gemini://{}".format(robot_host), "/robots.txt")
         r = gemini.fetch(robot_url)
         if r is not None and r.status.startswith("2"):
             robot_file_parser = robotparser.RobotFileParser()
@@ -204,65 +183,67 @@ def get_robots_file(robot_host):
     return robot_file_map[robot_host]
 
 
-def crawl_url(url):
-    u = urlparse(url, 'gemini')
-    url = urlunparse(u)
-    path = u.path.lower().rstrip().rstrip('/')
-    normalized_url, normalized_host = normalize_gemini_url(url)
-    if normalized_url is None:
-        print("MANUAL EXCLUSION SKIP  : %s" % url)
-        print("--------------------------")
+def crawl(gemini_resource):
+    if not gemini_resource.is_valid:
         return
+
+    gr = gemini_resource
     for url_prefix in EXCLUDED_URL_PREFIXES:
-        if normalized_url.startswith(url_prefix):
-            print("MANUAL EXCLUSION SKIP  : %s" % url)
+        if gr.normalized_url.startswith(url_prefix):
+            print("MANUAL EXCLUSION SKIP  : %s" % gr.raw_url)
             print("--------------------------")
             return
 
     # ROBOTS
-    robots_file = get_robots_file(normalized_host)
+    robots_file = get_robots_file(gr.normalized_host)
     crawl_delay = None
     if robots_file is not None:
-        can_fetch = robots_file.can_fetch("gus", normalized_url) and robots_file.can_fetch("*", normalized_url) and robots_file.can_fetch("indexer", normalized_url)
+        can_fetch = robots_file.can_fetch("gus", gr.normalized_url) and robots_file.can_fetch("*", gr.normalized_url) and robots_file.can_fetch("indexer", gr.normalized_url)
         crawl_delay = robots_file.crawl_delay("gus")
         crawl_delay = max(robots_file.crawl_delay("*"), crawl_delay or 0) if robots_file.crawl_delay("*") else crawl_delay
         crawl_delay = max(robots_file.crawl_delay("indexer"), crawl_delay or 0) if robots_file.crawl_delay("indexer") else crawl_delay
         if not can_fetch:
-            print("ROBOTS SKIP  : %s" % url)
+            print("ROBOTS SKIP  : %s" % gr.raw_url)
             print("--------------------------")
             return
-    if normalized_url in visited_urls:
-        print("ALREADY SEEN : %s" % url)
+    if gr.normalized_url in visited_urls:
+        print("ALREADY SEEN : %s" % gr.raw_url)
         print("--------------------------")
         return
     else:
-        visited_urls.append(normalized_url)
+        visited_urls.append(gr.normalized_url)
 
     # Crawl delay
-    if normalized_host in domain_hit_timings:
+    if gr.normalized_host in domain_hit_timings:
         if crawl_delay is None:
-            next_allowed_hit = domain_hit_timings[normalized_host] + timedelta(milliseconds=500)
+            next_allowed_hit = domain_hit_timings[gr.normalized_host] + timedelta(milliseconds=500)
         else:
-            next_allowed_hit = domain_hit_timings[normalized_host] + timedelta(milliseconds=crawl_delay)
+            next_allowed_hit = domain_hit_timings[gr.normalized_host] + timedelta(milliseconds=crawl_delay)
         sleep_duration = max((next_allowed_hit - datetime.now()).total_seconds(), 0)
         time.sleep(sleep_duration)
-    domain_hit_timings[normalized_host] = datetime.now()
+    domain_hit_timings[gr.normalized_host] = datetime.now()
 
     # Actually fetch!
-    r = gemini.fetch(url)
+    r = gr.fetch()
+
     if r is None:
         # problem before getting a response
-        print("ERROR        : %s" % url)
+        print("ERROR        : %s" % gr.raw_url)
         print("--------------------------")
         crawl_statistics["broken_url_count"] += 1
     elif r.status.startswith("3"):
         # redirect status
-        print("REDIRECT     : %s -> %s" % (url, r.url))
+        print("REDIRECT     : %s -> %s" % (gr.raw_url, r.url))
+        # NB: this pop is necessary because if the redirect is a change to the URL
+        # structure of, essentially, the same URL (e.g., like the addition or removal
+        # of a trailing slash), then the crawl of the redirect would think it had
+        # already seen this resource in visited_urls' normalized source of truth.
         visited_urls.pop()
         crawl_statistics["redirect_count"] += 1
-        if is_nontrivial_redirect(url, r.url):
-            crawl_statistics["redirect_nontrivial_count"] += 1
-        crawl_url(r.url)
+        # if is_nontrivial_redirect(gr.raw_url, r.url):
+            # crawl_statistics["redirect_nontrivial_count"] += 1
+        redirect_resource = GeminiResource(r.url)
+        crawl(redirect_resource)
     elif r.status.startswith("1"):
         # input status
         print("URL          : %s" % r.url)
@@ -278,15 +259,15 @@ def crawl_url(url):
         if r.content_type.startswith("text/"):
             index_content(r)
             print("--------------------------")
-            gemini_links = extract_gemini_links(r.content, r.url)
-            for link in gemini_links:
-                crawl_url(link)
+            contained_resources = gr.extract_contained_resources()
+            for resource in contained_resources:
+                crawl(resource)
         else:
             index_binary(r)
             print("--------------------------")
     else:
         # input, error, etc (all other statuses)
-        print("UNHANDLED    : %s" % url)
+        print("UNHANDLED    : %s" % gr.raw_url)
         print("--------------------------")
 
 
@@ -321,8 +302,9 @@ def main():
         "redirect_nontrivial_count": 0,
         "broken_url_count": 0,
     }
-    for url in SEED_URLS:
-        crawl_url(url)
+    seed_resources = [GeminiResource(url) for url in SEED_URLS]
+    for resource in seed_resources:
+        crawl(resource)
 
     persist_visited_urls(visited_urls)
 
diff --git a/gus/lib/gemini.py b/gus/lib/gemini.py
@@ -0,0 +1,120 @@
+import re
+from urllib.parse import unquote, urljoin, urlparse, urlsplit, urlunparse, urlunsplit, uses_relative, uses_netloc
+
+import gusmobile
+
+# hack(natpen): the built-in methods in urllib need to know the
+# Gemini protocol exists
+uses_relative.append("gemini")
+uses_netloc.append("gemini")
+
+
+def is_domain(possible_domain):
+    domain_pattern = "^((?=[a-z0-9-]{1,63}\.)(xn--)?[a-z0-9]+(-[a-z0-9]+)*\.)+[a-z]{2,63}$"
+    domain_match = re.match(domain_pattern, possible_domain, re.I)
+    return domain_match is not None
+
+
+def urlsplit_featureful(url, parent_resource=None):
+    # the point of this relatively complex function is to allow for protocol-less,
+    # double-slash-prepended-less URLs that still get treated as absolute (i.e.,
+    # non-relative) URLs and thus get their hosts parsed correctly by `urlsplit`.
+    # This is important because I want to be able to use the host for a number of
+    # things behind the scenes.
+
+    url = url.strip().rstrip("/")
+    u = urlsplit(url, 'gemini')
+    if u.scheme != "gemini":
+        return None
+    if u.hostname is None:
+        if url.startswith("/"):
+            # process relative link
+            if parent_resource is None:
+                return None
+            joined = urljoin(parent_resource.normalized_host, url)
+            u = urlsplit(joined, 'gemini')
+        else: # url does not start with /
+            # could be: blah.com/test
+            # could be: test
+            url_split = url.split("/")
+            if is_domain(url_split[0]):
+                # prepend with "gemini://" so built-in urlsplit will extract
+                # the host properly, and continue on
+                url = "gemini://{}".format(url)
+                u = urlsplit(url, 'gemini')
+            else:
+                # process relative link
+                if parent_resource is None:
+                    return None
+                joined = urljoin(parent_resource.normalized_host, url)
+                u = urlsplit(joined, 'gemini')
+    return u
+
+
+class GeminiResource():
+    def __init__(self, url, parent_resource=None):
+        self.raw_url = url
+        self.parent_resource = parent_resource
+        self.urlsplit = urlsplit_featureful(url, self.parent_resource)
+        self.is_valid = self.urlsplit is not None
+        self._normalized_url = None
+        self._normalized_host = None
+        self.contained_resources = None
+
+
+    def _get_normalized_url(self):
+        if self.urlsplit is None:
+            return None
+        if self._normalized_url is None:
+            self._normalized_url, self._normalized_host = self._get_normalized_url_and_host()
+        return self._normalized_url
+
+
+    def _get_normalized_host(self):
+        if self.urlsplit is None:
+            return None
+        if self._normalized_host is None:
+            self._normalized_url, self._normalized_host = self._get_normalized_url_and_host()
+        return self._normalized_host
+
+
+    normalized_url = property(_get_normalized_url)
+    normalized_host = property(_get_normalized_host)
+
+    def fetch(self):
+        # NB: this intentionally does NOT fetch the normalized URL, because that could
+        # cause an infinite loop with, e.g., normalization stripping a trailing slash
+        # and a server redirecting to the same URL _with_ a trailing slash.
+        response = gusmobile.fetch(self.raw_url)
+        self.response = response
+        return self.response
+
+
+    def _get_normalized_url_and_host(self):
+        url_normalized = urlunsplit(self.urlsplit)
+        if "%" in url_normalized:
+            url_normalized = unquote(url_normalized)
+        if self.urlsplit.port == 1965:
+            url_normalized = url_normalized.replace(u.hostname+":1965", u.hostname, 1)
+        host_normalized = self.urlsplit.hostname.lower()
+        return url_normalized, host_normalized
+
+
+    def extract_contained_resources(self):
+        # this finds all gemini URLs within the content of a given GeminiResource and
+        # returns them as a list of new, unfetched GeminiResources
+        if not self.response:
+            pass
+        if self.contained_resources:
+            return self.contained_resources
+
+        link_pattern = "^=>\s*(\S+)"
+        probable_urls = re.findall(link_pattern, self.response.content, re.MULTILINE)
+        resources = []
+        for url in probable_urls:
+            resource = GeminiResource(url, parent_resource=self)
+            if resource.is_valid:
+                resources.append(resource)
+        self.contained_resources = resources
+
+        return self.contained_resources
diff --git a/gus/lib/index_statistics.py b/gus/lib/index_statistics.py
@@ -5,7 +5,7 @@ from whoosh.index import open_dir
 from whoosh.qparser import QueryParser
 from whoosh.query import Every
 
-from gus.lib.url_helpers import normalize_gemini_url
+from gus.lib.gemini import GeminiResource
 
 
 def compute_index_statistics(index_dir):
@@ -36,8 +36,8 @@ def compute_index_statistics(index_dir):
         results = searcher.search(query, limit=9999999)
         domains = set()
         for result in results:
-            _, domain = normalize_gemini_url(result["url"])
-            domains.add(domain)
+            gr = GeminiResource(result["url"])
+            domains.add(gr.normalized_host)
         domain_count = len(domains)
 
     # index modification time
diff --git a/gus/lib/url_helpers.py b/gus/lib/url_helpers.py
@@ -1,21 +0,0 @@
-from urllib.parse import unquote, urljoin, urlparse, urlunparse, uses_relative, uses_netloc
-
-# hack(natpen): the built-in methods in urllib need to know the
-# Gemini protocol exists
-uses_relative.append("gemini")
-uses_netloc.append("gemini")
-
-
-def normalize_gemini_url(url):
-    if "%" in url:
-        url = unquote(url)
-    u = urlparse(url.lower().strip().rstrip('/'), 'gemini')
-    if u.hostname is None:
-        return None, None
-    url_normalized = urlunparse(u)
-    if u.port == 1965:
-        url_normalized = url_normalized.replace(u.hostname+":1965", u.hostname, 1)
-    if u.scheme is None:
-        url_normalized = url_normalized.replace(u.hostname, "gemini://"+u.hostname, 1)
-    host_normalized = 'gemini://' + u.hostname
-    return url_normalized, host_normalized
diff --git a/gus/serve.py b/gus/serve.py
@@ -70,7 +70,7 @@ def _render_known_hosts():
         "",
     ]
     for domain in last_statistics["domains"]:
-        d.append("=> {}".format(domain))
+        d.append("=> gemini://{} {}".format(domain, domain))
     return d

	geminispace.info gemini search engine
	git clone https://git.clttr.info/geminispace.info.git
	Log (Feed) | Files | Refs (Tags) | README | LICENSE

M	gus/crawl.py	|	104	+++++++++++++++++++++++++++++++++----------------------------------------------
A	gus/lib/gemini.py	|	120	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	gus/lib/index_statistics.py	|	6	+++---
D	gus/lib/url_helpers.py	|	21	---------------------
M	gus/serve.py	|	2	+-