commit 370e53eabbc7649f4ac6e424f46efebebed4b8f8
parent 6adbcc2b60e7d7f84540b8b7e5064c54d3042ed1
Author: Natalie Pendragon <natpen@natpen.net>
Date: Sun, 24 May 2020 23:05:00 -0400
Rename fully_qualified_url to fetchable_url
Diffstat:
2 files changed, 19 insertions(+), 19 deletions(-)
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -210,12 +210,12 @@ def crawl(gemini_resource):
gr = gemini_resource
for excluded_prefix in EXCLUDED_URL_PREFIXES:
if gr.normalized_url.startswith(excluded_prefix):
- print("MANUAL EXCLUSION SKIP : %s" % gr.fully_qualified_url)
+ print("MANUAL EXCLUSION SKIP : %s" % gr.fetchable_url)
print("--------------------------")
return
for excluded_path in EXCLUDED_URL_PATHS:
if gr.urlsplit.path.lower().endswith(excluded_path):
- print("MANUAL EXCLUSION SKIP : %s" % gr.fully_qualified_url)
+ print("MANUAL EXCLUSION SKIP : %s" % gr.fetchable_url)
print("--------------------------")
return
@@ -228,11 +228,11 @@ def crawl(gemini_resource):
crawl_delay = max(robots_file.crawl_delay("*"), crawl_delay or 0) if robots_file.crawl_delay("*") else crawl_delay
crawl_delay = max(robots_file.crawl_delay("indexer"), crawl_delay or 0) if robots_file.crawl_delay("indexer") else crawl_delay
if not can_fetch:
- print("ROBOTS SKIP : %s" % gr.fully_qualified_url)
+ print("ROBOTS SKIP : %s" % gr.fetchable_url)
print("--------------------------")
return
if gr.normalized_url in visited_urls:
- print("ALREADY SEEN : %s" % gr.fully_qualified_url)
+ print("ALREADY SEEN : %s" % gr.fetchable_url)
print("--------------------------")
return
else:
@@ -249,26 +249,26 @@ def crawl(gemini_resource):
domain_hit_timings[gr.normalized_host] = datetime.now()
# Actually fetch!
- print("Fetching {}".format(gr.fully_qualified_url))
+ print("Fetching {}".format(gr.fetchable_url))
if gr.fully_qualified_parent_url is not None:
print("With parent {}".format(gr.fully_qualified_parent_url))
response = gr.fetch()
if response is None:
# problem before getting a response
- print("ERROR : %s" % gr.fully_qualified_url)
+ print("ERROR : %s" % gr.fetchable_url)
print("--------------------------")
crawl_statistics["broken_url_count"] += 1
elif response.status.startswith("3"):
# redirect status
- print("REDIRECT : %s -> %s" % (gr.fully_qualified_url, response.url))
+ print("REDIRECT : %s -> %s" % (gr.fetchable_url, response.url))
# NB: this pop is necessary because if the redirect is a change to the URL
# structure of, essentially, the same URL (e.g., like the addition or removal
# of a trailing slash), then the crawl of the redirect would think it had
# already seen this resource in visited_urls' normalized source of truth.
visited_urls.pop()
crawl_statistics["redirect_count"] += 1
- # if is_nontrivial_redirect(gr.fully_qualified_url, r.url):
+ # if is_nontrivial_redirect(gr.fetchable_url, r.url):
# crawl_statistics["redirect_nontrivial_count"] += 1
redirect_resource = GeminiResource(response.url, gr.normalized_url, gr.normalized_host)
crawl(redirect_resource)
@@ -295,7 +295,7 @@ def crawl(gemini_resource):
print("--------------------------")
else:
# input, error, etc (all other statuses)
- print("UNHANDLED : %s" % gr.fully_qualified_url)
+ print("UNHANDLED : %s" % gr.fetchable_url)
print("--------------------------")
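The redirect handling above relies on the split the comments describe: the URL that gets fetched keeps its exact form, while visited_urls only stores normalized URLs. A minimal sketch of the trailing-slash case (illustrative only; the example URLs and the specific redirect status are assumptions, not taken from this patch):

    # gr.fetchable_url        -> "gemini://example.org/docs"    (fetched verbatim)
    # server response         -> 3x redirect to "gemini://example.org/docs/"
    # both forms normalize to -> "gemini://example.org/docs"    (trailing slash stripped)
    #
    # Without the visited_urls.pop() above, the recursive crawl() of the
    # redirect target would take the "ALREADY SEEN" branch, because its
    # normalized form is already recorded, and the resource would never
    # actually be fetched and indexed.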
diff --git a/gus/lib/gemini.py b/gus/lib/gemini.py
@@ -45,7 +45,7 @@ class GeminiResource():
self.fully_qualified_parent_url = fully_qualified_parent_url
self._normalized_url = None
self._normalized_host = None
- self._fully_qualified_url = None
+ self._fetchable_url = None
self._indexable_url = None
self.contained_resources = None
@@ -103,10 +103,10 @@ class GeminiResource():
return self._normalized_host
- def _get_fully_qualified_url(self):
+ def _get_fetchable_url(self):
if not self.is_valid:
return None
- if self._fully_qualified_url is None:
+ if self._fetchable_url is None:
if self.is_relative:
# leave off the fragment portion of urlsplit at [4]
urlsplit_parts = list(self.urlsplit[:4])
@@ -124,15 +124,15 @@ class GeminiResource():
# leave off the fragment portion of urlsplit at [4]
if self.urlsplit[4] != "":
url = url.replace("#{}".format(self.urlsplit[4]), "")
- self._fully_qualified_url = url
- return self._fully_qualified_url
+ self._fetchable_url = url
+ return self._fetchable_url
def _get_indexable_url(self):
if not self.is_valid:
return None
if self._indexable_url is None:
- indexable_url = self.fully_qualified_url
+ indexable_url = self.fetchable_url
if self.urlsplit.port == 1965:
indexable_url = self.normalized_url.replace(self.urlsplit.hostname.lower() + ":1965", self.urlsplit.hostname.lower(), 1)
self._indexable_url = indexable_url
@@ -140,18 +140,18 @@ class GeminiResource():
normalized_url = property(_get_normalized_url)
normalized_host = property(_get_normalized_host)
- fully_qualified_url = property(_get_fully_qualified_url)
+ fetchable_url = property(_get_fetchable_url)
indexable_url = property(_get_indexable_url)
def fetch(self):
# NB: this intentionally does NOT fetch the normalized URL, because that could
# cause an infinite loop with, e.g., normalization stripping a trailing slash
# and a server redirecting to the same URL _with_ a trailing slash.
- return gusmobile.fetch(self.fully_qualified_url)
+ return gusmobile.fetch(self.fetchable_url)
def _get_normalized_url_and_host(self):
- url_normalized = self.fully_qualified_url.lower().rstrip("/")
+ url_normalized = self.fetchable_url.lower().rstrip("/")
if self.urlsplit.port == 1965:
url_normalized = url_normalized.replace(self.urlsplit.hostname.lower() + ":1965", self.urlsplit.hostname.lower(), 1)
host_normalized = self.urlsplit.hostname.lower()
@@ -170,7 +170,7 @@ class GeminiResource():
for url in probable_urls:
resource = GeminiResource(
url,
- fully_qualified_parent_url=self.fully_qualified_url,
+ fully_qualified_parent_url=self.fetchable_url,
parent_hostname=self.urlsplit.hostname,
)
if resource.is_valid:
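Downstream of this patch, callers read the renamed property exactly as they read the old one. A minimal usage sketch, assuming the constructor accepts a bare URL as its only required argument (the example URL and the commented values are hypothetical, inferred from the hunks above):

    from gus.lib.gemini import GeminiResource

    resource = GeminiResource("gemini://example.org:1965/docs/")
    if resource.is_valid:
        resource.fetchable_url   # e.g. "gemini://example.org:1965/docs/", the exact form fetch() requests
        resource.normalized_url  # e.g. "gemini://example.org/docs", lowercased, default port and trailing "/" stripped
        response = resource.fetch()  # fetches fetchable_url, never normalized_url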