geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit 370e53eabbc7649f4ac6e424f46efebebed4b8f8
parent 6adbcc2b60e7d7f84540b8b7e5064c54d3042ed1
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Sun, 24 May 2020 23:05:00 -0400

Rename fully_qualified_url to fetchable_url

Diffstat:
M gus/crawl.py | 18 +++++++++---------
M gus/lib/gemini.py | 20 ++++++++++----------
2 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/gus/crawl.py b/gus/crawl.py @@ -210,12 +210,12 @@ def crawl(gemini_resource): gr = gemini_resource for excluded_prefix in EXCLUDED_URL_PREFIXES: if gr.normalized_url.startswith(excluded_prefix): - print("MANUAL EXCLUSION SKIP : %s" % gr.fully_qualified_url) + print("MANUAL EXCLUSION SKIP : %s" % gr.fetchable_url) print("--------------------------") return for excluded_path in EXCLUDED_URL_PATHS: if gr.urlsplit.path.lower().endswith(excluded_path): - print("MANUAL EXCLUSION SKIP : %s" % gr.fully_qualified_url) + print("MANUAL EXCLUSION SKIP : %s" % gr.fetchable_url) print("--------------------------") return @@ -228,11 +228,11 @@ def crawl(gemini_resource): crawl_delay = max(robots_file.crawl_delay("*"), crawl_delay or 0) if robots_file.crawl_delay("*") else crawl_delay crawl_delay = max(robots_file.crawl_delay("indexer"), crawl_delay or 0) if robots_file.crawl_delay("indexer") else crawl_delay if not can_fetch: - print("ROBOTS SKIP : %s" % gr.fully_qualified_url) + print("ROBOTS SKIP : %s" % gr.fetchable_url) print("--------------------------") return if gr.normalized_url in visited_urls: - print("ALREADY SEEN : %s" % gr.fully_qualified_url) + print("ALREADY SEEN : %s" % gr.fetchable_url) print("--------------------------") return else: @@ -249,26 +249,26 @@ def crawl(gemini_resource): domain_hit_timings[gr.normalized_host] = datetime.now() # Actually fetch! 
- print("Fetching {}".format(gr.fully_qualified_url)) + print("Fetching {}".format(gr.fetchable_url)) if gr.fully_qualified_parent_url is not None: print("With parent {}".format(gr.fully_qualified_parent_url)) response = gr.fetch() if response is None: # problem before getting a response - print("ERROR : %s" % gr.fully_qualified_url) + print("ERROR : %s" % gr.fetchable_url) print("--------------------------") crawl_statistics["broken_url_count"] += 1 elif response.status.startswith("3"): # redirect status - print("REDIRECT : %s -> %s" % (gr.fully_qualified_url, response.url)) + print("REDIRECT : %s -> %s" % (gr.fetchable_url, response.url)) # NB: this pop is necessary because if the redirect is a change to the URL # structure of, essentially, the same URL (e.g., like the addition or removal # of a trailing slash), then the crawl of the redirect would think it had # already seen this resource in visited_urls' normalized source of truth. visited_urls.pop() crawl_statistics["redirect_count"] += 1 - # if is_nontrivial_redirect(gr.fully_qualified_url, r.url): + # if is_nontrivial_redirect(gr.fetchable_url, r.url): # crawl_statistics["redirect_nontrivial_count"] += 1 redirect_resource = GeminiResource(response.url, gr.normalized_url, gr.normalized_host) crawl(redirect_resource) @@ -295,7 +295,7 @@ def crawl(gemini_resource): print("--------------------------") else: # input, error, etc (all other statuses) - print("UNHANDLED : %s" % gr.fully_qualified_url) + print("UNHANDLED : %s" % gr.fetchable_url) print("--------------------------") diff --git a/gus/lib/gemini.py b/gus/lib/gemini.py @@ -45,7 +45,7 @@ class GeminiResource(): self.fully_qualified_parent_url = fully_qualified_parent_url self._normalized_url = None self._normalized_host = None - self._fully_qualified_url = None + self._fetchable_url = None self._indexable_url = None self.contained_resources = None @@ -103,10 +103,10 @@ class GeminiResource(): return self._normalized_host - def 
_get_fully_qualified_url(self): + def _get_fetchable_url(self): if not self.is_valid: return None - if self._fully_qualified_url is None: + if self._fetchable_url is None: if self.is_relative: # leave of fragment portion of urlsplit at [4] urlsplit_parts = list(self.urlsplit[:4]) @@ -124,15 +124,15 @@ class GeminiResource(): # leave of fragment portion of urlsplit at [4] if self.urlsplit[4] != "": url = url.replace("#{}".format(self.urlsplit[4]), "") - self._fully_qualified_url = url - return self._fully_qualified_url + self._fetchable_url = url + return self._fetchable_url def _get_indexable_url(self): if not self.is_valid: return None if self._indexable_url is None: - indexable_url = self.fully_qualified_url + indexable_url = self.fetchable_url if self.urlsplit.port == 1965: indexable_url = self.normalized_url.replace(self.urlsplit.hostname.lower() + ":1965", self.urlsplit.hostname.lower(), 1) self._indexable_url = indexable_url @@ -140,18 +140,18 @@ class GeminiResource(): normalized_url = property(_get_normalized_url) normalized_host = property(_get_normalized_host) - fully_qualified_url = property(_get_fully_qualified_url) + fetchable_url = property(_get_fetchable_url) indexable_url = property(_get_indexable_url) def fetch(self): # NB: this intentionally does NOT fetch the normalized URL, because that could # cause an infinite loop with, e.g., normalization stripping a trailing slash # and a server redirecting to the same URL _with_ a trailing slash. 
- return gusmobile.fetch(self.fully_qualified_url) + return gusmobile.fetch(self.fetchable_url) def _get_normalized_url_and_host(self): - url_normalized = self.fully_qualified_url.lower().rstrip("/") + url_normalized = self.fetchable_url.lower().rstrip("/") if self.urlsplit.port == 1965: url_normalized = url_normalized.replace(self.urlsplit.hostname.lower() + ":1965", self.urlsplit.hostname.lower(), 1) host_normalized = self.urlsplit.hostname.lower() @@ -170,7 +170,7 @@ class GeminiResource(): for url in probable_urls: resource = GeminiResource( url, - fully_qualified_parent_url=self.fully_qualified_url, + fully_qualified_parent_url=self.fetchable_url, parent_hostname=self.urlsplit.hostname, ) if resource.is_valid: