commit 370e53eabbc7649f4ac6e424f46efebebed4b8f8
parent 6adbcc2b60e7d7f84540b8b7e5064c54d3042ed1
Author: Natalie Pendragon <natpen@natpen.net>
Date: Sun, 24 May 2020 23:05:00 -0400
Rename fully_qualified_url to fetchable_url
Diffstat:
2 files changed, 19 insertions(+), 19 deletions(-)
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -210,12 +210,12 @@ def crawl(gemini_resource):
gr = gemini_resource
for excluded_prefix in EXCLUDED_URL_PREFIXES:
if gr.normalized_url.startswith(excluded_prefix):
- print("MANUAL EXCLUSION SKIP : %s" % gr.fully_qualified_url)
+ print("MANUAL EXCLUSION SKIP : %s" % gr.fetchable_url)
print("--------------------------")
return
for excluded_path in EXCLUDED_URL_PATHS:
if gr.urlsplit.path.lower().endswith(excluded_path):
- print("MANUAL EXCLUSION SKIP : %s" % gr.fully_qualified_url)
+ print("MANUAL EXCLUSION SKIP : %s" % gr.fetchable_url)
print("--------------------------")
return
@@ -228,11 +228,11 @@ def crawl(gemini_resource):
crawl_delay = max(robots_file.crawl_delay("*"), crawl_delay or 0) if robots_file.crawl_delay("*") else crawl_delay
crawl_delay = max(robots_file.crawl_delay("indexer"), crawl_delay or 0) if robots_file.crawl_delay("indexer") else crawl_delay
if not can_fetch:
- print("ROBOTS SKIP : %s" % gr.fully_qualified_url)
+ print("ROBOTS SKIP : %s" % gr.fetchable_url)
print("--------------------------")
return
if gr.normalized_url in visited_urls:
- print("ALREADY SEEN : %s" % gr.fully_qualified_url)
+ print("ALREADY SEEN : %s" % gr.fetchable_url)
print("--------------------------")
return
else:
@@ -249,26 +249,26 @@ def crawl(gemini_resource):
domain_hit_timings[gr.normalized_host] = datetime.now()
# Actually fetch!
- print("Fetching {}".format(gr.fully_qualified_url))
+ print("Fetching {}".format(gr.fetchable_url))
if gr.fully_qualified_parent_url is not None:
print("With parent {}".format(gr.fully_qualified_parent_url))
response = gr.fetch()
if response is None:
# problem before getting a response
- print("ERROR : %s" % gr.fully_qualified_url)
+ print("ERROR : %s" % gr.fetchable_url)
print("--------------------------")
crawl_statistics["broken_url_count"] += 1
elif response.status.startswith("3"):
# redirect status
- print("REDIRECT : %s -> %s" % (gr.fully_qualified_url, response.url))
+ print("REDIRECT : %s -> %s" % (gr.fetchable_url, response.url))
# NB: this pop is necessary because if the redirect is a change to the URL
# structure of, essentially, the same URL (e.g., like the addition or removal
# of a trailing slash), then the crawl of the redirect would think it had
# already seen this resource in visited_urls' normalized source of truth.
visited_urls.pop()
crawl_statistics["redirect_count"] += 1
- # if is_nontrivial_redirect(gr.fully_qualified_url, r.url):
+ # if is_nontrivial_redirect(gr.fetchable_url, r.url):
# crawl_statistics["redirect_nontrivial_count"] += 1
redirect_resource = GeminiResource(response.url, gr.normalized_url, gr.normalized_host)
crawl(redirect_resource)
@@ -295,7 +295,7 @@ def crawl(gemini_resource):
print("--------------------------")
else:
# input, error, etc (all other statuses)
- print("UNHANDLED : %s" % gr.fully_qualified_url)
+ print("UNHANDLED : %s" % gr.fetchable_url)
print("--------------------------")
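The redirect handling above relies on the split the comments describe: the URL that gets fetched keeps its exact form, while visited_urls only stores normalized URLs. A minimal sketch of the trailing-slash case (illustrative only; the example URLs and the specific redirect status are assumptions, not taken from this patch):

    # gr.fetchable_url        -> "gemini://example.org/docs"    (fetched verbatim)
    # server response         -> 3x redirect to "gemini://example.org/docs/"
    # both forms normalize to -> "gemini://example.org/docs"    (trailing slash stripped)
    #
    # Without the visited_urls.pop() above, the recursive crawl() of the
    # redirect target would take the "ALREADY SEEN" branch, because its
    # normalized form is already recorded, and the resource would never
    # actually be fetched and indexed.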
diff --git a/gus/lib/gemini.py b/gus/lib/gemini.py
@@ -45,7 +45,7 @@ class GeminiResource():
self.fully_qualified_parent_url = fully_qualified_parent_url
self._normalized_url = None
self._normalized_host = None
- self._fully_qualified_url = None
+ self._fetchable_url = None
self._indexable_url = None
self.contained_resources = None
@@ -103,10 +103,10 @@ class GeminiResource():
return self._normalized_host
- def _get_fully_qualified_url(self):
+ def _get_fetchable_url(self):
if not self.is_valid:
return None
- if self._fully_qualified_url is None:
+ if self._fetchable_url is None:
if self.is_relative:
# leave off the fragment portion of urlsplit at [4]
urlsplit_parts = list(self.urlsplit[:4])
@@ -124,15 +124,15 @@ class GeminiResource():
# leave off the fragment portion of urlsplit at [4]
if self.urlsplit[4] != "":
url = url.replace("#{}".format(self.urlsplit[4]), "")
- self._fully_qualified_url = url
- return self._fully_qualified_url
+ self._fetchable_url = url
+ return self._fetchable_url
def _get_indexable_url(self):
if not self.is_valid:
return None
if self._indexable_url is None:
- indexable_url = self.fully_qualified_url
+ indexable_url = self.fetchable_url
if self.urlsplit.port == 1965:
indexable_url = self.normalized_url.replace(self.urlsplit.hostname.lower() + ":1965", self.urlsplit.hostname.lower(), 1)
self._indexable_url = indexable_url
@@ -140,18 +140,18 @@ class GeminiResource():
normalized_url = property(_get_normalized_url)
normalized_host = property(_get_normalized_host)
- fully_qualified_url = property(_get_fully_qualified_url)
+ fetchable_url = property(_get_fetchable_url)
indexable_url = property(_get_indexable_url)
def fetch(self):
# NB: this intentionally does NOT fetch the normalized URL, because that could
# cause an infinite loop with, e.g., normalization stripping a trailing slash
# and a server redirecting to the same URL _with_ a trailing slash.
- return gusmobile.fetch(self.fully_qualified_url)
+ return gusmobile.fetch(self.fetchable_url)
def _get_normalized_url_and_host(self):
- url_normalized = self.fully_qualified_url.lower().rstrip("/")
+ url_normalized = self.fetchable_url.lower().rstrip("/")
if self.urlsplit.port == 1965:
url_normalized = url_normalized.replace(self.urlsplit.hostname.lower() + ":1965", self.urlsplit.hostname.lower(), 1)
host_normalized = self.urlsplit.hostname.lower()
@@ -170,7 +170,7 @@ class GeminiResource():
for url in probable_urls:
resource = GeminiResource(
url,
- fully_qualified_parent_url=self.fully_qualified_url,
+ fully_qualified_parent_url=self.fetchable_url,
parent_hostname=self.urlsplit.hostname,
)
if resource.is_valid:
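Downstream of this patch, callers read the renamed property exactly as they read the old one. A minimal usage sketch, assuming the constructor accepts a bare URL as its only required argument (the example URL and the commented values are hypothetical, inferred from the hunks above):

    from gus.lib.gemini import GeminiResource

    resource = GeminiResource("gemini://example.org:1965/docs/")
    if resource.is_valid:
        resource.fetchable_url   # e.g. "gemini://example.org:1965/docs/", the exact form fetch() requests
        resource.normalized_url  # e.g. "gemini://example.org/docs", lowercased, default port and trailing "/" stripped
        response = resource.fetch()  # fetches fetchable_url, never normalized_url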