commit f10f1fc9a0b9a8171fa69a0d206d1b2acd1f3518
parent 484ef909792e8227a33fd54a648cbe36147b6627
Author: Natalie Pendragon <natpen@natpen.net>
Date: Mon, 18 May 2020 15:52:48 -0400
[crawl] Fix bug with computing fully_qualified_urls
Diffstat:
3 files changed, 29 insertions(+), 10 deletions(-)
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -30,7 +30,7 @@ INDEX_DIR = "index"
INDEX_STORAGE = FileStorage(INDEX_DIR)
SEED_URLS = [
- # english
+ # English
"gemini://80h.dev",
"gemini://berserk.red",
"gemini://bleyble.com",
@@ -72,19 +72,25 @@ SEED_URLS = [
"gemini://yam655.com",
"gemini://zaibatsu.circumlunar.space",
- # spanish
+ # Spanish
"gemini://gagarin.p4g.club",
]
+# These are checked against normalized_url, so each entry should
+# start with the gemini:// scheme, be entirely lowercase, and
+# omit the port if it is 1965.
EXCLUDED_URL_PREFIXES = [
"gemini://example.org",
"gemini://example.com",
"gemini://gemini.conman.org/test",
"gemini://gemini.circumlunar.space/users/fgaz/calculator/",
+
+ # Internal
"gemini://gus.guru/search/",
"gemini://gus.guru/v/search/",
"gemini://gus.guru/search?",
"gemini://gus.guru/v/search?",
+ "gemini://gus.guru/add-seed?",
]
diff --git a/gus/lib/gemini.py b/gus/lib/gemini.py
@@ -22,10 +22,11 @@ def urlsplit_featureful(url, parent_resource=None):
# This is important because I want to be able to use the host for a number of
# things behind the scenes.
+ is_relative = False
url = url.strip().rstrip("/")
u = urlsplit(url, 'gemini')
if u.scheme != "gemini":
- return None
+ return None, None
if u.hostname is None:
if url.startswith("/"):
# process relative link
@@ -33,6 +34,7 @@ def urlsplit_featureful(url, parent_resource=None):
return None
joined = urljoin("gemini://{}".format(parent_resource.normalized_host), url)
u = urlsplit(joined, 'gemini')
+ is_relative = True
else: # url does not start with /
# could be: blah.com/test
# could be: test
@@ -48,14 +50,15 @@ def urlsplit_featureful(url, parent_resource=None):
return None
joined = urljoin(parent_resource.normalized_host, url)
u = urlsplit(joined, 'gemini')
- return u
+ is_relative = True
+ return u, is_relative
class GeminiResource():
def __init__(self, url, parent_resource=None):
self.raw_url = url
self.parent_resource = parent_resource
- self.urlsplit = urlsplit_featureful(url, self.parent_resource)
+ self.urlsplit, self.is_relative = urlsplit_featureful(url, self.parent_resource)
self.is_valid = self.urlsplit is not None
self._normalized_url = None
self._normalized_host = None
@@ -64,7 +67,7 @@ class GeminiResource():
def _get_normalized_url(self):
- if self.urlsplit is None:
+ if not self.is_valid:
return None
if self._normalized_url is None:
self._normalized_url, self._normalized_host = self._get_normalized_url_and_host()
@@ -72,7 +75,7 @@ class GeminiResource():
def _get_normalized_host(self):
- if self.urlsplit is None:
+ if not self.is_valid:
return None
if self._normalized_host is None:
self._normalized_url, self._normalized_host = self._get_normalized_url_and_host()
@@ -80,10 +83,20 @@ class GeminiResource():
def _get_fully_qualified_url(self):
- if self.urlsplit is None:
+ if not self.is_valid:
return None
if self._fully_qualified_url is None:
- self._fully_qualified_url = self.raw_url if not self.raw_url.startswith("/") else self.normalized_url
+ if self.is_relative:
+ url = self.raw_url if not self.is_relative else self.normalized_url
+ else:
+ raw_url_lower = self.raw_url.lower()
+ if raw_url_lower.startswith("gemini://"):
+ url = self.raw_url
+ elif raw_url_lower.startswith("//"):
+ url = "gemini{}".format(self.raw_url)
+ else:
+ url = "gemini://{}".format(self.raw_url)
+ self._fully_qualified_url = url
return self._fully_qualified_url
diff --git a/gus/lib/index_statistics.py b/gus/lib/index_statistics.py
@@ -51,7 +51,7 @@ def compute_index_statistics(index_dir):
}
-def print_index_statistics(index_statistics):
+def print_index_statistics(index_statistics, crawl_statistics):
print("Index generation date : {:%Y-%m-%d}".format(index_statistics["index_modification_time"]))
print("Page Count : {:>6}".format(index_statistics["page_count"]))
print("Domain Count : {:>6}".format(index_statistics["domain_count"]))