geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit f10f1fc9a0b9a8171fa69a0d206d1b2acd1f3518
parent 484ef909792e8227a33fd54a648cbe36147b6627
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Mon, 18 May 2020 15:52:48 -0400

[crawl] Fix bug with computing full_qualified_urls

Diffstat:
M gus/crawl.py | 10 ++++++++--
M gus/lib/gemini.py | 27 ++++++++++++++++++++-------
M gus/lib/index_statistics.py | 2 +-
3 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/gus/crawl.py b/gus/crawl.py
@@ -30,7 +30,7 @@ INDEX_DIR = "index"
 INDEX_STORAGE = FileStorage(INDEX_DIR)
 SEED_URLS = [
-    # english
+    # English
     "gemini://80h.dev",
     "gemini://berserk.red",
     "gemini://bleyble.com",
@@ -72,19 +72,25 @@ SEED_URLS = [
     "gemini://yam655.com",
     "gemini://zaibatsu.circumlunar.space",
-    # spanish
+    # Spanish
     "gemini://gagarin.p4g.club",
 ]
+# These are checked against normalized_url, so they should be
+# prepended with the gemini:// protocol, be all lowercased, and
+# not have the port specified if it is 1965.
 EXCLUDED_URL_PREFIXES = [
     "gemini://example.org",
     "gemini://example.com",
     "gemini://gemini.conman.org/test",
     "gemini://gemini.circumlunar.space/users/fgaz/calculator/",
+    # Internal
     "gemini://gus.guru/search/",
     "gemini://gus.guru/v/search/",
     "gemini://gus.guru/search?",
     "gemini://gus.guru/v/search?",
+    "gemini://gus.guru/add-seed?",
 ]
diff --git a/gus/lib/gemini.py b/gus/lib/gemini.py
@@ -22,10 +22,11 @@ def urlsplit_featureful(url, parent_resource=None):
     # This is important because I want to be able to use the host for a number of
     # things behind the scenes.
+    is_relative = False
     url = url.strip().rstrip("/")
     u = urlsplit(url, 'gemini')
     if u.scheme != "gemini":
-        return None
+        return None, None
     if u.hostname is None:
         if url.startswith("/"):
             # process relative link
@@ -33,6 +34,7 @@ def urlsplit_featureful(url, parent_resource=None):
                 return None
             joined = urljoin("gemini://{}".format(parent_resource.normalized_host), url)
             u = urlsplit(joined, 'gemini')
+            is_relative = True
         else: # url does not start with /
             # could be: blah.com/test
             # could be: test
@@ -48,14 +50,15 @@ def urlsplit_featureful(url, parent_resource=None):
                     return None
                 joined = urljoin(parent_resource.normalized_host, url)
                 u = urlsplit(joined, 'gemini')
-    return u
+                is_relative = True
+    return u, is_relative
 class GeminiResource():
     def __init__(self, url, parent_resource=None):
         self.raw_url = url
         self.parent_resource = parent_resource
-        self.urlsplit = urlsplit_featureful(url, self.parent_resource)
+        self.urlsplit, self.is_relative = urlsplit_featureful(url, self.parent_resource)
         self.is_valid = self.urlsplit is not None
         self._normalized_url = None
         self._normalized_host = None
@@ -64,7 +67,7 @@ class GeminiResource():
     def _get_normalized_url(self):
-        if self.urlsplit is None:
+        if not self.is_valid:
             return None
         if self._normalized_url is None:
             self._normalized_url, self._normalized_host = self._get_normalized_url_and_host()
@@ -72,7 +75,7 @@ class GeminiResource():
     def _get_normalized_host(self):
-        if self.urlsplit is None:
+        if not self.is_valid:
             return None
         if self._normalized_host is None:
             self._normalized_url, self._normalized_host = self._get_normalized_url_and_host()
@@ -80,10 +83,20 @@ class GeminiResource():
     def _get_fully_qualified_url(self):
-        if self.urlsplit is None:
+        if not self.is_valid:
             return None
         if self._fully_qualified_url is None:
-            self._fully_qualified_url = self.raw_url if not self.raw_url.startswith("/") else self.normalized_url
+            if self.is_relative:
+                url = self.raw_url if not self.is_relative else self.normalized_url
+            else:
+                raw_url_lower = self.raw_url.lower()
+                if raw_url_lower.startswith("gemini://"):
+                    url = self.raw_url
+                elif raw_url_lower.startswith("//"):
+                    url = "gemini{}".format(self.raw_url)
+                else:
+                    url = "gemini://{}".format(self.raw_url)
+            self._fully_qualified_url = url
         return self._fully_qualified_url
diff --git a/gus/lib/index_statistics.py b/gus/lib/index_statistics.py
@@ -51,7 +51,7 @@ def compute_index_statistics(index_dir):
     }
-def print_index_statistics(index_statistics):
+def print_index_statistics(index_statistics, crawl_statistics):
     print("Index generation date : {:%Y-%m-%d}".format(index_statistics["index_modification_time"]))
     print("Page Count : {:>6}".format(index_statistics["page_count"]))
     print("Domain Count : {:>6}".format(index_statistics["domain_count"]))