geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit cfb30cb4beeba71b2da0033439a4e88ace130561
parent 54803f61105bc2da0251fcb4006b25bc9b75c9dd
Author: René Wagner <rwa@clttr.info>
Date:   Sun, 30 Jul 2023 12:07:00 +0200

rework fetchable_url generation

Diffstat:
M .gitignore | 13 +------------
M gus/lib/gemini.py | 95 ++++++++-----------------------------------------------------------------------
M tests/gus/lib/test_gemini.py | 34 ++++++++++++++++++----------------
3 files changed, 28 insertions(+), 114 deletions(-)

diff --git a/.gitignore b/.gitignore @@ -138,17 +138,6 @@ dmypy.json /*.crt /*.key -.bash_history -.bashrc -.config/ -.gitconfig -.local/ -.poetry/ poetry.lock -.sqlite_history -.ssh/ -.vim/ -.viminfo -.vimrc -.profile crawl.lock +*.swp diff --git a/gus/lib/gemini.py b/gus/lib/gemini.py @@ -131,7 +131,6 @@ class GeminiResource: ) self.is_valid = self.urlsplit is not None self.fully_qualified_parent_url = fully_qualified_parent_url - self._normalized_url = None self._normalized_host = None self._normalized_host_like = None self._fetchable_url = None @@ -177,22 +176,11 @@ class GeminiResource: is_relative = True return u, is_relative - def _get_normalized_url(self): - if not self.is_valid: - return None - if self._normalized_url is None: - url_normalized = unquote(self.fetchable_url) - self._normalized_url = url_normalized.replace( - self.urlsplit.hostname + ":1965", - self.urlsplit.hostname.lower(), - 1) - return self._normalized_url - def _get_normalized_host(self): if not self.is_valid: return None if self._normalized_host is None: - self._normalized_host = self.urlsplit.hostname.lower() + self._normalized_host = self.urlsplit.hostname return self._normalized_host def _get_normalized_host_like(self): @@ -210,29 +198,13 @@ class GeminiResource: if not self.is_valid: return None if self._fetchable_url is None: - if self.is_relative: - # leave off fragment portion of urlsplit at [4] - urlsplit_parts = list(self.urlsplit[:4]) - urlsplit_parts.append("") - - url = urlunsplit(urlsplit_parts) - else: - raw_url_lower = self.raw_url.lower() - if raw_url_lower.startswith("gemini://"): - url = self.raw_url - elif raw_url_lower.startswith("//"): - url = "gemini:{}".format(self.raw_url) - else: - url = "gemini://{}".format(self.raw_url) - # leave off fragment portion of urlsplit at [4] - if self.urlsplit[2] == "": - url = url + "/" - if self.urlsplit[4] != "": - url = url.replace("#{}".format(self.urlsplit[4]), "") - - self._fetchable_url = url.replace( - 
self.urlsplit.hostname + ":1965", - self.urlsplit.hostname.lower(), 1) + # we deliberately do not work with the fragment part + self._fetchable_url = "{}://{}{}{}{}".format( + self.urlsplit.scheme, + self.urlsplit.hostname, + "" if self.urlsplit.port is None or self.urlsplit.port == 1965 else ":{}".format(self.urlsplit.port), + "/" if self.urlsplit.path == "" else self.urlsplit.path, + "" if self.urlsplit.query == "" else "?{}".format(self.urlsplit.query)) return self._fetchable_url def _get_is_root_like(self): @@ -280,54 +252,6 @@ class GeminiResource: self._is_log_post_like = is_log_post_like return self._is_log_post_like - def get_friendly_author(self, content): - if not self.is_valid: - return None - friendly_author = None - author_url_match = AUTHOR_URL_PATTERN.match(self.urlsplit.path) - if author_url_match: - # first check url - if author_url_match[1]: - friendly_author = author_url_match[1] - elif author_url_match[2]: - friendly_author = author_url_match[2] - if friendly_author is None: - # if no URL match, try looking in page content - if isinstance(content, str): - author_content_match = AUTHOR_CONTENT_PATTERN.match(content) - if author_content_match: - friendly_author = author_content_match[1] - if friendly_author is None: - # if still no match, use normalized host - friendly_author = self.normalized_host - return friendly_author - - def get_friendly_title(self, content): - if not self.is_valid: - return None - friendly_title = None - - if isinstance(content, str): - title_content_match = TITLE_CONTENT_PATTERN.match(content) - if title_content_match: - # first try page content - friendly_title = title_content_match[1] - if friendly_title is None: - # if no content match, try looking in URL - title_url_match = TITLE_URL_PATTERN.match(self.urlsplit.path) - if title_url_match: - friendly_title = ( - title_url_match[2] - .replace("-", " ") - .replace("_", " ") - .strip() - .title() - ) - if friendly_title is None: - # if still no match, use URL path - 
friendly_title = self.urlsplit.path.lstrip("/") - return friendly_title - def get_default_change_frequency(self, category): if not self.is_valid: return None @@ -399,7 +323,7 @@ class GeminiResource: resource = GeminiResource( url, fully_qualified_parent_url=self.fetchable_url, - parent_hostname=self.urlsplit.hostname, + parent_hostname=self.urlsplit.hostname.lower(), ) if resource.is_valid: resources.append(resource) @@ -410,7 +334,6 @@ class GeminiResource: # constructed from fetchable_url # does not matter if quoted or unquoted so I choose arbitrarily to # standardize on unquoting it. - #normalized_url = property(_get_normalized_url) normalized_host = property(_get_normalized_host) # constructed from urlsplit or raw_url # should be quoted. diff --git a/tests/gus/lib/test_gemini.py b/tests/gus/lib/test_gemini.py @@ -43,27 +43,29 @@ text assert resources[1].raw_url == "other" @pytest.mark.parametrize("test_input,expected_result", [ - (["gemini://gus.guru", None, None], [True, "gemini://gus.guru/", "gus.guru", "gemini://gus.guru/"]), - (["/bar", "gemini://gus.guru/foo", None], [False, None, None, None]), - (["/bar", "gemini://gus.guru/foo/", None], [False, None, None, None]), - (["/bar", "gemini://gus.guru/foo", "gus.guru"], [True, "gemini://gus.guru/bar", "gus.guru", "gemini://gus.guru/bar"]), - (["/bar", "gemini://gus.guru/foo/", "gus.guru"], [True, "gemini://gus.guru/bar", "gus.guru", "gemini://gus.guru/bar"]), - (["bar", "gemini://gus.guru/foo", "gus.guru"], [True, "gemini://gus.guru/bar", "gus.guru", "gemini://gus.guru/bar"]), - (["bar/", "gemini://gus.guru/foo/", "gus.guru"], [True, "gemini://gus.guru/foo/bar/", "gus.guru", "gemini://gus.guru/foo/bar/"]), - (["//foo.com", None, None], [True, "gemini://foo.com/", "foo.com", "gemini://foo.com/"]), - (["gemini://gem.splatt9990.com/index.gmi", None, None], [True, "gemini://gem.splatt9990.com/index.gmi", "gem.splatt9990.com", "gemini://gem.splatt9990.com/index.gmi"] ), - 
(["gemini://gem.splatt9990.com:1965/index.gmi", None, None], [True, "gemini://gem.splatt9990.com/index.gmi", "gem.splatt9990.com", "gemini://gem.splatt9990.com/index.gmi"]), - (["gemini://gem.splatt9990.com:1966/index.gmi", None, None], [True, "gemini://gem.splatt9990.com:1966/index.gmi", "gem.splatt9990.com", "gemini://gem.splatt9990.com:1966/index.gmi"]), - (["gemini://michaelnordmeyer.com", None, None], [True, "gemini://michaelnordmeyer.com/", "michaelnordmeyer.com", "gemini://michaelnordmeyer.com/"]), - (["log.gmi", "gemini://michaelnordmeyer.com/", None], [True, "gemini://michaelnordmeyer.com/log.gmi", "michaelnordmeyer.com", "gemini://michaelnordmeyer.com/log.gmi"]), - (["Log.gmi", "gemini://michaelnordmeyer.com/", None], [True, "gemini://michaelnordmeyer.com/Log.gmi", "michaelnordmeyer.com", "gemini://michaelnordmeyer.com/Log.gmi"]), + (["gemini://gus.guru", None, None], [True, "gus.guru", "gemini://gus.guru/"]), + (["gemini://gus.guru/search?text", None, None], [True, "gus.guru", "gemini://gus.guru/search?text"]), + (["/bar", "gemini://gus.guru/foo", None], [False, None, None]), + (["/bar", "gemini://gus.guru/foo/", None], [False, None, None]), + (["/bar", "gemini://gus.guru/foo", "gus.guru"], [True, "gus.guru", "gemini://gus.guru/bar"]), + (["/bar", "gemini://gus.guru/foo/", "gus.guru"], [True, "gus.guru", "gemini://gus.guru/bar"]), + (["/bar?test", "gemini://gus.guru/foo/", "gus.guru"], [True, "gus.guru", "gemini://gus.guru/bar?test"]), + (["bar", "gemini://gus.guru/foo", "gus.guru"], [True, "gus.guru", "gemini://gus.guru/bar"]), + (["bar/", "gemini://gus.guru/foo/", "gus.guru"], [True, "gus.guru", "gemini://gus.guru/foo/bar/"]), + (["//foo.com", None, None], [True, "foo.com", "gemini://foo.com/"]), + (["gemini://gem.Splatt9990.com/index.gmi", None, None], [True, "gem.splatt9990.com", "gemini://gem.splatt9990.com/index.gmi"] ), + (["gemini://gem.Splatt9990.com:1965/index.gmi", None, None], [True, "gem.splatt9990.com", 
"gemini://gem.splatt9990.com/index.gmi"]), + (["gemini://gem.splatt9990.com:1966/index.gmi", None, None], [True, "gem.splatt9990.com", "gemini://gem.splatt9990.com:1966/index.gmi"]), + (["gemini://MichaelNordmeyer.com", None, None], [True, "michaelnordmeyer.com", "gemini://michaelnordmeyer.com/"]), + (["log.gmi", "gemini://MichaelNordmeyer.com:1965/", None], [True, "michaelnordmeyer.com", "gemini://michaelnordmeyer.com/log.gmi"]), + (["Log.gmi", "gemini://MichaelNordmeyer.com/", None], [True, "michaelnordmeyer.com", "gemini://michaelnordmeyer.com/Log.gmi"]), ]) def test_url_parsing(self, test_input, expected_result): gr = GeminiResource(test_input[0], test_input[1], test_input[2]) assert gr.is_valid == expected_result[0] - assert gr.normalized_host == expected_result[2] - assert gr.fetchable_url == expected_result[3] + assert gr.normalized_host == expected_result[1] + assert gr.fetchable_url == expected_result[2] @pytest.mark.parametrize("test_url,expected_result", [