commit cfb30cb4beeba71b2da0033439a4e88ace130561
parent 54803f61105bc2da0251fcb4006b25bc9b75c9dd
Author: René Wagner <rwa@clttr.info>
Date: Sun, 30 Jul 2023 12:07:00 +0200
rework fetchable_url generation
Diffstat:
3 files changed, 28 insertions(+), 114 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -138,17 +138,6 @@ dmypy.json
/*.crt
/*.key
-.bash_history
-.bashrc
-.config/
-.gitconfig
-.local/
-.poetry/
poetry.lock
-.sqlite_history
-.ssh/
-.vim/
-.viminfo
-.vimrc
-.profile
crawl.lock
+*.swp
diff --git a/gus/lib/gemini.py b/gus/lib/gemini.py
@@ -131,7 +131,6 @@ class GeminiResource:
)
self.is_valid = self.urlsplit is not None
self.fully_qualified_parent_url = fully_qualified_parent_url
- self._normalized_url = None
self._normalized_host = None
self._normalized_host_like = None
self._fetchable_url = None
@@ -177,22 +176,11 @@ class GeminiResource:
is_relative = True
return u, is_relative
- def _get_normalized_url(self):
- if not self.is_valid:
- return None
- if self._normalized_url is None:
- url_normalized = unquote(self.fetchable_url)
- self._normalized_url = url_normalized.replace(
- self.urlsplit.hostname + ":1965",
- self.urlsplit.hostname.lower(),
- 1)
- return self._normalized_url
-
def _get_normalized_host(self):
if not self.is_valid:
return None
if self._normalized_host is None:
- self._normalized_host = self.urlsplit.hostname.lower()
+ self._normalized_host = self.urlsplit.hostname
return self._normalized_host
def _get_normalized_host_like(self):
@@ -210,29 +198,13 @@ class GeminiResource:
if not self.is_valid:
return None
if self._fetchable_url is None:
- if self.is_relative:
- # leave off fragment portion of urlsplit at [4]
- urlsplit_parts = list(self.urlsplit[:4])
- urlsplit_parts.append("")
-
- url = urlunsplit(urlsplit_parts)
- else:
- raw_url_lower = self.raw_url.lower()
- if raw_url_lower.startswith("gemini://"):
- url = self.raw_url
- elif raw_url_lower.startswith("//"):
- url = "gemini:{}".format(self.raw_url)
- else:
- url = "gemini://{}".format(self.raw_url)
- # leave off fragment portion of urlsplit at [4]
- if self.urlsplit[2] == "":
- url = url + "/"
- if self.urlsplit[4] != "":
- url = url.replace("#{}".format(self.urlsplit[4]), "")
-
- self._fetchable_url = url.replace(
- self.urlsplit.hostname + ":1965",
- self.urlsplit.hostname.lower(), 1)
+ # we deliberately do not work with the fragment part
+ self._fetchable_url = "{}://{}{}{}{}".format(
+ self.urlsplit.scheme,
+ self.urlsplit.hostname,
+ "" if self.urlsplit.port is None or self.urlsplit.port == 1965 else ":{}".format(self.urlsplit.port),
+ "/" if self.urlsplit.path == "" else self.urlsplit.path,
+ "" if self.urlsplit.query == "" else "?{}".format(self.urlsplit.query))
return self._fetchable_url
def _get_is_root_like(self):
@@ -280,54 +252,6 @@ class GeminiResource:
self._is_log_post_like = is_log_post_like
return self._is_log_post_like
- def get_friendly_author(self, content):
- if not self.is_valid:
- return None
- friendly_author = None
- author_url_match = AUTHOR_URL_PATTERN.match(self.urlsplit.path)
- if author_url_match:
- # first check url
- if author_url_match[1]:
- friendly_author = author_url_match[1]
- elif author_url_match[2]:
- friendly_author = author_url_match[2]
- if friendly_author is None:
- # if no URL match, try looking in page content
- if isinstance(content, str):
- author_content_match = AUTHOR_CONTENT_PATTERN.match(content)
- if author_content_match:
- friendly_author = author_content_match[1]
- if friendly_author is None:
- # if still no match, use normalized host
- friendly_author = self.normalized_host
- return friendly_author
-
- def get_friendly_title(self, content):
- if not self.is_valid:
- return None
- friendly_title = None
-
- if isinstance(content, str):
- title_content_match = TITLE_CONTENT_PATTERN.match(content)
- if title_content_match:
- # first try page content
- friendly_title = title_content_match[1]
- if friendly_title is None:
- # if no content match, try looking in URL
- title_url_match = TITLE_URL_PATTERN.match(self.urlsplit.path)
- if title_url_match:
- friendly_title = (
- title_url_match[2]
- .replace("-", " ")
- .replace("_", " ")
- .strip()
- .title()
- )
- if friendly_title is None:
- # if still no match, use URL path
- friendly_title = self.urlsplit.path.lstrip("/")
- return friendly_title
-
def get_default_change_frequency(self, category):
if not self.is_valid:
return None
@@ -399,7 +323,7 @@ class GeminiResource:
resource = GeminiResource(
url,
fully_qualified_parent_url=self.fetchable_url,
- parent_hostname=self.urlsplit.hostname,
+ parent_hostname=self.urlsplit.hostname.lower(),
)
if resource.is_valid:
resources.append(resource)
@@ -410,7 +334,6 @@ class GeminiResource:
# constructed from fetchable_url
# does not matter if quoted or unquoted so I choose arbitrarily to
# standardize on unquoting it.
- #normalized_url = property(_get_normalized_url)
normalized_host = property(_get_normalized_host)
# constructed from urlsplit or raw_url
# should be quoted.
diff --git a/tests/gus/lib/test_gemini.py b/tests/gus/lib/test_gemini.py
@@ -43,27 +43,29 @@ text
assert resources[1].raw_url == "other"
@pytest.mark.parametrize("test_input,expected_result", [
- (["gemini://gus.guru", None, None], [True, "gemini://gus.guru/", "gus.guru", "gemini://gus.guru/"]),
- (["/bar", "gemini://gus.guru/foo", None], [False, None, None, None]),
- (["/bar", "gemini://gus.guru/foo/", None], [False, None, None, None]),
- (["/bar", "gemini://gus.guru/foo", "gus.guru"], [True, "gemini://gus.guru/bar", "gus.guru", "gemini://gus.guru/bar"]),
- (["/bar", "gemini://gus.guru/foo/", "gus.guru"], [True, "gemini://gus.guru/bar", "gus.guru", "gemini://gus.guru/bar"]),
- (["bar", "gemini://gus.guru/foo", "gus.guru"], [True, "gemini://gus.guru/bar", "gus.guru", "gemini://gus.guru/bar"]),
- (["bar/", "gemini://gus.guru/foo/", "gus.guru"], [True, "gemini://gus.guru/foo/bar/", "gus.guru", "gemini://gus.guru/foo/bar/"]),
- (["//foo.com", None, None], [True, "gemini://foo.com/", "foo.com", "gemini://foo.com/"]),
- (["gemini://gem.splatt9990.com/index.gmi", None, None], [True, "gemini://gem.splatt9990.com/index.gmi", "gem.splatt9990.com", "gemini://gem.splatt9990.com/index.gmi"] ),
- (["gemini://gem.splatt9990.com:1965/index.gmi", None, None], [True, "gemini://gem.splatt9990.com/index.gmi", "gem.splatt9990.com", "gemini://gem.splatt9990.com/index.gmi"]),
- (["gemini://gem.splatt9990.com:1966/index.gmi", None, None], [True, "gemini://gem.splatt9990.com:1966/index.gmi", "gem.splatt9990.com", "gemini://gem.splatt9990.com:1966/index.gmi"]),
- (["gemini://michaelnordmeyer.com", None, None], [True, "gemini://michaelnordmeyer.com/", "michaelnordmeyer.com", "gemini://michaelnordmeyer.com/"]),
- (["log.gmi", "gemini://michaelnordmeyer.com/", None], [True, "gemini://michaelnordmeyer.com/log.gmi", "michaelnordmeyer.com", "gemini://michaelnordmeyer.com/log.gmi"]),
- (["Log.gmi", "gemini://michaelnordmeyer.com/", None], [True, "gemini://michaelnordmeyer.com/Log.gmi", "michaelnordmeyer.com", "gemini://michaelnordmeyer.com/Log.gmi"]),
+ (["gemini://gus.guru", None, None], [True, "gus.guru", "gemini://gus.guru/"]),
+ (["gemini://gus.guru/search?text", None, None], [True, "gus.guru", "gemini://gus.guru/search?text"]),
+ (["/bar", "gemini://gus.guru/foo", None], [False, None, None]),
+ (["/bar", "gemini://gus.guru/foo/", None], [False, None, None]),
+ (["/bar", "gemini://gus.guru/foo", "gus.guru"], [True, "gus.guru", "gemini://gus.guru/bar"]),
+ (["/bar", "gemini://gus.guru/foo/", "gus.guru"], [True, "gus.guru", "gemini://gus.guru/bar"]),
+ (["/bar?test", "gemini://gus.guru/foo/", "gus.guru"], [True, "gus.guru", "gemini://gus.guru/bar?test"]),
+ (["bar", "gemini://gus.guru/foo", "gus.guru"], [True, "gus.guru", "gemini://gus.guru/bar"]),
+ (["bar/", "gemini://gus.guru/foo/", "gus.guru"], [True, "gus.guru", "gemini://gus.guru/foo/bar/"]),
+ (["//foo.com", None, None], [True, "foo.com", "gemini://foo.com/"]),
+ (["gemini://gem.Splatt9990.com/index.gmi", None, None], [True, "gem.splatt9990.com", "gemini://gem.splatt9990.com/index.gmi"] ),
+ (["gemini://gem.Splatt9990.com:1965/index.gmi", None, None], [True, "gem.splatt9990.com", "gemini://gem.splatt9990.com/index.gmi"]),
+ (["gemini://gem.splatt9990.com:1966/index.gmi", None, None], [True, "gem.splatt9990.com", "gemini://gem.splatt9990.com:1966/index.gmi"]),
+ (["gemini://MichaelNordmeyer.com", None, None], [True, "michaelnordmeyer.com", "gemini://michaelnordmeyer.com/"]),
+ (["log.gmi", "gemini://MichaelNordmeyer.com:1965/", None], [True, "michaelnordmeyer.com", "gemini://michaelnordmeyer.com/log.gmi"]),
+ (["Log.gmi", "gemini://MichaelNordmeyer.com/", None], [True, "michaelnordmeyer.com", "gemini://michaelnordmeyer.com/Log.gmi"]),
])
def test_url_parsing(self, test_input, expected_result):
gr = GeminiResource(test_input[0], test_input[1], test_input[2])
assert gr.is_valid == expected_result[0]
- assert gr.normalized_host == expected_result[2]
- assert gr.fetchable_url == expected_result[3]
+ assert gr.normalized_host == expected_result[1]
+ assert gr.fetchable_url == expected_result[2]
@pytest.mark.parametrize("test_url,expected_result", [