geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit dd1c2ffdef2f69a091d719f52bf2b27ee469ee27
parent 78ca450d9f5c7b3557834cea1a9a467d7508a401
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Wed, 20 May 2020 09:33:58 -0400

[crawl] Index massaged URLs

Up to this point, we were indexing the URL from the gemini response
object. Instead, let's index something that's been a bit more
normalized and cleansed. We want to keep the capitalization, but strip
unnecessary ports and trailing slashes.

Diffstat:
M gus/crawl.py | 26 +++++++++++++-------------
M gus/lib/gemini.py | 14 ++++++++++++++
2 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/gus/crawl.py b/gus/crawl.py @@ -137,13 +137,13 @@ def create_index(index_dir): index_storage.create_index(schema) -def index_binary(response): +def index_binary(resource): print("INDEXING BINARY...") index_writer = index_storage.open_index().writer() try: index_writer.add_document( - url=response.url, - content_type=response.content_type, + url=resource.fully_qualified_massaged_url, + content_type=resource.response.content_type, indexed_at=datetime.utcnow(), ) index_writer.commit() @@ -151,14 +151,14 @@ def index_binary(response): index_writer.cancel() -def index_prompt(response): +def index_prompt(resource): print("INDEXING PROMPT...") index_writer = index_storage.open_index().writer() try: index_writer.add_document( - url=response.url, + url=resource.fully_qualified_massaged_url, content_type="input", - prompt=response.prompt, + prompt=resource.response.prompt, indexed_at=datetime.utcnow(), ) index_writer.commit() @@ -166,14 +166,14 @@ def index_prompt(response): index_writer.cancel() -def index_content(response): +def index_content(resource): print("INDEXING CONTENT...") index_writer = index_storage.open_index().writer() try: index_writer.add_document( - url=response.url, - content_type=response.content_type, - content=response.content, + url=resource.fully_qualified_massaged_url, + content_type=resource.response.content_type, + content=resource.response.content, indexed_at=datetime.utcnow(), ) index_writer.commit() @@ -272,7 +272,7 @@ def crawl(gemini_resource): print("URL : %s" % r.url) print("STATUS : %s" % r.status) print("PROMPT : %s" % r.prompt) - index_prompt(r) + index_prompt(gr) print("--------------------------") elif r.status.startswith("2"): # success status @@ -280,13 +280,13 @@ def crawl(gemini_resource): print("STATUS : %s" % r.status) print("CONTENT TYPE : %s" % r.content_type) if r.content_type.startswith("text/"): - index_content(r) + index_content(gr) print("--------------------------") contained_resources = 
gr.extract_contained_resources() for resource in contained_resources: crawl(resource) else: - index_binary(r) + index_binary(gr) print("--------------------------") else: # input, error, etc (all other statuses) diff --git a/gus/lib/gemini.py b/gus/lib/gemini.py @@ -61,6 +61,7 @@ class GeminiResource(): self._normalized_url = None self._normalized_host = None self._fully_qualified_url = None + self._fully_qualified_massaged_url = None self.contained_resources = None @@ -98,9 +99,22 @@ class GeminiResource(): return self._fully_qualified_url + def _get_fully_qualified_massaged_url(self): + if not self.is_valid: + return None + if self._fully_qualified_massaged_url is None: + fully_qualified_massaged_url = self.fully_qualified_url + if "%" in fully_qualified_massaged_url: + fully_qualified_massaged_url = unquote(fully_qualified_massaged_url) + if self.urlsplit.port == 1965: + fully_qualified_massaged_url = fully_qualified_massaged_url.replace(self.urlsplit.hostname.lower() + ":1965", self.urlsplit.hostname.lower(), 1) + self._fully_qualified_massaged_url = fully_qualified_massaged_url + return self._fully_qualified_massaged_url + normalized_url = property(_get_normalized_url) normalized_host = property(_get_normalized_host) fully_qualified_url = property(_get_fully_qualified_url) + fully_qualified_massaged_url = property(_get_fully_qualified_massaged_url) def fetch(self): # NB: this intentionally does NOT fetch the normalized URL, because that could