geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit dd1c2ffdef2f69a091d719f52bf2b27ee469ee27
parent 78ca450d9f5c7b3557834cea1a9a467d7508a401
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Wed, 20 May 2020 09:33:58 -0400

[crawl] Index massaged URLs

Up to this point, we were indexing the URL from the gemini response
object. Instead, let's index something that's been a bit more
normalized and cleansed. We want to keep the capitalization, but strip
unnecessary ports and trailing slashes.

Diffstat:
M gus/crawl.py | 26 +++++++++++++-------------
M gus/lib/gemini.py | 14 ++++++++++++++
2 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/gus/crawl.py b/gus/crawl.py @@ -137,13 +137,13 @@ def create_index(index_dir): index_storage.create_index(schema) -def index_binary(response): +def index_binary(resource): print("INDEXING BINARY...") index_writer = index_storage.open_index().writer() try: index_writer.add_document( - url=response.url, - content_type=response.content_type, + url=resource.fully_qualified_massaged_url, + content_type=resource.response.content_type, indexed_at=datetime.utcnow(), ) index_writer.commit() @@ -151,14 +151,14 @@ def index_binary(response): index_writer.cancel() -def index_prompt(response): +def index_prompt(resource): print("INDEXING PROMPT...") index_writer = index_storage.open_index().writer() try: index_writer.add_document( - url=response.url, + url=resource.fully_qualified_massaged_url, content_type="input", - prompt=response.prompt, + prompt=resource.response.prompt, indexed_at=datetime.utcnow(), ) index_writer.commit() @@ -166,14 +166,14 @@ def index_prompt(response): index_writer.cancel() -def index_content(response): +def index_content(resource): print("INDEXING CONTENT...") index_writer = index_storage.open_index().writer() try: index_writer.add_document( - url=response.url, - content_type=response.content_type, - content=response.content, + url=resource.fully_qualified_massaged_url, + content_type=resource.response.content_type, + content=resource.response.content, indexed_at=datetime.utcnow(), ) index_writer.commit() @@ -272,7 +272,7 @@ def crawl(gemini_resource): print("URL : %s" % r.url) print("STATUS : %s" % r.status) print("PROMPT : %s" % r.prompt) - index_prompt(r) + index_prompt(gr) print("--------------------------") elif r.status.startswith("2"): # success status @@ -280,13 +280,13 @@ def crawl(gemini_resource): print("STATUS : %s" % r.status) print("CONTENT TYPE : %s" % r.content_type) if r.content_type.startswith("text/"): - index_content(r) + index_content(gr) print("--------------------------") contained_resources = 
gr.extract_contained_resources() for resource in contained_resources: crawl(resource) else: - index_binary(r) + index_binary(gr) print("--------------------------") else: # input, error, etc (all other statuses) diff --git a/gus/lib/gemini.py b/gus/lib/gemini.py @@ -61,6 +61,7 @@ class GeminiResource(): self._normalized_url = None self._normalized_host = None self._fully_qualified_url = None + self._fully_qualified_massaged_url = None self.contained_resources = None @@ -98,9 +99,22 @@ class GeminiResource(): return self._fully_qualified_url + def _get_fully_qualified_massaged_url(self): + if not self.is_valid: + return None + if self._fully_qualified_massaged_url is None: + fully_qualified_massaged_url = self.fully_qualified_url + if "%" in fully_qualified_massaged_url: + fully_qualified_massaged_url = unquote(fully_qualified_massaged_url) + if self.urlsplit.port == 1965: + fully_qualified_massaged_url = fully_qualified_massaged_url.replace(self.urlsplit.hostname.lower() + ":1965", self.urlsplit.hostname.lower(), 1) + self._fully_qualified_massaged_url = fully_qualified_massaged_url + return self._fully_qualified_massaged_url + normalized_url = property(_get_normalized_url) normalized_host = property(_get_normalized_host) fully_qualified_url = property(_get_fully_qualified_url) + fully_qualified_massaged_url = property(_get_fully_qualified_massaged_url) def fetch(self): # NB: this intentionally does NOT fetch the normalized URL, because that could