commit dd1c2ffdef2f69a091d719f52bf2b27ee469ee27
parent 78ca450d9f5c7b3557834cea1a9a467d7508a401
Author: Natalie Pendragon <natpen@natpen.net>
Date: Wed, 20 May 2020 09:33:58 -0400
[crawl] Index massaged URLs
Up to this point, we were indexing the URL from the gemini response
object. Instead, let's index something that's been a bit more
normalized and cleansed. We want to keep the capitalization, but strip
unnecessary ports and trailing slashes.
Diffstat:
2 files changed, 27 insertions(+), 13 deletions(-)
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -137,13 +137,13 @@ def create_index(index_dir):
index_storage.create_index(schema)
-def index_binary(response):
+def index_binary(resource):
print("INDEXING BINARY...")
index_writer = index_storage.open_index().writer()
try:
index_writer.add_document(
- url=response.url,
- content_type=response.content_type,
+ url=resource.fully_qualified_massaged_url,
+ content_type=resource.response.content_type,
indexed_at=datetime.utcnow(),
)
index_writer.commit()
@@ -151,14 +151,14 @@ def index_binary(response):
index_writer.cancel()
-def index_prompt(response):
+def index_prompt(resource):
print("INDEXING PROMPT...")
index_writer = index_storage.open_index().writer()
try:
index_writer.add_document(
- url=response.url,
+ url=resource.fully_qualified_massaged_url,
content_type="input",
- prompt=response.prompt,
+ prompt=resource.response.prompt,
indexed_at=datetime.utcnow(),
)
index_writer.commit()
@@ -166,14 +166,14 @@ def index_prompt(response):
index_writer.cancel()
-def index_content(response):
+def index_content(resource):
print("INDEXING CONTENT...")
index_writer = index_storage.open_index().writer()
try:
index_writer.add_document(
- url=response.url,
- content_type=response.content_type,
- content=response.content,
+ url=resource.fully_qualified_massaged_url,
+ content_type=resource.response.content_type,
+ content=resource.response.content,
indexed_at=datetime.utcnow(),
)
index_writer.commit()
@@ -272,7 +272,7 @@ def crawl(gemini_resource):
print("URL : %s" % r.url)
print("STATUS : %s" % r.status)
print("PROMPT : %s" % r.prompt)
- index_prompt(r)
+ index_prompt(gr)
print("--------------------------")
elif r.status.startswith("2"):
# success status
@@ -280,13 +280,13 @@ def crawl(gemini_resource):
print("STATUS : %s" % r.status)
print("CONTENT TYPE : %s" % r.content_type)
if r.content_type.startswith("text/"):
- index_content(r)
+ index_content(gr)
print("--------------------------")
contained_resources = gr.extract_contained_resources()
for resource in contained_resources:
crawl(resource)
else:
- index_binary(r)
+ index_binary(gr)
print("--------------------------")
else:
# input, error, etc (all other statuses)
diff --git a/gus/lib/gemini.py b/gus/lib/gemini.py
@@ -61,6 +61,7 @@ class GeminiResource():
self._normalized_url = None
self._normalized_host = None
self._fully_qualified_url = None
+ self._fully_qualified_massaged_url = None
self.contained_resources = None
@@ -98,9 +99,22 @@ class GeminiResource():
return self._fully_qualified_url
+    def _get_fully_qualified_massaged_url(self):
+        """Return the cleaned-up form of this resource's fully qualified URL.
+
+        The URL is percent-decoded and, when the parsed port is 1965, the
+        first "<host>:1965" occurrence is reduced to "<host>". Capitalization
+        of the rest of the URL is left as-is. The result is computed once and
+        cached on the instance.
+
+        Returns:
+            The massaged URL string, or None when the resource is not valid.
+        """
+        if not self.is_valid:
+            return None
+        if self._fully_qualified_massaged_url is None:
+            # unquote() returns strings without "%" unchanged, so no
+            # pre-check is required before calling it.
+            massaged_url = unquote(self.fully_qualified_url)
+            if self.urlsplit.port == 1965:
+                host = self.urlsplit.hostname.lower()
+                # Strip the port from the string being massaged so the
+                # decoded value built above is what ends up cached.
+                # NOTE(review): the match is case-sensitive against the
+                # lowercased host, so a URL whose host is written with
+                # capitals keeps its ":1965" - confirm this is intended.
+                massaged_url = massaged_url.replace(host + ":1965", host, 1)
+            self._fully_qualified_massaged_url = massaged_url
+        return self._fully_qualified_massaged_url
+
normalized_url = property(_get_normalized_url)
normalized_host = property(_get_normalized_host)
fully_qualified_url = property(_get_fully_qualified_url)
+ fully_qualified_massaged_url = property(_get_fully_qualified_massaged_url)
def fetch(self):
# NB: this intentionally does NOT fetch the normalized URL, because that could