commit 567916e94091b1053d8c1fbe69732b1d4b9be99f
parent 20b2ccf59fbe3a55055d26535e7394acd0dc400d
Author: Natalie Pendragon <natpen@natpen.net>
Date: Mon, 25 May 2020 21:44:46 -0400
[crawl] [serve] Add fetchable URL to the index
Diffstat:
2 files changed, 7 insertions(+), 2 deletions(-)
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -10,7 +10,7 @@ from urllib.parse import urljoin, uses_relative, uses_netloc
import gusmobile as gemini
from whoosh.analysis import FancyAnalyzer
-from whoosh.fields import Schema, TEXT, DATETIME
+from whoosh.fields import Schema, TEXT, DATETIME, STORED
from whoosh.filedb.filestore import FileStorage
from whoosh.index import create_in, open_dir
from whoosh.query import Every
@@ -124,6 +124,7 @@ def create_index(index_dir):
stored=True,
analyzer=UrlAnalyzer(),
),
+ fetchable_url=STORED(),
domain=TEXT(
analyzer=UrlAnalyzer(),
),
@@ -152,6 +153,7 @@ def index_binary(resource, response):
try:
index_writer.add_document(
url=resource.indexable_url,
+ fetchable_url=resource.fetchable_url,
domain=resource.normalized_host,
content_type=response.content_type,
indexed_at=datetime.utcnow(),
@@ -167,6 +169,7 @@ def index_prompt(resource, response):
try:
index_writer.add_document(
url=resource.indexable_url,
+ fetchable_url=resource.fetchable_url,
domain=resource.normalized_host,
content_type="input",
prompt=response.prompt,
@@ -183,6 +186,7 @@ def index_content(resource, response):
try:
index_writer.add_document(
url=resource.indexable_url,
+ fetchable_url=resource.fetchable_url,
domain=resource.normalized_host,
content_type=response.content_type,
content=response.content,
diff --git a/gus/serve.py b/gus/serve.py
@@ -252,6 +252,7 @@ def _search_index(query, requested_page):
"score" : result.score,
"indexed_at" : result["indexed_at"],
"url" : result["url"],
+ "fetchable_url": result["fetchable_url"],
"content_type" : result["content_type"],
"prompt" : result["prompt"] if "prompt" in result else "",
"highlights" : gemini_highlighter.highlight_hit(result, "content", top=1) if "content" in result and result["content_type"] in ["text/plain", "text/gemini", "text/markdown"] else "",
@@ -295,7 +296,7 @@ def _render_results(results, verbose=False):
# performant way of stripping of the "gemini://" prefix at the
# beginning of each URL.
link_text = "{} ({}{})".format(result["url"][9:], result["content_type"], prompt_suffix)
- data.append("=> {} {}".format(result["url"], link_text))
+ data.append("=> {} {}".format(result["fetchable_url"], link_text))
if verbose:
data.append("* Score : {:.2f}".format(result["score"]))
data.append("* Indexed at : {:%Y-%m-%d %H:%M}".format(result["indexed_at"]))