geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit 567916e94091b1053d8c1fbe69732b1d4b9be99f
parent 20b2ccf59fbe3a55055d26535e7394acd0dc400d
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Mon, 25 May 2020 21:44:46 -0400

[crawl] [serve] Add fetchable URL to the index

Diffstat:
M gus/crawl.py | 6 +++++-
M gus/serve.py | 3 ++-
2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/gus/crawl.py b/gus/crawl.py @@ -10,7 +10,7 @@ from urllib.parse import urljoin, uses_relative, uses_netloc import gusmobile as gemini from whoosh.analysis import FancyAnalyzer -from whoosh.fields import Schema, TEXT, DATETIME +from whoosh.fields import Schema, TEXT, DATETIME, STORED from whoosh.filedb.filestore import FileStorage from whoosh.index import create_in, open_dir from whoosh.query import Every @@ -124,6 +124,7 @@ def create_index(index_dir): stored=True, analyzer=UrlAnalyzer(), ), + fetchable_url=STORED(), domain=TEXT( analyzer=UrlAnalyzer(), ), @@ -152,6 +153,7 @@ def index_binary(resource, response): try: index_writer.add_document( url=resource.indexable_url, + fetchable_url=resource.fetchable_url, domain=resource.normalized_host, content_type=response.content_type, indexed_at=datetime.utcnow(), @@ -167,6 +169,7 @@ def index_prompt(resource, response): try: index_writer.add_document( url=resource.indexable_url, + fetchable_url=resource.fetchable_url, domain=resource.normalized_host, content_type="input", prompt=response.prompt, @@ -183,6 +186,7 @@ def index_content(resource, response): try: index_writer.add_document( url=resource.indexable_url, + fetchable_url=resource.fetchable_url, domain=resource.normalized_host, content_type=response.content_type, content=response.content, diff --git a/gus/serve.py b/gus/serve.py @@ -252,6 +252,7 @@ def _search_index(query, requested_page): "score" : result.score, "indexed_at" : result["indexed_at"], "url" : result["url"], + "fetchable_url": result["fetchable_url"], "content_type" : result["content_type"], "prompt" : result["prompt"] if "prompt" in result else "", "highlights" : gemini_highlighter.highlight_hit(result, "content", top=1) if "content" in result and result["content_type"] in ["text/plain", "text/gemini", "text/markdown"] else "", @@ -295,7 +296,7 @@ def _render_results(results, verbose=False): # performant way of stripping of the "gemini://" prefix at the # beginning of each URL. 
link_text = "{} ({}{})".format(result["url"][9:], result["content_type"], prompt_suffix) - data.append("=> {} {}".format(result["url"], link_text)) + data.append("=> {} {}".format(result["fetchable_url"], link_text)) if verbose: data.append("* Score : {:.2f}".format(result["score"])) data.append("* Indexed at : {:%Y-%m-%d %H:%M}".format(result["indexed_at"]))