geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit 5bfa28c579ad10ddc5588109229f6093e1e4f7ef
parent 190b9875c17508609978bafe4a88911a93bfc042
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Mon, 20 Jul 2020 08:19:03 -0400

[build_index] Build index with backlink_count instead of backlinks

This works because all the actual fetching of backlinks is now handled
by database queries, so we can slim down the whoosh index a bit with
this change.

Diffstat:
M gus/build_index.py | 9 +++++++--
M serve/models.py | 22 +++++++++++-----------
M serve/templates/search.gmi | 8 ++++----
3 files changed, 22 insertions(+), 17 deletions(-)

diff --git a/gus/build_index.py b/gus/build_index.py
@@ -98,7 +98,10 @@ def create_index(index_dir):
             signed=False,
             stored=True,
         ),
-        backlinks=KEYWORD(
+        backlink_count=NUMERIC(
+            int,
+            16, # num bits, so max value is 65k
+            signed=False,
             stored=True,
         ),
         indexed_at=DATETIME(
@@ -121,6 +124,8 @@ def index_page(page, indexed_urls):
                  .join(to_page, on=(to_page.id == Link.to_page_id))
                  .where(to_page.url << [url, f"{url}/"])
                  .dicts())
+    backlink_urls = [b["url"] for b in backlinks]
+    backlink_count = len(list(set(backlink_urls)))
 
     document = {
         "url_id": page.url,
@@ -133,7 +138,7 @@ def index_page(page, indexed_urls):
         "lang": page.lang,
         "size": page.size,
         "indexed_at": page.indexed_at,
-        "backlinks": " ".join([b["url"] for b in backlinks]),
+        "backlink_count": backlink_count,
         "prompt": page.prompt,
         "content": page.content,
     }
diff --git a/serve/models.py b/serve/models.py
@@ -46,17 +46,17 @@ class GUS():
         return (
             len(results),
             [{
-                "score"        : result.score,
-                "indexed_at"   : result["indexed_at"],
-                "url"          : result["url"],
-                "fetchable_url": result["fetchable_url"],
-                "content_type" : result["content_type"],
-                "charset"      : result["charset"] if "charset" in result else "none",
-                "size"         : result["size"] if "size" in result else 0,
-                "prompt"       : result["prompt"] if "prompt" in result else "",
-                "highlights"   : self.gemini_highlighter.highlight_hit(result, "content", top=1) if "content" in result and result["content_type"] in ["text/plain", "text/gemini", "text/markdown"] else "",
-                "link_text"    : GUS._get_link_text(result),
-                "backlinks"    : list(set(result["backlinks"].split(" "))) if "backlinks" in result and result["backlinks"] != "" else [],
+                "score"         : result.score,
+                "indexed_at"    : result["indexed_at"],
+                "url"           : result["url"],
+                "fetchable_url" : result["fetchable_url"],
+                "content_type"  : result["content_type"],
+                "charset"       : result["charset"] if "charset" in result else "none",
+                "size"          : result["size"] if "size" in result else 0,
+                "prompt"        : result["prompt"] if "prompt" in result else "",
+                "highlights"    : self.gemini_highlighter.highlight_hit(result, "content", top=1) if "content" in result and result["content_type"] in ["text/plain", "text/gemini", "text/markdown"] else "",
+                "link_text"     : GUS._get_link_text(result),
+                "backlink_count": result["backlink_count"],
             } for result in results],
         )
 
diff --git a/serve/templates/search.gmi b/serve/templates/search.gmi
@@ -13,10 +13,10 @@
 
 {% for result in results %}
 => {{ result["fetchable_url"] }} {{ result["link_text"] }}
-{% if result["backlinks"] | length > 1 %}
-=> /backlinks?{{ result["url"][9:] | urlencode }} {{ result["backlinks"] | length }} backlinks
-{% elif result["backlinks"] | length > 0 %}
-=> /backlinks?{{ result["url"][9:] | urlencode }} {{ result["backlinks"] | length }} backlink
+{% if result["backlink_count"] > 1 %}
+=> /backlinks?{{ result["url"][9:] | urlencode }} {{ result["backlink_count"] }} backlinks
+{% elif result["backlink_count"] > 0 %}
+=> /backlinks?{{ result["url"][9:] | urlencode }} {{ result["backlink_count"] }} backlink
 {% endif %}
 {% if verbose %}
 * Score : {{ "{:.2f}".format(result["score"]) }}