commit 5bfa28c579ad10ddc5588109229f6093e1e4f7ef
parent 190b9875c17508609978bafe4a88911a93bfc042
Author: Natalie Pendragon <natpen@natpen.net>
Date: Mon, 20 Jul 2020 08:19:03 -0400
[build_index] Build index with backlink_count instead of backlinks
This works because all the actual fetching of backlinks is now handled
by database queries, so we can slim down the Whoosh index a bit with
this change.
Diffstat:
3 files changed, 22 insertions(+), 17 deletions(-)
diff --git a/gus/build_index.py b/gus/build_index.py
@@ -98,7 +98,10 @@ def create_index(index_dir):
signed=False,
stored=True,
),
- backlinks=KEYWORD(
+ backlink_count=NUMERIC(
+ int,
+ 16, # num bits, so max value is 65k
+ signed=False,
stored=True,
),
indexed_at=DATETIME(
@@ -121,6 +124,8 @@ def index_page(page, indexed_urls):
.join(to_page, on=(to_page.id == Link.to_page_id))
.where(to_page.url << [url, f"{url}/"])
.dicts())
+ backlink_urls = [b["url"] for b in backlinks]
+ backlink_count = len(list(set(backlink_urls)))
document = {
"url_id": page.url,
@@ -133,7 +138,7 @@ def index_page(page, indexed_urls):
"lang": page.lang,
"size": page.size,
"indexed_at": page.indexed_at,
- "backlinks": " ".join([b["url"] for b in backlinks]),
+ "backlink_count": backlink_count,
"prompt": page.prompt,
"content": page.content,
}
diff --git a/serve/models.py b/serve/models.py
@@ -46,17 +46,17 @@ class GUS():
return (
len(results),
[{
- "score" : result.score,
- "indexed_at" : result["indexed_at"],
- "url" : result["url"],
- "fetchable_url": result["fetchable_url"],
- "content_type" : result["content_type"],
- "charset" : result["charset"] if "charset" in result else "none",
- "size" : result["size"] if "size" in result else 0,
- "prompt" : result["prompt"] if "prompt" in result else "",
- "highlights" : self.gemini_highlighter.highlight_hit(result, "content", top=1) if "content" in result and result["content_type"] in ["text/plain", "text/gemini", "text/markdown"] else "",
- "link_text" : GUS._get_link_text(result),
- "backlinks" : list(set(result["backlinks"].split(" "))) if "backlinks" in result and result["backlinks"] != "" else [],
+ "score" : result.score,
+ "indexed_at" : result["indexed_at"],
+ "url" : result["url"],
+ "fetchable_url" : result["fetchable_url"],
+ "content_type" : result["content_type"],
+ "charset" : result["charset"] if "charset" in result else "none",
+ "size" : result["size"] if "size" in result else 0,
+ "prompt" : result["prompt"] if "prompt" in result else "",
+ "highlights" : self.gemini_highlighter.highlight_hit(result, "content", top=1) if "content" in result and result["content_type"] in ["text/plain", "text/gemini", "text/markdown"] else "",
+ "link_text" : GUS._get_link_text(result),
+ "backlink_count": result["backlink_count"],
} for result in results],
)
diff --git a/serve/templates/search.gmi b/serve/templates/search.gmi
@@ -13,10 +13,10 @@
{% for result in results %}
=> {{ result["fetchable_url"] }} {{ result["link_text"] }}
-{% if result["backlinks"] | length > 1 %}
-=> /backlinks?{{ result["url"][9:] | urlencode }} {{ result["backlinks"] | length }} backlinks
-{% elif result["backlinks"] | length > 0 %}
-=> /backlinks?{{ result["url"][9:] | urlencode }} {{ result["backlinks"] | length }} backlink
+{% if result["backlink_count"] > 1 %}
+=> /backlinks?{{ result["url"][9:] | urlencode }} {{ result["backlink_count"] }} backlinks
+{% elif result["backlink_count"] > 0 %}
+=> /backlinks?{{ result["url"][9:] | urlencode }} {{ result["backlink_count"] }} backlink
{% endif %}
{% if verbose %}
* Score : {{ "{:.2f}".format(result["score"]) }}