geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit 96731d16d3db6b277b5dacfcee6d4f4b91449a9c
parent a7ea73424895a1e73d0bcc7ea2dc1e6d28257ec6
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Fri, 22 May 2020 08:42:03 -0400

[serve] Update highlight scoring and rendering

Diffstat:
M gus/lib/whoosh_extensions.py | 11 ++++++++---
M gus/serve.py | 2 +-
2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/gus/lib/whoosh_extensions.py b/gus/lib/whoosh_extensions.py
@@ -1,3 +1,4 @@
+import math
 import re
 from urllib.parse import urlparse
 
@@ -32,7 +33,7 @@ class GeminiFormatter(highlight.Formatter):
 
         # Return the text as you want it to appear in the highlighted
         # string
-        return ":%s:" % tokentext
+        return "%s" % tokentext
 
     def format_fragment(self, fragment, replace=False):
 
@@ -69,6 +70,7 @@ class GeminiFormatter(highlight.Formatter):
 
 
 special_char_pattern = re.compile("[^\w\s,\.;-\?\!']")
+link_pattern = re.compile("://|=>")
 
 class GeminiScorer(highlight.FragmentScorer):
     def __call__(self, f):
@@ -86,6 +88,9 @@ class GeminiScorer(highlight.FragmentScorer):
         # lower quality for actual searches for source code, but that is a very
         # small minority of searches in the current state of things).
         num_special_chars = len(special_char_pattern.findall(f.text[f.startchar:f.endchar]))
-        score -= 10 * num_special_chars
+        score -= 4 * num_special_chars + math.pow(num_special_chars, 1.5)
 
-        return max(1, score)
+        num_links = len(link_pattern.findall(f.text[f.startchar:f.endchar]))
+        score -= 30 * num_links
+
+        return max(0, score)
diff --git a/gus/serve.py b/gus/serve.py
@@ -25,7 +25,7 @@ INDEX_DIR = "index"
 app = jetforce.JetforceApplication()
 gemini_highlighter = highlight.Highlighter(
     formatter=GeminiFormatter(),
-    fragmenter=highlight.ContextFragmenter(maxchars=120, surround=60),
+    fragmenter=highlight.ContextFragmenter(maxchars=160, surround=80),
     scorer=GeminiScorer(),
     order=highlight.SCORE,
 )