commit 96731d16d3db6b277b5dacfcee6d4f4b91449a9c
parent a7ea73424895a1e73d0bcc7ea2dc1e6d28257ec6
Author: Natalie Pendragon <natpen@natpen.net>
Date: Fri, 22 May 2020 08:42:03 -0400
[serve] Update highlight scoring and rendering
Diffstat:
2 files changed, 9 insertions(+), 4 deletions(-)
diff --git a/gus/lib/whoosh_extensions.py b/gus/lib/whoosh_extensions.py
@@ -1,3 +1,4 @@
+import math
import re
from urllib.parse import urlparse
@@ -32,7 +33,7 @@ class GeminiFormatter(highlight.Formatter):
# Return the text as you want it to appear in the highlighted
# string
- return ":%s:" % tokentext
+ return "%s" % tokentext
def format_fragment(self, fragment, replace=False):
@@ -69,6 +70,7 @@ class GeminiFormatter(highlight.Formatter):
special_char_pattern = re.compile("[^\w\s,\.;-\?\!']")
+link_pattern = re.compile("://|=>")
class GeminiScorer(highlight.FragmentScorer):
def __call__(self, f):
@@ -86,6 +88,9 @@ class GeminiScorer(highlight.FragmentScorer):
# lower quality for actual searches for source code, but that is a very
# small minority of searches in the current state of things).
num_special_chars = len(special_char_pattern.findall(f.text[f.startchar:f.endchar]))
- score -= 10 * num_special_chars
+ score -= 4 * num_special_chars + math.pow(num_special_chars, 1.5)
- return max(1, score)
+ num_links = len(link_pattern.findall(f.text[f.startchar:f.endchar]))
+ score -= 30 * num_links
+
+ return max(0, score)
diff --git a/gus/serve.py b/gus/serve.py
@@ -25,7 +25,7 @@ INDEX_DIR = "index"
app = jetforce.JetforceApplication()
gemini_highlighter = highlight.Highlighter(
formatter=GeminiFormatter(),
- fragmenter=highlight.ContextFragmenter(maxchars=120, surround=60),
+ fragmenter=highlight.ContextFragmenter(maxchars=160, surround=80),
scorer=GeminiScorer(),
order=highlight.SCORE,
)