commit fbc302284a81e74f81369ccc9be6e098e4b728e8
parent dd1c2ffdef2f69a091d719f52bf2b27ee469ee27
Author: Natalie Pendragon <natpen@natpen.net>
Date: Thu, 21 May 2020 06:45:28 -0400
[crawl] [serve] Add search highlights
Diffstat:
3 files changed, 106 insertions(+), 7 deletions(-)
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -125,6 +125,7 @@ def create_index(index_dir):
content=TEXT(
analyzer=FancyAnalyzer(),
spelling=True,
+ stored=True,
),
prompt=TEXT(
analyzer=FancyAnalyzer(),
@@ -323,7 +324,7 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
global index_storage
index_storage = FileStorage(index_dir)
if should_run_destructive:
- # backup_old_index(INDEX_DIR_CURRENT, INDEX_DIR_BACKUP)
+ backup_old_index(INDEX_DIR_CURRENT, INDEX_DIR_BACKUP)
create_index(index_dir)
global visited_urls
diff --git a/gus/lib/whoosh_extensions.py b/gus/lib/whoosh_extensions.py
@@ -1,5 +1,8 @@
+import re
+
from urllib.parse import urlparse
+from whoosh import highlight
from whoosh.analysis import IntraWordFilter, LowercaseFilter, RegexTokenizer, StemFilter
@@ -13,3 +16,76 @@ def UrlAnalyzer():
"""
return RegexTokenizer(expression=":1965|^gemini://|[/\.\?]", gaps=True) | IntraWordFilter() | LowercaseFilter() | StemFilter()
+
+
+class GeminiFormatter(highlight.Formatter):
+ """Puts quotes around the fragments, and then splits them by
+ line and formats the lines as a bulleted list.
+ """
+
+ between = "<<HL-SPLIT>>"
+
+ def format_token(self, text, token, replace=False):
+ # Use the get_text function to get the text corresponding to the
+ # token
+ tokentext = highlight.get_text(text, token, replace)
+
+ # Return the text as you want it to appear in the highlighted
+ # string
+ return ":%s:" % tokentext
+
+
+ def format_fragment(self, fragment, replace=False):
+ """Returns a formatted version of the given text, using the "token"
+ objects in the given :class:`Fragment`.
+
+ :param fragment: a :class:`Fragment` object representing a list of
+ matches in the text.
+ :param replace: if True, the original text corresponding to each
+ match will be replaced with the value of the token object's
+ ``text`` attribute.
+ """
+
+ output = ["* ..."]
+ index = fragment.startchar
+ text = fragment.text
+
+ for t in fragment.matches:
+ if t.startchar is None:
+ continue
+ if t.startchar < index:
+ continue
+ if t.startchar > index:
+ output.append(self._text(text[index:t.startchar]))
+ output.append(self.format_token(text, t, replace))
+ index = t.endchar
+ output.append(self._text(text[index:fragment.endchar]))
+ output.append("...")
+
+ out_string = "".join(output)
+ out_string = out_string.replace("\n", " ").replace('\r', ' ')
+ out_string = ' '.join(out_string.split())
+ return out_string
+
+
+special_char_pattern = re.compile("[^\w\s,\.;-\?\!']")
+
+class GeminiScorer(highlight.FragmentScorer):
+ def __call__(self, f):
+ # Add up the boosts for the matched terms in this passage
+ score = sum(t.boost for t in f.matches)
+
+ # Favor diversity: multiply score by the number of separate
+ # terms matched
+ score *= (len(f.matched_terms) * 100) or 1
+
+ # lower the score substantially for any special characters we find,
+ # where special characters are non-word characters that also are not
+ # typically found in textual content. This should penalize things like
+ # ascii art, as well as source code (which, I suppose will make snippets
+ # lower quality for actual searches for source code, but that is a very
+ # small minority of searches in the current state of things).
+ num_special_chars = len(special_char_pattern.findall(f.text[f.startchar:f.endchar]))
+ score -= 10 * num_special_chars
+
+ return max(1, score)
diff --git a/gus/serve.py b/gus/serve.py
@@ -12,14 +12,22 @@ import threading
import jetforce
from jetforce import Response, Status
+from whoosh import highlight
from whoosh.index import open_dir
from whoosh.qparser import MultifieldParser
-from gus.lib.index_statistics import compute_index_statistics, load_last_statistics_from_file
from gus.crawl import run_crawl
+from gus.lib.index_statistics import compute_index_statistics, load_last_statistics_from_file
+from gus.lib.whoosh_extensions import GeminiFormatter, GeminiScorer
INDEX_DIR = "index"
app = jetforce.JetforceApplication()
+gemini_highlighter = highlight.Highlighter(
+ formatter=GeminiFormatter(),
+ fragmenter=highlight.ContextFragmenter(maxchars=120, surround=60),
+ scorer=GeminiScorer(),
+ order=highlight.SCORE,
+)
def load_and_compute_statistics(filename):
statistics = load_last_statistics_from_file(filename)
@@ -99,6 +107,14 @@ def _render_news():
]
news_items = [
{
+ "date": "2020-05-21",
+ "content": "Added contextual highlights from result pages' content directly to GUS search results pages. This should give a nice preview of what to expect if you click through to the result!",
+ },
+ {
+ "date": "2020-05-19",
+ "content": "Added instantaneous indexing of seed requests! Submit a seed request and your site should begin showing up in closer to a few minutes than a few days, as it tended to before.",
+ },
+ {
"date": "2020-05-16",
"content": "Added verbose search result mode. This is meant as an aid to content creators in figuring out exactly when GUS crawled specific pages, and how those pages' content scores against other results in Geminispace. Documentation for this feature is available under the advanced searching section of the about page.",
},
@@ -227,7 +243,8 @@ def _search_index(query, requested_page):
"indexed_at" : result["indexed_at"],
"url" : result["url"],
"content_type" : result["content_type"],
- "prompt" : result["prompt"] if "prompt" in result else ""
+ "prompt" : result["prompt"] if "prompt" in result else "",
+ "highlights" : gemini_highlighter.highlight_hit(result, "content", top=1) if "content" in result else "",
} for result in results
]
)
@@ -264,11 +281,16 @@ def _render_results(results, verbose=False):
prompt_suffix = ""
if result["content_type"] == "input":
prompt_suffix = ": {}".format(result["prompt"])
- data.append("=> {}".format(result["url"]))
+ # NB: the `[9:]` indexing of the URL in the next line is just a
+ # performant way of stripping off the "gemini://" prefix at the
+ # beginning of each URL.
+ link_text = "{} ({}{})".format(result["url"][9:], result["content_type"], prompt_suffix)
+ data.append("=> {} {}".format(result["url"], link_text))
if verbose:
- data.append("{}{} | Score: {:>4.2f} | {:%Y-%m-%d %H:%M}".format(result["content_type"], prompt_suffix, result["score"], result["indexed_at"]))
- else:
- data.append("{}{}".format(result["content_type"], prompt_suffix))
+ data.append("* Score : {:.2f}".format(result["score"]))
+ data.append("* Indexed at : {:%Y-%m-%d %H:%M}".format(result["indexed_at"]))
+ if len(result["highlights"]) > 0:
+ data.extend(result["highlights"].split(GeminiFormatter.between))
return data