geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git

commit 02d9691d378fbfde7c672419046fc7cbb40c3a1a
parent 7098386ec92e0edeb954782b402b9a8341678fae
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Wed,  4 Mar 2020 08:08:03 -0500

Update indexing and query parsing

Diffstat:
M gus/crawl.py             | 20 ++++++++++++++++----
M gus/serve.py             |  4 ++--
A gus/whoosh_extensions.py | 18 ++++++++++++++++++
3 files changed, 36 insertions(+), 6 deletions(-)

diff --git a/gus/crawl.py b/gus/crawl.py
@@ -5,9 +5,12 @@ from urllib import robotparser
 from urllib.parse import urlparse, urlunparse
 
 import gusmobile as gemini
+from whoosh.analysis import FancyAnalyzer
 from whoosh.fields import Schema, TEXT
 from whoosh.index import create_in
 
+from gus.whoosh_extensions import UrlAnalyzer
+
 INDEX_DIR = "index"
 
 SEED_URLS = [
@@ -33,9 +36,18 @@ def create_index(index_dir):
     shutil.rmtree(index_dir)
     pathlib.Path(index_dir).mkdir(parents=True, exist_ok=True)
     schema = Schema(
-        url=TEXT(stored=True),
-        content_type=TEXT(stored=True),
-        content=TEXT(stored=True),
+        url=TEXT(
+            field_boost=2.0,
+            stored=True,
+            analyzer=UrlAnalyzer(),
+        ),
+        content_type=TEXT(
+            stored=True,
+        ),
+        content=TEXT(
+            analyzer=FancyAnalyzer(),
+            spelling=True,
+        ),
     )
     index = create_in("index", schema)
     index_writer = index.writer()
@@ -78,7 +90,7 @@ def index_content(response):
     index_writer.add_document(
         url=response.url,
         content_type=response.content_type,
-        content=response.url + " " + response.content,
+        content=response.content,
     )
 
 
diff --git a/gus/serve.py b/gus/serve.py
@@ -8,7 +8,7 @@ import sys
 import jetforce
 from jetforce import Response, Status
 from whoosh.index import open_dir
-from whoosh.qparser import QueryParser
+from whoosh.qparser import MultifieldParser
 
 
 app = jetforce.JetforceApplication()
@@ -65,7 +65,7 @@ def index(request):
 
 
 def _search_index(query):
-    query = QueryParser("content", ix.schema).parse(query)
+    query = MultifieldParser(["content", "url"], ix.schema).parse(query)
     results = searcher.search(query)
     return (
         len(results),
diff --git a/gus/whoosh_extensions.py b/gus/whoosh_extensions.py
@@ -0,0 +1,18 @@
+from urllib.parse import urlparse
+
+from whoosh.analysis import IntraWordFilter, LowercaseFilter, RegexTokenizer, StemFilter
+
+
+def UrlAnalyzer():
+    """Composes a RegexTokenizer with a LowercaseFilter.
+
+    >>> ana = UrlAnalyzer()
+    >>> [token.text for token in ana("gemini://foo.bar.baz/hum/drum?har=floom")]
+    ["foo", "bar", "baz", "hum", "drum"]
+
+    """
+
+    return RegexTokenizer(expression=":1965|^gemini://|[/\.\?]", gaps=True) | IntraWordFilter() | LowercaseFilter() | StemFilter()
+
+# ana = UrlAnalyzer()
+# print([token.text for token in ana("gemini://gemini.circumlunar.fundamentally:1965/blog/november2?har=floom")])
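
A rough sketch of how the reworked indexing and querying fit together. Only the
names taken from the diff above (UrlAnalyzer, MultifieldParser, the "content"
and "url" fields, the "index" directory) are from this commit; the surrounding
script is illustrative and assumes an index has already been built by crawl.py.

    from whoosh.index import open_dir
    from whoosh.qparser import MultifieldParser

    from gus.whoosh_extensions import UrlAnalyzer

    # The URL analyzer drops the gemini:// scheme and the :1965 port, splits
    # the rest on dots, slashes and "?", then lowercases and stems, so hostname
    # and path segments become searchable terms alongside the page content.
    ana = UrlAnalyzer()
    print([t.text for t in ana("gemini://foo.bar.baz/hum/drum")])

    # MultifieldParser searches both fields; the url field's 2.0 boost ranks
    # hits in the URL above hits that only occur in the body text.
    ix = open_dir("index")
    with ix.searcher() as searcher:
        parsed = MultifieldParser(["content", "url"], ix.schema).parse("drum")
        for hit in searcher.search(parsed):
            print(hit["url"])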