commit 02d9691d378fbfde7c672419046fc7cbb40c3a1a
parent 7098386ec92e0edeb954782b402b9a8341678fae
Author: Natalie Pendragon <natpen@natpen.net>
Date: Wed, 4 Mar 2020 08:08:03 -0500
Update indexing and query parsing
Diffstat:
3 files changed, 36 insertions(+), 6 deletions(-)
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -5,9 +5,12 @@ from urllib import robotparser
from urllib.parse import urlparse, urlunparse
import gusmobile as gemini
+from whoosh.analysis import FancyAnalyzer
from whoosh.fields import Schema, TEXT
from whoosh.index import create_in
+from gus.whoosh_extensions import UrlAnalyzer
+
INDEX_DIR = "index"
SEED_URLS = [
@@ -33,9 +36,18 @@ def create_index(index_dir):
shutil.rmtree(index_dir)
pathlib.Path(index_dir).mkdir(parents=True, exist_ok=True)
schema = Schema(
- url=TEXT(stored=True),
- content_type=TEXT(stored=True),
- content=TEXT(stored=True),
+ url=TEXT(
+ field_boost=2.0,
+ stored=True,
+ analyzer=UrlAnalyzer(),
+ ),
+ content_type=TEXT(
+ stored=True,
+ ),
+ content=TEXT(
+ analyzer=FancyAnalyzer(),
+ spelling=True,
+ ),
)
index = create_in("index", schema)
index_writer = index.writer()
@@ -78,7 +90,7 @@ def index_content(response):
index_writer.add_document(
url=response.url,
content_type=response.content_type,
- content=response.url + " " + response.content,
+ content=response.content,
)
diff --git a/gus/serve.py b/gus/serve.py
@@ -8,7 +8,7 @@ import sys
import jetforce
from jetforce import Response, Status
from whoosh.index import open_dir
-from whoosh.qparser import QueryParser
+from whoosh.qparser import MultifieldParser
app = jetforce.JetforceApplication()
@@ -65,7 +65,7 @@ def index(request):
def _search_index(query):
- query = QueryParser("content", ix.schema).parse(query)
+ query = MultifieldParser(["content", "url"], ix.schema).parse(query)
results = searcher.search(query)
return (
len(results),
diff --git a/gus/whoosh_extensions.py b/gus/whoosh_extensions.py
@@ -0,0 +1,18 @@
+from urllib.parse import urlparse
+
+from whoosh.analysis import IntraWordFilter, LowercaseFilter, RegexTokenizer, StemFilter
+
+
+def UrlAnalyzer():
+    """Composes a RegexTokenizer with IntraWordFilter, LowercaseFilter, and StemFilter.
+
+    >>> ana = UrlAnalyzer()
+    >>> [token.text for token in ana("gemini://foo.bar.baz/hum/drum?har=floom")]
+    ['foo', 'bar', 'baz', 'hum', 'drum']
+
+    """
+
+ return RegexTokenizer(expression=":1965|^gemini://|[/\.\?]", gaps=True) | IntraWordFilter() | LowercaseFilter() | StemFilter()
+
+# ana = UrlAnalyzer()
+# print([token.text for token in ana("gemini://gemini.circumlunar.fundamentally:1965/blog/november2?har=floom")])