geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit 68f4fec7fc23d7f2d7ea13c4f1d08d41b6d25d11
parent e0f53546a9629c968c696cd3a15e3c88eca67a38
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Tue,  9 Jun 2020 06:55:11 -0400

[crawl] Start indexing lang parameter

Diffstat:
M gus/crawl.py | 24 +++++++++++++++---------
M poetry.lock  |  2 +-
2 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/gus/crawl.py b/gus/crawl.py
@@ -166,6 +166,9 @@ def create_index(index_dir):
         charset=ID(
             stored=True,
         ),
+        lang=ID(
+            stored=True,
+        ),
         content=TEXT(
             analyzer=FancyAnalyzer(),
             spelling=True,
@@ -210,15 +213,18 @@ def index_prompt(resource, response):
 
 def index_content(resource, response):
     print("INDEXING CONTENT...")
-    index_writer.add_document(
-        url=resource.indexable_url,
-        fetchable_url=resource.fetchable_url,
-        domain=resource.normalized_host,
-        content_type=response.content_type,
-        charset=response.charset or "none",
-        content=response.content,
-        indexed_at=datetime.utcnow(),
-    )
+    doc = {
+        "url": resource.indexable_url,
+        "fetchable_url": resource.fetchable_url,
+        "domain": resource.normalized_host,
+        "content_type": response.content_type,
+        "charset": response.charset or "none",
+        "content": response.content,
+        "indexed_at": datetime.utcnow(),
+    }
+    if response.content_type == "text/gemini":
+        doc["lang"] = response.lang or "none",
+    index_writer.add_document(**doc)
 
 
 def get_robots_file(robot_host):
diff --git a/poetry.lock b/poetry.lock
@@ -100,7 +100,7 @@ python-versions = "*"
 version = "0.1.0"
 
 [package.source]
-reference = "c8867e2a90165958ae58e444791c0003329c6501"
+reference = "2530369d280f883c4820a2c3bb7fd61785bc4e7d"
 type = "git"
 url = "https://git.sr.ht/~natpen/gusmobile"
 [[package]]