commit 68f4fec7fc23d7f2d7ea13c4f1d08d41b6d25d11
parent e0f53546a9629c968c696cd3a15e3c88eca67a38
Author: Natalie Pendragon <natpen@natpen.net>
Date: Tue, 9 Jun 2020 06:55:11 -0400
[crawl] Start indexing lang parameter
Diffstat:
2 files changed, 16 insertions(+), 10 deletions(-)
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -166,6 +166,9 @@ def create_index(index_dir):
charset=ID(
stored=True,
),
+ lang=ID(
+ stored=True,
+ ),
content=TEXT(
analyzer=FancyAnalyzer(),
spelling=True,
@@ -210,15 +213,18 @@ def index_prompt(resource, response):
def index_content(resource, response):
print("INDEXING CONTENT...")
- index_writer.add_document(
- url=resource.indexable_url,
- fetchable_url=resource.fetchable_url,
- domain=resource.normalized_host,
- content_type=response.content_type,
- charset=response.charset or "none",
- content=response.content,
- indexed_at=datetime.utcnow(),
- )
+ doc = {
+ "url": resource.indexable_url,
+ "fetchable_url": resource.fetchable_url,
+ "domain": resource.normalized_host,
+ "content_type": response.content_type,
+ "charset": response.charset or "none",
+ "content": response.content,
+ "indexed_at": datetime.utcnow(),
+ }
+ if response.content_type == "text/gemini":  # lang is only indexed for text/gemini responses
+ doc["lang"] = response.lang or "none"
+ index_writer.add_document(**doc)
def get_robots_file(robot_host):
diff --git a/poetry.lock b/poetry.lock
@@ -100,7 +100,7 @@ python-versions = "*"
version = "0.1.0"
[package.source]
-reference = "c8867e2a90165958ae58e444791c0003329c6501"
+reference = "2530369d280f883c4820a2c3bb7fd61785bc4e7d"
type = "git"
url = "https://git.sr.ht/~natpen/gusmobile"
[[package]]