geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit cf846b8bb6f69b1bcf7c6c65e09250ca4675d51d
parent b833d19086bda01fa669e8a771c3d5ebc35daefe
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Thu, 11 Jun 2020 06:38:56 -0400

[crawl] Start indexing response sizes

Diffstat:
M gus/crawl.py | 12 +++++++++++-
M poetry.lock | 2 +-
2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/gus/crawl.py b/gus/crawl.py
@@ -10,7 +10,7 @@
 from urllib.parse import urljoin, uses_relative, uses_netloc
 import gusmobile as gemini
 from whoosh.analysis import FancyAnalyzer
-from whoosh.fields import Schema, TEXT, DATETIME, STORED, ID
+from whoosh.fields import Schema, TEXT, DATETIME, STORED, ID, NUMERIC
 from whoosh.filedb.filestore import FileStorage
 from whoosh.index import create_in, open_dir
 from whoosh.query import Every
@@ -178,6 +178,13 @@ def create_index(index_dir):
             analyzer=FancyAnalyzer(),
             stored=True,
         ),
+        size=NUMERIC(
+            int,
+            # this means GUS will have problems indexing responses over ~2GB
+            32,
+            signed=False,
+            stored=True,
+        ),
         indexed_at=DATETIME(
             stored=True
         ),
@@ -194,6 +201,7 @@ def index_binary(resource, response):
         domain=resource.normalized_host,
         content_type=response.content_type,
         charset=response.charset or "none",
+        size=response.num_bytes,
         indexed_at=datetime.utcnow(),
     )
 
@@ -206,6 +214,7 @@ def index_prompt(resource, response):
         domain=resource.normalized_host,
         content_type="input",
         charset=response.charset or "none",
+        size=response.num_bytes,
         prompt=response.prompt,
         indexed_at=datetime.utcnow(),
     )
@@ -220,6 +229,7 @@ def index_content(resource, response):
         "content_type": response.content_type,
         "charset": response.charset or "none",
         "content": response.content,
+        "size": response.num_bytes,
         "indexed_at": datetime.utcnow(),
     }
     if response.content_type == "text/gemini":
diff --git a/poetry.lock b/poetry.lock
@@ -100,7 +100,7 @@ python-versions = "*"
 version = "0.1.0"
 
 [package.source]
-reference = "2530369d280f883c4820a2c3bb7fd61785bc4e7d"
+reference = "47e40dcabd58ec0bf6347b1285d0a846af86f3aa"
 type = "git"
 url = "https://git.sr.ht/~natpen/gusmobile"
 [[package]]