commit cf846b8bb6f69b1bcf7c6c65e09250ca4675d51d
parent b833d19086bda01fa669e8a771c3d5ebc35daefe
Author: Natalie Pendragon <natpen@natpen.net>
Date: Thu, 11 Jun 2020 06:38:56 -0400
[crawl] Start indexing response sizes
Diffstat:
2 files changed, 12 insertions(+), 2 deletions(-)
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -10,7 +10,7 @@ from urllib.parse import urljoin, uses_relative, uses_netloc
import gusmobile as gemini
from whoosh.analysis import FancyAnalyzer
-from whoosh.fields import Schema, TEXT, DATETIME, STORED, ID
+from whoosh.fields import Schema, TEXT, DATETIME, STORED, ID, NUMERIC
from whoosh.filedb.filestore import FileStorage
from whoosh.index import create_in, open_dir
from whoosh.query import Every
@@ -178,6 +178,13 @@ def create_index(index_dir):
analyzer=FancyAnalyzer(),
stored=True,
),
+ size=NUMERIC(
+ int,
+ # unsigned 32-bit field: GUS will have problems indexing responses over ~4GB (2**32 - 1 bytes)
+ 32,
+ signed=False,
+ stored=True,
+ ),
indexed_at=DATETIME(
stored=True
),
@@ -194,6 +201,7 @@ def index_binary(resource, response):
domain=resource.normalized_host,
content_type=response.content_type,
charset=response.charset or "none",
+ size=response.num_bytes,
indexed_at=datetime.utcnow(),
)
@@ -206,6 +214,7 @@ def index_prompt(resource, response):
domain=resource.normalized_host,
content_type="input",
charset=response.charset or "none",
+ size=response.num_bytes,
prompt=response.prompt,
indexed_at=datetime.utcnow(),
)
@@ -220,6 +229,7 @@ def index_content(resource, response):
"content_type": response.content_type,
"charset": response.charset or "none",
"content": response.content,
+ "size": response.num_bytes,
"indexed_at": datetime.utcnow(),
}
if response.content_type == "text/gemini":
diff --git a/poetry.lock b/poetry.lock
@@ -100,7 +100,7 @@ python-versions = "*"
version = "0.1.0"
[package.source]
-reference = "2530369d280f883c4820a2c3bb7fd61785bc4e7d"
+reference = "47e40dcabd58ec0bf6347b1285d0a846af86f3aa"
type = "git"
url = "https://git.sr.ht/~natpen/gusmobile"
[[package]]