geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit cc7082f08d546dc4542263ce0068dec7b041f5a6
parent 586127b04bf8d0b70d8714f21d6be8a571ec01f4
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Sun, 10 May 2020 10:38:47 -0400

[crawl] Add indexed_at field

Diffstat:
M README.md    | 1 -
M gus/crawl.py | 8 +++++++-
2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
@@ -49,7 +49,6 @@ as this guide to [mailing list etiquette](https://man.sr.ht/lists.sr.ht/etiquett
   add a TODO to refactor the extract_gemini_links function to
   exclude any links found within such a block.
 - **add user-facing documentation on searching by content type**
-- **track freshness of content**
 - **track aggregate content statistics**: it would be nice to track
   some statistics about Geminispace over time, like perhaps:
   - total number of domains
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -9,7 +9,7 @@ from urllib.parse import unquote, urljoin, urlparse, urlunparse, uses_relative,
 
 import gusmobile as gemini
 from whoosh.analysis import FancyAnalyzer
-from whoosh.fields import Schema, TEXT
+from whoosh.fields import Schema, TEXT, DATETIME
 from whoosh.filedb.filestore import FileStorage
 from whoosh.index import create_in, open_dir
 from whoosh.query import Every
@@ -67,6 +67,9 @@ def create_index(index_dir):
             analyzer=FancyAnalyzer(),
             stored=True,
         ),
+        indexed_at=DATETIME(
+            stored=True
+        ),
     )
     INDEX_STORAGE.create_index(schema)
 
@@ -118,6 +121,7 @@ def index_binary(response):
         index_writer.add_document(
             url=response.url,
             content_type=response.content_type,
+            indexed_at=datetime.utcnow(),
         )
         index_writer.commit()
     except:
@@ -132,6 +136,7 @@ def index_prompt(response):
             url=response.url,
             content_type="input",
             prompt=response.prompt,
+            indexed_at=datetime.utcnow(),
         )
         index_writer.commit()
     except:
@@ -146,6 +151,7 @@ def index_content(response):
             url=response.url,
             content_type=response.content_type,
             content=response.content,
+            indexed_at=datetime.utcnow(),
         )
         index_writer.commit()
     except: