[crawl] Add indexed_at field - geminispace.info

commit cc7082f08d546dc4542263ce0068dec7b041f5a6
parent 586127b04bf8d0b70d8714f21d6be8a571ec01f4
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Sun, 10 May 2020 10:38:47 -0400

[crawl] Add indexed_at field

Diffstat:
M README.md  | 1 -
M gus/crawl.py  | 8 +++++++-

2 files changed, 7 insertions(+), 2 deletions(-)
diff --git a/README.md b/README.md
@@ -49,7 +49,6 @@ as this guide to [mailing list etiquette](https://man.sr.ht/lists.sr.ht/etiquett
   add a TODO to refactor the extract_gemini_links function to
   exclude any links found within such a block.
 - **add user-facing documentation on searching by content type**
-- **track freshness of content**
 - **track aggregate content statistics**: it would be nice to track
   some statistics about Geminispace over time, like perhaps:
   - total number of domains
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -9,7 +9,7 @@ from urllib.parse import unquote, urljoin, urlparse, urlunparse, uses_relative, 
 
 import gusmobile as gemini
 from whoosh.analysis import FancyAnalyzer
-from whoosh.fields import Schema, TEXT
+from whoosh.fields import Schema, TEXT, DATETIME
 from whoosh.filedb.filestore import FileStorage
 from whoosh.index import create_in, open_dir
 from whoosh.query import Every
@@ -67,6 +67,9 @@ def create_index(index_dir):
             analyzer=FancyAnalyzer(),
             stored=True,
         ),
+        indexed_at=DATETIME(
+            stored=True
+        ),
     )
     INDEX_STORAGE.create_index(schema)
 
@@ -118,6 +121,7 @@ def index_binary(response):
         index_writer.add_document(
             url=response.url,
             content_type=response.content_type,
+            indexed_at=datetime.utcnow(),
         )
         index_writer.commit()
     except:
@@ -132,6 +136,7 @@ def index_prompt(response):
             url=response.url,
             content_type="input",
             prompt=response.prompt,
+            indexed_at=datetime.utcnow(),
         )
         index_writer.commit()
     except:
@@ -146,6 +151,7 @@ def index_content(response):
             url=response.url,
             content_type=response.content_type,
             content=response.content,
+            indexed_at=datetime.utcnow(),
         )
         index_writer.commit()
     except:

	geminispace.info gemini search engine
	git clone https://git.clttr.info/geminispace.info.git
	Log (Feed) | Files | Refs (Tags) | README | LICENSE