commit cc7082f08d546dc4542263ce0068dec7b041f5a6
parent 586127b04bf8d0b70d8714f21d6be8a571ec01f4
Author: Natalie Pendragon <natpen@natpen.net>
Date: Sun, 10 May 2020 10:38:47 -0400
[crawl] Add indexed_at field
Diffstat:
2 files changed, 7 insertions(+), 2 deletions(-)
diff --git a/README.md b/README.md
@@ -49,7 +49,6 @@ as this guide to [mailing list etiquette](https://man.sr.ht/lists.sr.ht/etiquett
add a TODO to refactor the extract_gemini_links function to
exclude any links found within such a block.
- **add user-facing documentation on searching by content type**
-- **track freshness of content**
- **track aggregate content statistics**: it would be nice to track
some statistics about Geminispace over time, like perhaps:
- total number of domains
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -9,7 +9,7 @@ from urllib.parse import unquote, urljoin, urlparse, urlunparse, uses_relative,
import gusmobile as gemini
from whoosh.analysis import FancyAnalyzer
-from whoosh.fields import Schema, TEXT
+from whoosh.fields import Schema, TEXT, DATETIME
from whoosh.filedb.filestore import FileStorage
from whoosh.index import create_in, open_dir
from whoosh.query import Every
@@ -67,6 +67,9 @@ def create_index(index_dir):
analyzer=FancyAnalyzer(),
stored=True,
),
+ indexed_at=DATETIME(
+ stored=True
+ ),
)
INDEX_STORAGE.create_index(schema)
@@ -118,6 +121,7 @@ def index_binary(response):
index_writer.add_document(
url=response.url,
content_type=response.content_type,
+ indexed_at=datetime.utcnow(),
)
index_writer.commit()
except:
@@ -132,6 +136,7 @@ def index_prompt(response):
url=response.url,
content_type="input",
prompt=response.prompt,
+ indexed_at=datetime.utcnow(),
)
index_writer.commit()
except:
@@ -146,6 +151,7 @@ def index_content(response):
url=response.url,
content_type=response.content_type,
content=response.content,
+ indexed_at=datetime.utcnow(),
)
index_writer.commit()
except: