geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git

commit cbe22de43a084b7b2c288d5867cf24bbb0ce8414
parent b8eb04a22445fa63d716c1aaf03e0ae4bbf8b636
Author: René Wagner <rwa@clttr.info>
Date:   Tue, 20 Jul 2021 19:14:39 +0200

small fixes and doc adjustments

Diffstat:
M gus/excludes.py | 15 +++++++--------
M serve/models.py | 2 +-
M serve/templates/documentation/indexing.gmi | 5 ++++-
M serve/templates/news.gmi | 3 +++
4 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/gus/excludes.py b/gus/excludes.py
@@ -142,8 +142,11 @@ EXCLUDED_URL_PREFIXES = [
     "gemini://hn.filiuspatris.net/",
     "gemini://schmittstefan.de/de/nachrichten/",
     "gemini://gmi.noulin.net/mobile",
-
-    # wikipedia proxy
+    "gemini://jpfox.fr/rss/",
+    "gemini://illegaldrugs.net/cgi-bin/news.php/",
+    "gemini://dw.schettler.net/",
+
+    # wikipedia proxy
     "gemini://wp.pitr.ca/",
     "gemini://wp.glv.one/",
     "gemini://wikipedia.geminet.org/",
@@ -192,17 +195,13 @@ EXCLUDED_URL_PREFIXES = [
     "gemini://drewdevault.com/cgi-bin/web.sh?",
     "gemini://gemiprox.pollux.casa/",
     "gemini://gemiprox.pollux.casa:1966",
-
+    "gemini://ecs.d2evs.net/proxy/",
     # killing crawl, I think maybe because it's too big
-    "gemini://gem.denarii.cloud/pichaindata.zip",
+    "gemini://gem.denarii.cloud/",
 
     # these threads seem to expire
     "gemini://dioskouroi.xyz/thread",
 
-    # news mirrors, there's just too much
-    "gemini://jpfox.fr/rss/",
-    "gemini://illegaldrugs.net/cgi-bin/news.php/",
-    "gemini://dw.schettler.net/",
 
     # docs - not our business
     "gemini://cfdocs.wetterberg.nu/",
diff --git a/serve/models.py b/serve/models.py
@@ -2,7 +2,7 @@ import re
 from datetime import datetime
 
 from . import constants
-from gus.lib.db_model import init_db, Page, Thread
+from gus.lib.db_model import init_db, Page
 from gus.lib.gemini import GeminiResource
 from gus.lib.index_statistics import (
     compute_index_statistics,
diff --git a/serve/templates/documentation/indexing.gmi b/serve/templates/documentation/indexing.gmi
@@ -20,7 +20,10 @@ Please note that GUS' indexing has provisions for manually excluding content fro
 
 To control crawling of your site, you can use a robots.txt file, Place it in your capsule's root directory such that a request for "robots.txt" will fetch it. It should be returned with a mimetype of `text/plain`.
 
-GUS obeys User-agents of "indexer" and "*".
+geminispace.info obeys the following user-agents, listed in descending priority:
+* gus
+* indexer
+* *
 
 ### How can I recognize GUS requests?
 
diff --git a/serve/templates/news.gmi b/serve/templates/news.gmi
@@ -2,6 +2,9 @@
 
 ## News
 
+### 2021-07-20
+Thanks to the contribution of Hannu Hartikainen geminispace.info now is again able to honor the user-agents "gus", "indexer" and "*" in robots.txt.
+
 ### 2021-07-11
 The revamped data store seems to work fine so far. Unfortunately i had to disable the "newest hosts" and "newest pages" sites as the data is currently not available. I'll add that back again later, but before this i'd like to have the cleanup mechanismn implemented to get rid of old data from capsules that are no longer available.
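
For capsule authors, a minimal robots.txt following the user-agent priority documented in the indexing.gmi change above might look like the sketch below. The Disallow paths and the example.org host are hypothetical placeholders; only the user-agent names "gus", "indexer" and "*" come from this commit, with "gus" being the highest-priority section for this crawler.

```
# Hypothetical robots.txt, served at gemini://example.org/robots.txt
# with mimetype text/plain. geminispace.info checks for a "gus"
# section first, then "indexer", then "*" (descending priority).
User-agent: gus
Disallow: /drafts/

User-agent: indexer
Disallow: /archive/

User-agent: *
Disallow: /cgi-bin/
```

Whether sections combine or only the highest-priority match applies is not spelled out in this commit, so treat the sketch as illustrative only.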