geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git

commit e103214e870beefeb2e219c0104a685e82945207
parent 8feea97f7c065bb9a6b77828cc1cd8e7bc4cf84d
Author: René Wagner <rwa@clttr.info>
Date:   Tue, 22 Aug 2023 10:14:39 +0200

new exclude and typo-fix on robots.txt usage doc

Diffstat:
M gus/excludes.py                            |  5 ++++-
M serve/templates/documentation/indexing.gmi |  2 ++
M tests/gus/lib/test_gemini.py               | 50 +++++++++++++++++++++++++++++++++++++++-----------
3 files changed, 45 insertions(+), 12 deletions(-)

diff --git a/gus/excludes.py b/gus/excludes.py
@@ -135,6 +135,7 @@ EXCLUDED_URL_PREFIXES = [
     "gemini://news.tuxmachines.org/",
     "gemini://musicdir.zachdecook.com/",
     "gemini://federal.cx/news",
+    "gemini://kypan.me/cgi",
 
     # wikipedia proxy
     "gemini://wp.pitr.ca/",
@@ -197,8 +198,10 @@ EXCLUDED_URL_PREFIXES = [
     # git repos
     "gemini://git.skyjake.fi",
     "gemini://gemini.unlimited.pizza/git",
-    # chess games or stuff
+    # games
     "gemini://jsreed5.org/live/",
+    "gemini://gemini.thegonz.net/ski",
+    "gemini://gemini.thegonz.net/gemski"
 ]
 
 EXCLUDED_URL_PATHS = [
diff --git a/serve/templates/documentation/indexing.gmi b/serve/templates/documentation/indexing.gmi
@@ -33,6 +33,8 @@ To control crawling of your site, you can use a robots.txt file. Place it in you
 
 When interpreting a robots.txt, geminispace.info will use the first line that matches the URI that should be visited. Be sure to sort your rules accordingly if you want use exhaustive rules with wildcards or the "Allow" rule that is not specificed in the companion spec.
 
+Your robots.txt should not include blank lines within a ruleset. Rules after a blank line will be ignored until the next ruleset is started with a "User-agent:" line.
+
 geminispace.info obeys the following user-agents, listed in descending priority:
 * gus
 * indexer
diff --git a/tests/gus/lib/test_gemini.py b/tests/gus/lib/test_gemini.py
@@ -90,7 +90,7 @@ class TestGeminiRobotFileParser:
         return rp
 
     def _assert_fetchable(self, rp, url="/", fetchable=True):
-        useragents = ["testbot", "genericbot", "*"]
+        useragents = ["gus", "indexer", "*"]
         assert rp.can_fetch_prioritized(useragents, url) == fetchable
 
     def test_empty_robots(self):
@@ -102,19 +102,19 @@ Disallow: /""")
         self._assert_fetchable(rp, "/", False)
 
-    def test_allow_genericbot(self):
+    def test_allow_indexer(self):
         rp = self._get_parser("""User-agent: *
 Disallow: /
 
-User-agent: genericbot
+User-agent: indexer
 Allow: /""")
-        self._assert_fetchable(rp)
+        self._assert_fetchable(rp, "/test", True)
 
-    def test_allow_genericbot_but_disallow_testbot(self):
-        rp = self._get_parser("""User-agent: genericbot
+    def test_allow_all_but_disallow_indexer(self):
+        rp = self._get_parser("""User-agent: *
 Allow: /
 
-User-agent: testbot
+User-agent: indexer
 Disallow: /""")
         self._assert_fetchable(rp, "/", False)
 
@@ -122,18 +122,18 @@ Disallow: /""")
         rp = self._get_parser("""User-agent: *
 Allow: /
 
-User-agent: genericbot
+User-agent: indexer
 Disallow: /""")
         self._assert_fetchable(rp, "/", False)
 
-    def test_allow_only_testbot(self):
+    def test_allow_only_gus(self):
         rp = self._get_parser("""User-agent: *
 Disallow: /
 
 User-agent: genericbot
 Disallow: /
 
-User-agent: testbot
+User-agent: gus
 Allow: /""")
         self._assert_fetchable(rp)
 
@@ -162,6 +162,34 @@ Disallow: /voteru/
 # Robots are forbidden to enter the infinite maze
 Disallow: /maze
 Disallow: /maze/
-
 """)
         self._assert_fetchable(rp, "/maze/l/", False)
+
+    def test_disallow_gemski_git(self):
+        rp = self._get_parser("""# disallowing because kineto at least doesn't have its own robots.txt to
+# prevent web crawling by proxy
+User-agent: webproxy
+Disallow: /
+
+User-agent: archiver
+User-agent: indexer
+User-agent: researcher
+Disallow: /gredig/
+# I'd like to just do this, but it seems at least some crawlers don't match by
+# prefix.
+#Disallow: /gemski/play
+Disallow: /ski/
+Disallow: /gemski/
+# This doesn't exist, but GUS seems to get confused with the tuner server
+# running on another port.
+Disallow: /stations/
+
+""")
+        self._assert_fetchable(rp, "/ski/sds", False)
+
+    def test_disallow_unlimitedpizza_git(self):
+        rp = self._get_parser("""User-agent: *
+Allow: /git
+Disallow: /git/dotfiles/
+""")
+        self._assert_fetchable(rp, "/git/dotfiles/tree/0b0de929fa98457d22cbbcee65013ec261b660e2/atom/packages/ex-mode/node_modules/space-pen/node_modules/grim/node_modules/emissary/node_modules/es6-weak-map/node_modules/es5-ext/math/atanh/implement.js", False)
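
For capsule owners who want to sanity-check a robots.txt against the behaviour documented in the indexing.gmi change above, the following is a minimal, self-contained Python sketch. It is not the GeminiRobotFileParser used by geminispace.info (only its tests appear in this diff); it merely approximates the documented rules: user-agents are consulted in descending priority, the first rule whose prefix matches the path wins, and a blank line ends the current ruleset so later rules are ignored until the next "User-agent:" line. All names below are illustrative, and details such as "Allow" handling may differ from the real crawler.

# Illustrative sketch only -- not the project's GeminiRobotFileParser.
# It models the rules described in the documentation change above:
#   * a ruleset starts at "User-agent:" lines and ends at the first blank line,
#   * within a ruleset, the first rule whose prefix matches the path wins,
#   * user-agents are consulted in descending priority, "*" last.

def parse_rulesets(robots_txt):
    """Map each user-agent to its ordered (verb, path_prefix) rules."""
    rulesets = {}
    agents = []          # user-agents of the ruleset currently being built
    seen_rule = False    # True once the current ruleset has at least one rule
    for raw in robots_txt.splitlines():
        if not raw.strip():
            # Blank line: the ruleset is over; rules that follow are ignored
            # until a new "User-agent:" line appears.
            agents, seen_rule = [], False
            continue
        line = raw.split("#", 1)[0].strip()
        if not line:
            continue  # comment-only line
        field, _, value = line.partition(":")
        field, value = field.strip().lower(), value.strip()
        if field == "user-agent":
            if seen_rule:  # a new ruleset begins
                agents, seen_rule = [], False
            agents.append(value)
            rulesets.setdefault(value, [])
        elif field in ("allow", "disallow") and agents:
            seen_rule = True
            for agent in agents:
                rulesets[agent].append((field, value))
    return rulesets


def can_fetch_prioritized(robots_txt, useragents, path):
    """Verdict of the highest-priority user-agent that has any rules."""
    rulesets = parse_rulesets(robots_txt)
    for agent in useragents:
        rules = rulesets.get(agent)
        if not rules:
            continue
        for verb, prefix in rules:
            if prefix and path.startswith(prefix):
                return verb == "allow"   # first matching prefix wins
        return True                      # rules exist, but none match this path
    return True                          # no applicable ruleset at all


if __name__ == "__main__":
    robots = """User-agent: *
Disallow: /

User-agent: indexer
Allow: /"""
    # Mirrors the spirit of test_allow_indexer in the diff above.
    assert can_fetch_prioritized(robots, ["gus", "indexer", "*"], "/test") is True

For the authoritative behaviour, including the priority order gus, indexer and the generic agents, refer to the indexing documentation and the GeminiRobotFileParser tests changed in this commit.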