commit e103214e870beefeb2e219c0104a685e82945207
parent 8feea97f7c065bb9a6b77828cc1cd8e7bc4cf84d
Author: René Wagner <rwa@clttr.info>
Date: Tue, 22 Aug 2023 10:14:39 +0200
new excludes and typo fix in robots.txt usage doc
Diffstat:
3 files changed, 45 insertions(+), 12 deletions(-)
diff --git a/gus/excludes.py b/gus/excludes.py
@@ -135,6 +135,7 @@ EXCLUDED_URL_PREFIXES = [
"gemini://news.tuxmachines.org/",
"gemini://musicdir.zachdecook.com/",
"gemini://federal.cx/news",
+ "gemini://kypan.me/cgi",
# wikipedia proxy
"gemini://wp.pitr.ca/",
@@ -197,8 +198,10 @@ EXCLUDED_URL_PREFIXES = [
# git repos
"gemini://git.skyjake.fi",
"gemini://gemini.unlimited.pizza/git",
- # chess games or stuff
+ # games
"gemini://jsreed5.org/live/",
+ "gemini://gemini.thegonz.net/ski",
+ "gemini://gemini.thegonz.net/gemski"
]
EXCLUDED_URL_PATHS = [
diff --git a/serve/templates/documentation/indexing.gmi b/serve/templates/documentation/indexing.gmi
@@ -33,6 +33,8 @@ To control crawling of your site, you can use a robots.txt file. Place it in you
When interpreting a robots.txt, geminispace.info will use the first line that matches the URI to be visited.
Be sure to order your rules accordingly if you want to use exhaustive rules with wildcards or the "Allow" rule, which is not specified in the companion spec.
+Your robots.txt should not include blank lines within a ruleset. Rules after a blank line will be ignored until the next ruleset is started with a "User-agent:" line.
+
geminispace.info obeys the following user-agents, listed in descending priority:
* gus
* indexer
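
For illustration, a hypothetical robots.txt following the rules documented above (an example sketch, not part of this commit): the narrower "Allow" line must precede the broader "Disallow" because the first matching line wins, and a blank line terminates the current ruleset.

```
# Hypothetical example for geminispace.info's crawler.
# First match wins: the narrow Allow comes before the broad Disallow,
# so URIs under /public/ stay fetchable.
User-agent: gus
Allow: /public/
Disallow: /

# The blank line above ended the previous ruleset; a new one must
# begin with its own "User-agent:" line.
User-agent: *
Disallow: /cgi/
```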
diff --git a/tests/gus/lib/test_gemini.py b/tests/gus/lib/test_gemini.py
@@ -90,7 +90,7 @@ class TestGeminiRobotFileParser:
return rp
def _assert_fetchable(self, rp, url="/", fetchable=True):
- useragents = ["testbot", "genericbot", "*"]
+ useragents = ["gus", "indexer", "*"]
assert rp.can_fetch_prioritized(useragents, url) == fetchable
def test_empty_robots(self):
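
For context, _assert_fetchable drives rp.can_fetch_prioritized(useragents, url), where the user agents are listed in descending priority. A minimal Python sketch of the semantics these tests exercise (an illustration only, not the actual GUS parser; the rulesets mapping and its shape are assumptions):

```python
def can_fetch_prioritized(rulesets, useragents, url):
    """Sketch: return True if `url` may be fetched, consulting the
    user agents in `useragents` in descending priority.

    rulesets: dict mapping user-agent -> list of (directive, path)
    pairs, e.g. {"indexer": [("Disallow", "/")]}. Hypothetical shape,
    not the project's real data structure.
    """
    for agent in useragents:
        rules = rulesets.get(agent)
        if not rules:
            continue  # no explicit ruleset for this agent: fall through
        for directive, path in rules:
            if url.startswith(path):  # first matching line wins
                return directive == "Allow"
        return True  # agent has a ruleset, but nothing matched
    return True  # no ruleset applied at all: fetchable by default


# Mirrors test_allow_all_but_disallow_indexer below: an explicit
# Disallow for "indexer" overrides a blanket Allow for "*".
rules = {"*": [("Allow", "/")], "indexer": [("Disallow", "/")]}
assert can_fetch_prioritized(rules, ["gus", "indexer", "*"], "/") is False
```

The first user agent with an explicit ruleset decides; agents without rules fall through to the next, with "*" as the final fallback. That is why, in the tests below, disallowing "indexer" makes a URL unfetchable even when "*" allows everything.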
@@ -102,19 +102,19 @@ class TestGeminiRobotFileParser:
Disallow: /""")
self._assert_fetchable(rp, "/", False)
- def test_allow_genericbot(self):
+ def test_allow_indexer(self):
rp = self._get_parser("""User-agent: *
Disallow: /
-User-agent: genericbot
+User-agent: indexer
Allow: /""")
- self._assert_fetchable(rp)
+ self._assert_fetchable(rp, "/test", True)
- def test_allow_genericbot_but_disallow_testbot(self):
- rp = self._get_parser("""User-agent: genericbot
+ def test_allow_all_but_disallow_indexer(self):
+ rp = self._get_parser("""User-agent: *
Allow: /
-User-agent: testbot
+User-agent: indexer
Disallow: /""")
self._assert_fetchable(rp, "/", False)
@@ -122,18 +122,18 @@ Disallow: /""")
rp = self._get_parser("""User-agent: *
Allow: /
-User-agent: genericbot
+User-agent: indexer
Disallow: /""")
self._assert_fetchable(rp, "/", False)
- def test_allow_only_testbot(self):
+ def test_allow_only_gus(self):
rp = self._get_parser("""User-agent: *
Disallow: /
User-agent: genericbot
Disallow: /
-User-agent: testbot
+User-agent: gus
Allow: /""")
self._assert_fetchable(rp)
@@ -162,6 +162,34 @@ Disallow: /voteru/
# Robots are forbidden to enter the infinite maze
Disallow: /maze
Disallow: /maze/
-
""")
self._assert_fetchable(rp, "/maze/l/", False)
+
+ def test_disallow_gemski_git(self):
+ rp = self._get_parser("""# disallowing because kineto at least doesn't have its own robots.txt to
+# prevent web crawling by proxy
+User-agent: webproxy
+Disallow: /
+
+User-agent: archiver
+User-agent: indexer
+User-agent: researcher
+Disallow: /gredig/
+# I'd like to just do this, but it seems at least some crawlers don't match by
+# prefix.
+#Disallow: /gemski/play
+Disallow: /ski/
+Disallow: /gemski/
+# This doesn't exist, but GUS seems to get confused with the tuner server
+# running on another port.
+Disallow: /stations/
+
+""")
+ self._assert_fetchable(rp, "/ski/sds", False)
+
+ def test_disallow_unlimitedpizza_git(self):
+ rp = self._get_parser("""User-agent: *
+Allow: /git
+Disallow: /git/dotfiles/
+""")
+ self._assert_fetchable(rp, "/git/dotfiles/tree/0b0de929fa98457d22cbbcee65013ec261b660e2/atom/packages/ex-mode/node_modules/space-pen/node_modules/grim/node_modules/emissary/node_modules/es6-weak-map/node_modules/es5-ext/math/atanh/implement.js", False)