commit e103214e870beefeb2e219c0104a685e82945207
parent 8feea97f7c065bb9a6b77828cc1cd8e7bc4cf84d
Author: René Wagner <rwa@clttr.info>
Date: Tue, 22 Aug 2023 10:14:39 +0200
new excludes and typo fix in robots.txt usage doc
Diffstat:
3 files changed, 45 insertions(+), 12 deletions(-)
diff --git a/gus/excludes.py b/gus/excludes.py
@@ -135,6 +135,7 @@ EXCLUDED_URL_PREFIXES = [
"gemini://news.tuxmachines.org/",
"gemini://musicdir.zachdecook.com/",
"gemini://federal.cx/news",
+ "gemini://kypan.me/cgi",
# wikipedia proxy
"gemini://wp.pitr.ca/",
@@ -197,8 +198,10 @@ EXCLUDED_URL_PREFIXES = [
# git repos
"gemini://git.skyjake.fi",
"gemini://gemini.unlimited.pizza/git",
- # chess games or stuff
+ # games
"gemini://jsreed5.org/live/",
+ "gemini://gemini.thegonz.net/ski",
+ "gemini://gemini.thegonz.net/gemski"
]
EXCLUDED_URL_PATHS = [
diff --git a/serve/templates/documentation/indexing.gmi b/serve/templates/documentation/indexing.gmi
@@ -33,6 +33,8 @@ To control crawling of your site, you can use a robots.txt file. Place it in you
When interpreting a robots.txt, geminispace.info will use the first line that matches the URI to be visited.
Be sure to order your rules accordingly if you want to use exhaustive rules with wildcards or the "Allow" rule, which is not specified in the companion spec.
+Your robots.txt should not include blank lines within a ruleset. Rules after a blank line will be ignored until the next ruleset is started with a "User-agent:" line.
+
geminispace.info obeys the following user-agents, listed in descending priority:
* gus
* indexer
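
For illustration, a hypothetical robots.txt following the rules documented above (an example sketch, not part of this commit): the narrower "Allow" line must precede the broader "Disallow" because the first matching line wins, and a blank line terminates the current ruleset.

```
# Hypothetical example for geminispace.info's crawler.
# First match wins: the narrow Allow comes before the broad Disallow,
# so URIs under /public/ stay fetchable.
User-agent: gus
Allow: /public/
Disallow: /

# The blank line above ended the previous ruleset; a new one must
# begin with its own "User-agent:" line.
User-agent: *
Disallow: /cgi/
```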
diff --git a/tests/gus/lib/test_gemini.py b/tests/gus/lib/test_gemini.py
@@ -90,7 +90,7 @@ class TestGeminiRobotFileParser:
return rp
def _assert_fetchable(self, rp, url="/", fetchable=True):
- useragents = ["testbot", "genericbot", "*"]
+ useragents = ["gus", "indexer", "*"]
assert rp.can_fetch_prioritized(useragents, url) == fetchable
def test_empty_robots(self):
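
For context, _assert_fetchable drives rp.can_fetch_prioritized(useragents, url), where the user agents are listed in descending priority. A minimal Python sketch of the semantics these tests exercise (an illustration only, not the actual GUS parser; the rulesets mapping and its shape are assumptions):

```python
def can_fetch_prioritized(rulesets, useragents, url):
    """Sketch: return True if `url` may be fetched, consulting the
    user agents in `useragents` in descending priority.

    rulesets: dict mapping user-agent -> list of (directive, path)
    pairs, e.g. {"indexer": [("Disallow", "/")]}. Hypothetical shape,
    not the project's real data structure.
    """
    for agent in useragents:
        rules = rulesets.get(agent)
        if not rules:
            continue  # no explicit ruleset for this agent: fall through
        for directive, path in rules:
            if url.startswith(path):  # first matching line wins
                return directive == "Allow"
        return True  # agent has a ruleset, but nothing matched
    return True  # no ruleset applied at all: fetchable by default


# Mirrors test_allow_all_but_disallow_indexer below: an explicit
# Disallow for "indexer" overrides a blanket Allow for "*".
rules = {"*": [("Allow", "/")], "indexer": [("Disallow", "/")]}
assert can_fetch_prioritized(rules, ["gus", "indexer", "*"], "/") is False
```

The first user agent with an explicit ruleset decides; agents without rules fall through to the next, with "*" as the final fallback. That is why, in the tests below, disallowing "indexer" makes a URL unfetchable even when "*" allows everything.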
@@ -102,19 +102,19 @@ class TestGeminiRobotFileParser:
Disallow: /""")
self._assert_fetchable(rp, "/", False)
- def test_allow_genericbot(self):
+ def test_allow_indexer(self):
rp = self._get_parser("""User-agent: *
Disallow: /
-User-agent: genericbot
+User-agent: indexer
Allow: /""")
- self._assert_fetchable(rp)
+ self._assert_fetchable(rp, "/test", True)
- def test_allow_genericbot_but_disallow_testbot(self):
- rp = self._get_parser("""User-agent: genericbot
+ def test_allow_all_but_disallow_indexer(self):
+ rp = self._get_parser("""User-agent: *
Allow: /
-User-agent: testbot
+User-agent: indexer
Disallow: /""")
self._assert_fetchable(rp, "/", False)
@@ -122,18 +122,18 @@ Disallow: /""")
rp = self._get_parser("""User-agent: *
Allow: /
-User-agent: genericbot
+User-agent: indexer
Disallow: /""")
self._assert_fetchable(rp, "/", False)
- def test_allow_only_testbot(self):
+ def test_allow_only_gus(self):
rp = self._get_parser("""User-agent: *
Disallow: /
User-agent: genericbot
Disallow: /
-User-agent: testbot
+User-agent: gus
Allow: /""")
self._assert_fetchable(rp)
@@ -162,6 +162,34 @@ Disallow: /voteru/
# Robots are forbidden to enter the infinite maze
Disallow: /maze
Disallow: /maze/
-
""")
self._assert_fetchable(rp, "/maze/l/", False)
+
+ def test_disallow_gemski_git(self):
+ rp = self._get_parser("""# disallowing because kineto at least doesn't have its own robots.txt to
+# prevent web crawling by proxy
+User-agent: webproxy
+Disallow: /
+
+User-agent: archiver
+User-agent: indexer
+User-agent: researcher
+Disallow: /gredig/
+# I'd like to just do this, but it seems at least some crawlers don't match by
+# prefix.
+#Disallow: /gemski/play
+Disallow: /ski/
+Disallow: /gemski/
+# This doesn't exist, but GUS seems to get confused with the tuner server
+# running on another port.
+Disallow: /stations/
+
+""")
+ self._assert_fetchable(rp, "/ski/sds", False)
+
+ def test_disallow_unlimitedpizza_git(self):
+ rp = self._get_parser("""User-agent: *
+Allow: /git
+Disallow: /git/dotfiles/
+""")
+ self._assert_fetchable(rp, "/git/dotfiles/tree/0b0de929fa98457d22cbbcee65013ec261b660e2/atom/packages/ex-mode/node_modules/space-pen/node_modules/grim/node_modules/emissary/node_modules/es6-weak-map/node_modules/es5-ext/math/atanh/implement.js", False)