geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit e24313a14c4ad496f418f47c1cd6162373234ef3
parent c2391751e9d842431c403c8b6221827dde4e6a99
Author: René Wagner <rwa@clttr.info>
Date:   Sun, 14 Aug 2022 17:35:35 +0200

fix test and add additional test for special robots.txt

Diffstat:
M gus/excludes.py              |  2 +-
M tests/gus/lib/test_gemini.py | 15 +++++++++++++++
M tests/gus/test_crawl.py      |  2 --
3 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/gus/excludes.py b/gus/excludes.py
@@ -10,7 +10,7 @@ EXCLUDED_URL_PREFIXES = [
     # all combinations of a tictactoe board
     "gemini://tictactoe.lanterne.chilliet.eu",
-    "gemini://gemi.dev/cgi-bin/waffle.cgi",
+    "gemini://gemi.dev/cgi-bin/",
     "gemini://auragem.space/texts/jewish",
     "gemini://auragem.space/twitch/",
     # serving big files and slooow capsule -> takes to long to crawl
diff --git a/tests/gus/lib/test_gemini.py b/tests/gus/lib/test_gemini.py
@@ -128,3 +128,18 @@ Disallow: /
 User-agent: testbot
 Allow: /""")
         self._assert_fetchable(rp)
+
+    def test_disallow_gemidev_waffle(self):
+        rp = self._get_parser("""user-agent: *
+Disallow: /cgi-bin/wp.cgi/view
+Disallow: /cgi-bin/wp.cgi/media
+Disallow: /cgi-bin/wp.cgi/search
+
+Disallow: /cgi-bin/waffle.cgi/article
+Disallow: /cgi-bin/waffle.cgi/feed
+Disallow: /cgi-bin/waffle.cgi/links
+Disallow: /cgi-bin/waffle.cgi/view
+
+Disallow: /cgi-bin/witw.cgi/play
+""")
+        self._assert_fetchable(rp, "/cgi-bin/waffle.cgi/feed/link", False)
diff --git a/tests/gus/test_crawl.py b/tests/gus/test_crawl.py
@@ -31,8 +31,6 @@ class TestUrlExclusion:
     @pytest.mark.parametrize("test_url,expected_result", [
         ("gemini://localhost", True),
         ("gemini://example.org", True),
-        ("gus.guru", False),
-        ("gus.guru/search?turkey", True),
     ])
     def test_excluded_url_prefixes(self, test_url, expected_result):
         resource = GeminiResource(test_url)