# test_gemini.py (8164B)
"""Tests for ``gus.lib.gemini``: ``GeminiResource`` and ``GeminiRobotFileParser``."""
import pytest

from gus.lib.gemini import GeminiResource, GeminiRobotFileParser


class TestGeminiResource:
    """Exercises link extraction, URL parsing and root detection of GeminiResource."""

    def test_extract_contained_resources(self):
        """Expect one resource per ``=>`` line that starts a line outside a fence.

        Lines with leading whitespace before ``=>`` and lines inside a
        preformatted block are expected to produce no resource.
        """
        url = "gemini://host"

        # no content
        resources = GeminiResource(url).extract_contained_resources("")
        assert resources == []

        # not a link
        resources = GeminiResource(url).extract_contained_resources(" => link")
        assert resources == []
        resources = GeminiResource(url).extract_contained_resources(
            "```\n=> preformatted\n```"
        )
        assert resources == []

        # some links
        resources = GeminiResource(url).extract_contained_resources(
            "=> link\ntext\n=> other"
        )
        assert len(resources) == 2
        assert resources[0].raw_url == "link"
        assert resources[1].raw_url == "other"

        # The document below must stay at column 0: as asserted above, a
        # "=>" preceded by whitespace is not treated as a link.
        resources = GeminiResource(url).extract_contained_resources(
            """
# title
text
=> link
text
``` preformatted
=> no link
```
=> other
"""
        )
        assert len(resources) == 2
        assert resources[0].raw_url == "link"
        assert resources[1].raw_url == "other"

    # Each case is:
    #   ([positional GeminiResource constructor arguments],
    #    [is_valid, normalized_host, fetchable_url, normalized_host_like])
    @pytest.mark.parametrize("test_input,expected_result", [
        (["gemini://gus.guru", None, None], [True, "gus.guru", "gemini://gus.guru/", "gus.guru"]),
        (["gemini://gus.guru/search?text", None, None], [True, "gus.guru", "gemini://gus.guru/search?text", "gus.guru"]),
        (["/bar", "gemini://gus.guru/foo", None], [False, None, None, None]),
        (["/bar", "gemini://gus.guru/foo/", None], [False, None, None, None]),
        (["/bar", "gemini://gus.guru/foo", "gus.guru"], [True, "gus.guru", "gemini://gus.guru/bar", "gus.guru"]),
        (["/bar", "gemini://gus.guru/foo/", "gus.guru"], [True, "gus.guru", "gemini://gus.guru/bar", "gus.guru"]),
        (["/bar?test", "gemini://gus.guru/foo/", "gus.guru"], [True, "gus.guru", "gemini://gus.guru/bar?test", "gus.guru"]),
        (["bar", "gemini://gus.guru/foo", "gus.guru"], [True, "gus.guru", "gemini://gus.guru/bar", "gus.guru"]),
        (["bar/", "gemini://gus.guru/foo/", "gus.guru"], [True, "gus.guru", "gemini://gus.guru/foo/bar/", "gus.guru"]),
        (["//foo.com", None, None], [True, "foo.com", "gemini://foo.com/", "foo.com"]),
        (["gemini://gem.Splatt9990.com/index.gmi", None, None], [True, "gem.splatt9990.com", "gemini://gem.splatt9990.com/index.gmi", "gem.splatt9990.com"]),
        (["gemini://gem.Splatt9990.com:1965/index.gmi", None, None], [True, "gem.splatt9990.com", "gemini://gem.splatt9990.com/index.gmi", "gem.splatt9990.com"]),
        (["gemini://gem.splatt9990.com:1966/index.gmi", None, None], [True, "gem.splatt9990.com", "gemini://gem.splatt9990.com:1966/index.gmi", "gem.splatt9990.com"]),
        (["gemini://MichaelNordmeyer.com", None, None], [True, "michaelnordmeyer.com", "gemini://michaelnordmeyer.com/", "michaelnordmeyer.com"]),
        (["log.gmi", "gemini://MichaelNordmeyer.com:1965/", None], [True, "michaelnordmeyer.com", "gemini://michaelnordmeyer.com/log.gmi", "michaelnordmeyer.com"]),
        # This case used to be listed twice verbatim; one copy is enough.
        (["Log.gmi", "gemini://MichaelNordmeyer.com/", None], [True, "michaelnordmeyer.com", "gemini://michaelnordmeyer.com/Log.gmi", "michaelnordmeyer.com"]),
        (["gemini://tilde.pink/~emily/log/productivity.gmi", None, None], [True, "tilde.pink", "gemini://tilde.pink/~emily/log/productivity.gmi", "tilde.pink/~emily"]),
        (["gemini://tilde.pink/users/emily/index.gmi", None, None], [True, "tilde.pink", "gemini://tilde.pink/users/emily/index.gmi", "tilde.pink/users/emily"]),
    ])
    def test_url_parsing(self, test_input, expected_result):
        """Build a resource from ``test_input`` and compare its parsed attributes."""
        is_valid, normalized_host, fetchable_url, normalized_host_like = expected_result
        gr = GeminiResource(*test_input)
        assert gr.is_valid == is_valid
        assert gr.normalized_host == normalized_host
        assert gr.fetchable_url == fetchable_url
        assert gr.normalized_host_like == normalized_host_like

    @pytest.mark.parametrize("test_url,expected_result", [
        ("gemini://gus.guru", True),
        ("gemini://gus.guru/", True),
        ("gemini://gus.guru/franz", False),
        ("gemini://gus.guru/~franz", True),
        ("gemini://gus.guru/~franz/foo", False),
    ])
    def test_is_root_like(self, test_url, expected_result):
        """Check ``is_root_like`` for host roots, ``~user`` roots and deeper paths."""
        gr = GeminiResource(test_url)
        assert gr.is_root_like == expected_result


class TestGeminiRobotFileParser:
    """Feeds robots.txt snippets to GeminiRobotFileParser and checks the verdicts."""

    def _get_parser(self, content):
        """Return a parser for a dummy robots.txt URL loaded with ``content``."""
        dummy_url = "gemini://dummy/robots.txt"
        rp = GeminiRobotFileParser(dummy_url)
        rp.read_from_string(content)
        return rp

    def _assert_fetchable(self, rp, url="/", fetchable=True):
        """Assert the verdict of ``rp.can_fetch_prioritized`` for ``url``.

        The user-agent list passed to the parser is always
        ``["gus", "indexer", "*"]``.
        """
        useragents = ["gus", "indexer", "*"]
        assert rp.can_fetch_prioritized(useragents, url) == fetchable

    def test_empty_robots(self):
        """An empty robots.txt is expected to allow fetching ``/``."""
        rp = self._get_parser("")
        self._assert_fetchable(rp)

    def test_disallow_star(self):
        """A blanket ``Disallow: /`` for ``*`` is expected to block ``/``."""
        rp = self._get_parser("""User-agent: *
Disallow: /""")
        self._assert_fetchable(rp, "/", False)

    def test_allow_indexer(self):
        """An ``indexer`` Allow record is expected to win over a ``*`` Disallow."""
        rp = self._get_parser("""User-agent: *
Disallow: /

User-agent: indexer
Allow: /""")
        self._assert_fetchable(rp, "/test", True)

    def test_allow_all_but_disallow_indexer(self):
        """An ``indexer`` Disallow record is expected to win over a ``*`` Allow."""
        rp = self._get_parser("""User-agent: *
Allow: /

User-agent: indexer
Disallow: /""")
        self._assert_fetchable(rp, "/", False)

    def test_allow_star_but_disallow_genericbot(self):
        """Same fixture and expectation as ``test_allow_all_but_disallow_indexer``."""
        # NOTE(review): despite the test name, this robots.txt never mentions
        # "genericbot"; it is identical to the fixture of the previous test.
        # Presumably the second record was meant to target another agent -
        # confirm the intent before changing the fixture or the expectation.
        rp = self._get_parser("""User-agent: *
Allow: /

User-agent: indexer
Disallow: /""")
        self._assert_fetchable(rp, "/", False)

    def test_allow_only_gus(self):
        """A ``gus`` Allow record is expected to win over the other Disallows."""
        rp = self._get_parser("""User-agent: *
Disallow: /

User-agent: genericbot
Disallow: /

User-agent: gus
Allow: /""")
        self._assert_fetchable(rp)

    def test_disallow_gemidev_waffle(self):
        """A path below a disallowed prefix is expected to be blocked."""
        rp = self._get_parser("""user-agent: *
Disallow: /cgi-bin/wp.cgi/view
Disallow: /cgi-bin/wp.cgi/media
Disallow: /cgi-bin/wp.cgi/search
Disallow: /cgi-bin/waffle.cgi/article
Disallow: /cgi-bin/waffle.cgi/feed
Disallow: /cgi-bin/waffle.cgi/links
Disallow: /cgi-bin/waffle.cgi/view
Disallow: /cgi-bin/witw.cgi/play
""")
        self._assert_fetchable(rp, "/cgi-bin/waffle.cgi/feed/link", False)

    def test_disallow_infinite_maze(self):
        """Comment lines between rules must not stop ``/maze/`` from being blocked."""
        rp = self._get_parser("""User-agent: *
# We don't accept automated donations
Disallow: /donate
# Robots are not allowed to vote
Disallow: /vote
Disallow: /vote/
Disallow: /voteru
Disallow: /voteru/
# Robots are forbidden to enter the infinite maze
Disallow: /maze
Disallow: /maze/
""")
        self._assert_fetchable(rp, "/maze/l/", False)

    def test_disallow_gemski_git(self):
        """A record shared by several user agents is expected to block ``/ski/``."""
        rp = self._get_parser("""# disallowing because kineto at least doesn't have its own robots.txt to
# prevent web crawling by proxy
User-agent: webproxy
Disallow: /

User-agent: archiver
User-agent: indexer
User-agent: researcher
Disallow: /gredig/
# I'd like to just do this, but it seems at least some crawlers don't match by
# prefix.
#Disallow: /gemski/play
Disallow: /ski/
Disallow: /gemski/
# This doesn't exist, but GUS seems to get confused with the tuner server
# running on another port.
Disallow: /stations/

""")
        self._assert_fetchable(rp, "/ski/sds", False)

    def test_disallow_unlimitedpizza_git(self):
        """A Disallow under an allowed prefix is expected to block a deep path."""
        rp = self._get_parser("""User-agent: *
Allow: /git
Disallow: /git/dotfiles/
""")
        self._assert_fetchable(rp, "/git/dotfiles/tree/0b0de929fa98457d22cbbcee65013ec261b660e2/atom/packages/ex-mode/node_modules/space-pen/node_modules/grim/node_modules/emissary/node_modules/es6-weak-map/node_modules/es5-ext/math/atanh/implement.js", False)

    def test_disallow_libraryinured(self):
        """Absolute ``gemini://`` URLs are checked against the path rules too."""
        rp = self._get_parser("""
User-agent: indexer
Disallow: /document/
Disallow: /search/

""")
        self._assert_fetchable(rp, "gemini://library.inu.red/document/saul-newman-war-on-the-state-stirner-and-deleuze-s-anarchism", False)
        self._assert_fetchable(rp, "gemini://library.inu.red", True)