geminispace.info

Gemini search engine
git clone https://git.clttr.info/geminispace.info.git

test_gemini.py (8164B)


import pytest

from gus.lib.gemini import GeminiResource, GeminiRobotFileParser

class TestGeminiResource:
    def test_extract_contained_resources(self):
        url = "gemini://host"

        # no content
        resources = GeminiResource(url).extract_contained_resources("")
        assert resources == []

        # not a link
        resources = GeminiResource(url).extract_contained_resources(" => link")
        assert resources == []
        resources = GeminiResource(url).extract_contained_resources(
            "```\n=> preformatted\n```"
        )
        assert resources == []

        # some links
        resources = GeminiResource(url).extract_contained_resources(
            "=> link\ntext\n=> other"
        )
        assert len(resources) == 2
        assert resources[0].raw_url == "link"
        assert resources[1].raw_url == "other"

        resources = GeminiResource(url).extract_contained_resources(
            """
# title
text
=> link
text
``` preformatted
=> no link
```
=> other
            """
        )
        assert len(resources) == 2
        assert resources[0].raw_url == "link"
        assert resources[1].raw_url == "other"

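    # Illustrative note (not part of the original file): the assertions above
    # pin down the gemtext rules extract_contained_resources is expected to
    # follow -- only lines beginning with "=>" in column 0 count as links, and
    # "=>" lines inside ``` preformatted blocks are skipped. A minimal sketch,
    # reusing only behaviour already asserted above:
    #
    #   res = GeminiResource("gemini://host").extract_contained_resources(
    #       "=> one\n```\n=> ignored\n```\n=> two"
    #   )
    #   # expected: two resources, with raw_url "one" and "two"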
    @pytest.mark.parametrize("test_input,expected_result", [
        (["gemini://gus.guru", None, None], [True, "gus.guru", "gemini://gus.guru/", "gus.guru"]),
        (["gemini://gus.guru/search?text", None, None], [True, "gus.guru", "gemini://gus.guru/search?text", "gus.guru"]),
        (["/bar", "gemini://gus.guru/foo", None], [False, None, None, None]),
        (["/bar", "gemini://gus.guru/foo/", None], [False, None, None, None]),
        (["/bar", "gemini://gus.guru/foo", "gus.guru"], [True, "gus.guru", "gemini://gus.guru/bar", "gus.guru"]),
        (["/bar", "gemini://gus.guru/foo/", "gus.guru"], [True, "gus.guru", "gemini://gus.guru/bar", "gus.guru"]),
        (["/bar?test", "gemini://gus.guru/foo/", "gus.guru"], [True, "gus.guru", "gemini://gus.guru/bar?test", "gus.guru"]),
        (["bar", "gemini://gus.guru/foo", "gus.guru"], [True, "gus.guru", "gemini://gus.guru/bar", "gus.guru"]),
        (["bar/", "gemini://gus.guru/foo/", "gus.guru"], [True, "gus.guru", "gemini://gus.guru/foo/bar/", "gus.guru"]),
        (["//foo.com", None, None], [True, "foo.com", "gemini://foo.com/", "foo.com"]),
        (["gemini://gem.Splatt9990.com/index.gmi", None, None], [True, "gem.splatt9990.com", "gemini://gem.splatt9990.com/index.gmi", "gem.splatt9990.com"]),
        (["gemini://gem.Splatt9990.com:1965/index.gmi", None, None], [True, "gem.splatt9990.com", "gemini://gem.splatt9990.com/index.gmi", "gem.splatt9990.com"]),
        (["gemini://gem.splatt9990.com:1966/index.gmi", None, None], [True, "gem.splatt9990.com", "gemini://gem.splatt9990.com:1966/index.gmi", "gem.splatt9990.com"]),
        (["gemini://MichaelNordmeyer.com", None, None], [True, "michaelnordmeyer.com", "gemini://michaelnordmeyer.com/", "michaelnordmeyer.com"]),
        (["log.gmi", "gemini://MichaelNordmeyer.com:1965/", None], [True, "michaelnordmeyer.com", "gemini://michaelnordmeyer.com/log.gmi", "michaelnordmeyer.com"]),
     61         (["Log.gmi", "gemini://MichaelNordmeyer.com/", None], [True, "michaelnordmeyer.com", "gemini://michaelnordmeyer.com/Log.gmi", "michaelnordmeyer.com"]),
     62         (["Log.gmi", "gemini://MichaelNordmeyer.com/", None], [True, "michaelnordmeyer.com", "gemini://michaelnordmeyer.com/Log.gmi", "michaelnordmeyer.com"]),
     63         (["gemini://tilde.pink/~emily/log/productivity.gmi", None, None], [True, "tilde.pink", "gemini://tilde.pink/~emily/log/productivity.gmi", "tilde.pink/~emily"]),
     64         (["gemini://tilde.pink/users/emily/index.gmi", None, None], [True, "tilde.pink", "gemini://tilde.pink/users/emily/index.gmi", "tilde.pink/users/emily"])
     65     ])
     66     def test_url_parsing(self, test_input, expected_result):
     67         gr = GeminiResource(test_input[0], test_input[1], test_input[2])
     68         assert gr.is_valid == expected_result[0]
     69         assert gr.normalized_host == expected_result[1]
     70         assert gr.fetchable_url == expected_result[2]
     71         assert gr.normalized_host_like == expected_result[3]
     72 
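    # Illustrative note (not part of the original file): taken together, the
    # cases above describe the normalization GeminiResource applies -- host
    # names are lowercased, the default port 1965 is dropped from
    # fetchable_url while other ports are kept, relative links are resolved
    # against the parent URL (and are only considered valid when the parent
    # hostname, the third constructor argument, is supplied), and
    # normalized_host_like additionally keeps a "~user" or "/users/<name>"
    # prefix so per-user capsules can be told apart. A minimal sketch based
    # on the cases above:
    #
    #   gr = GeminiResource("gemini://gem.Splatt9990.com:1965/index.gmi")
    #   gr.normalized_host  # "gem.splatt9990.com"
    #   gr.fetchable_url    # "gemini://gem.splatt9990.com/index.gmi"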
    @pytest.mark.parametrize("test_url,expected_result", [
        ("gemini://gus.guru", True),
        ("gemini://gus.guru/", True),
        ("gemini://gus.guru/franz", False),
        ("gemini://gus.guru/~franz", True),
        ("gemini://gus.guru/~franz/foo", False),
    ])
    def test_is_root_like(self, test_url, expected_result):
        gr = GeminiResource(test_url)
        assert gr.is_root_like == expected_result

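    # Illustrative note (not part of the original file): per the cases above,
    # is_root_like appears to be True only for a capsule root ("gemini://host"
    # with or without a trailing "/") and for a user's top-level "~" page
    # ("/~franz"), while other paths -- including deeper "~user" paths -- are
    # not root-like.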

class TestGeminiRobotFileParser:
    def _get_parser(self, content):
        dummy_url = "gemini://dummy/robots.txt"
        rp = GeminiRobotFileParser(dummy_url)
        rp.read_from_string(content)
        return rp

    def _assert_fetchable(self, rp, url="/", fetchable=True):
        useragents = ["gus", "indexer", "*"]
        assert rp.can_fetch_prioritized(useragents, url) == fetchable

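    # Illustrative note (not part of the original file): _assert_fetchable
    # queries can_fetch_prioritized with the user agents ["gus", "indexer",
    # "*"]. The tests below are consistent with the most specific matching
    # record winning: a "User-agent: indexer" record overrides "User-agent: *",
    # and a "User-agent: gus" record overrides both. A sketch mirroring
    # test_allow_indexer below:
    #
    #   rp = GeminiRobotFileParser("gemini://dummy/robots.txt")
    #   rp.read_from_string("User-agent: *\nDisallow: /\n\n"
    #                       "User-agent: indexer\nAllow: /")
    #   rp.can_fetch_prioritized(["gus", "indexer", "*"], "/test")  # True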
    def test_empty_robots(self):
        rp = self._get_parser("")
        self._assert_fetchable(rp)

    def test_disallow_star(self):
        rp = self._get_parser("""User-agent: *
Disallow: /""")
        self._assert_fetchable(rp, "/", False)

    def test_allow_indexer(self):
        rp = self._get_parser("""User-agent: *
Disallow: /

User-agent: indexer
Allow: /""")
        self._assert_fetchable(rp, "/test", True)

    def test_allow_all_but_disallow_indexer(self):
        rp = self._get_parser("""User-agent: *
Allow: /

User-agent: indexer
Disallow: /""")
        self._assert_fetchable(rp, "/", False)

    def test_allow_star_but_disallow_genericbot(self):
        rp = self._get_parser("""User-agent: *
Allow: /

User-agent: indexer
Disallow: /""")
        self._assert_fetchable(rp, "/", False)

    def test_allow_only_gus(self):
        rp = self._get_parser("""User-agent: *
Disallow: /

User-agent: genericbot
Disallow: /

User-agent: gus
Allow: /""")
        self._assert_fetchable(rp)

    def test_disallow_gemidev_waffle(self):
        rp = self._get_parser("""user-agent: *
Disallow: /cgi-bin/wp.cgi/view
Disallow: /cgi-bin/wp.cgi/media
Disallow: /cgi-bin/wp.cgi/search
Disallow: /cgi-bin/waffle.cgi/article
Disallow: /cgi-bin/waffle.cgi/feed
Disallow: /cgi-bin/waffle.cgi/links
Disallow: /cgi-bin/waffle.cgi/view
Disallow: /cgi-bin/witw.cgi/play
""")
        self._assert_fetchable(rp, "/cgi-bin/waffle.cgi/feed/link", False)

    def test_disallow_infinite_maze(self):
        rp = self._get_parser("""User-agent: *
# We don't accept automated donations
Disallow: /donate
# Robots are not allowed to vote
Disallow: /vote
Disallow: /vote/
Disallow: /voteru
Disallow: /voteru/
# Robots are forbidden to enter the infinite maze
Disallow: /maze
Disallow: /maze/
""")
        self._assert_fetchable(rp, "/maze/l/", False)

    def test_disallow_gemski_git(self):
        rp = self._get_parser("""# disallowing because kineto at least doesn't have its own robots.txt to
# prevent web crawling by proxy
User-agent: webproxy
Disallow: /

User-agent: archiver
User-agent: indexer
User-agent: researcher
Disallow: /gredig/
# I'd like to just do this, but it seems at least some crawlers don't match by
# prefix.
#Disallow: /gemski/play
Disallow: /ski/
Disallow: /gemski/
# This doesn't exist, but GUS seems to get confused with the tuner server
# running on another port.
Disallow: /stations/

""")
        self._assert_fetchable(rp, "/ski/sds", False)

    def test_disallow_unlimitedpizza_git(self):
        rp = self._get_parser("""User-agent: *
Allow: /git
Disallow: /git/dotfiles/
""")
        self._assert_fetchable(rp, "/git/dotfiles/tree/0b0de929fa98457d22cbbcee65013ec261b660e2/atom/packages/ex-mode/node_modules/space-pen/node_modules/grim/node_modules/emissary/node_modules/es6-weak-map/node_modules/es5-ext/math/atanh/implement.js", False)

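    # Illustrative note (not part of the original file): the fixtures above
    # are checked by path prefix -- e.g. "Disallow: /git/dotfiles/" is expected
    # to block arbitrarily deep paths beneath it even though "/git" itself is
    # allowed -- and a lowercase "user-agent:" field name is honoured as well
    # (test_disallow_gemidev_waffle).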
    def test_disallow_libraryinured(self):
        rp = self._get_parser("""
     User-agent: indexer
     Disallow: /document/
     Disallow: /search/

""")
        self._assert_fetchable(rp, "gemini://library.inu.red/document/saul-newman-war-on-the-state-stirner-and-deleuze-s-anarchism", False)
        self._assert_fetchable(rp, "gemini://library.inu.red", True)
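
# Illustrative note (not part of the original file): the suite is driven by
# pytest, e.g.
#
#   pytest test_gemini.py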