geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git

commit f6bd88672ea163d77bc3460ce460afb29b9c5468
parent 1ce3f6f92b203861022554d4eac806f32ecf16c7
Author: Hannu Hartikainen <hannu@hrtk.in>
Date:   Sat, 17 Jul 2021 12:06:19 +0300

support prioritized robots.txt user-agents

Reimplement the can_fetch() function of RobotFileParser such that it
prioritizes multiple user-agents. Add unit test for said functionality
and set the user-agents this crawler uses to ["gus", "indexer", "*"] (as
they were in the past, though with bugs).

This was heavily inspired by the earlier discussion at
https://lists.sr.ht/~natpen/gus/%3C20210212070534.14511-1-rwagner%40rw-net.de%3E
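
For illustration only, a minimal usage sketch of the new lookup (the capsule URL and
robots.txt content below are made up; read_from_string() and can_fetch_prioritized()
are the helpers added by this commit, and ["gus", "indexer", "*"] is the user-agent
list the crawler now uses):

from gus.lib.gemini import GeminiRobotFileParser

# hypothetical capsule that blocks everything by default but allows "indexer"
rp = GeminiRobotFileParser("gemini://example.capsule/robots.txt")
rp.read_from_string("""User-agent: *
Disallow: /

User-agent: indexer
Allow: /""")

# "gus" has no section of its own, so the next candidate "indexer" decides
# and the page may be fetched despite the catch-all Disallow
print(rp.can_fetch_prioritized(["gus", "indexer", "*"], "gemini://example.capsule/page"))  # True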

Diffstat:
M gus/crawl.py                 |  7 +++----
M gus/lib/gemini.py            | 42 ++++++++++++++++++++++++++++++++++++++++++
M tests/gus/lib/test_gemini.py | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
3 files changed, 102 insertions(+), 5 deletions(-)

diff --git a/gus/crawl.py b/gus/crawl.py
@@ -364,10 +364,9 @@ def crawl_page(
     crawl_delay = None
     if robots_file is not None:
         logging.debug("Found robots.txt for %s", gr.normalized_url)
-        # only fetch if allowed for user-agents * and indexer
-        # RobotFileParser will return the higher level value (*) if
-        # no indexer section is found
-        can_fetch = robots_file.can_fetch("indexer", gr.normalized_url)
+        # only fetch if allowed for a matching user-agent:
+        # in priority order "gus" > "indexer" > "*"
+        can_fetch = robots_file.can_fetch_prioritized(["gus", "indexer", "*"], gr.normalized_url)
 
         # same approach as above - last value wins
         # crawl_delay = robots_file.crawl_delay("indexer")
diff --git a/gus/lib/gemini.py b/gus/lib/gemini.py
@@ -1,8 +1,11 @@
 import re
 from urllib.parse import (
+    quote,
     unquote,
     urljoin,
+    urlparse,
     urlsplit,
+    urlunparse,
     urlunsplit,
     uses_relative,
     uses_netloc,
@@ -78,6 +81,45 @@ class GeminiRobotFileParser(RobotFileParser):
         else:
             self.parse(response.content.splitlines())
 
+    def read_from_string(self, robots_txt):
+        """An utility method for writing tests"""
+        self.parse(robots_txt.splitlines())
+
+    def can_fetch_prioritized(self, useragents, url):
+        """Given a url and prioritized list of user-agents, is fetching allowed?
+
+        Priority is with the highest priority first; eg. ["ThisIndexerBot", "generic-indexer", "generic-bot", "*"].
+        """
+        if self.allow_all:
+            return True
+        if self.disallow_all:
+            return False
+
+        if not self.last_checked:
+            return False
+
+        parsed_url = urlparse(unquote(url))
+        url = urlunparse(('','',parsed_url.path, parsed_url.params,parsed_url.query, parsed_url.fragment))
+        url = quote(url) or "/"
+
+        def useragent_allowed(useragent):
+            for entry in self.entries:
+                if entry.applies_to(useragent):
+                    return entry.allowance(url)
+            return None
+
+        # map user-agents to allowances; the first non-None will be the prioritized allowance
+        for ua in useragents:
+            allowed = useragent_allowed(ua)
+            if allowed is not None:
+                return allowed
+
+        # if none of the user-agents match, check default entry
+        if self.default_entry:
+            return self.default_entry.allowance(url)
+
+        # if nothing matches, crawling is allowed
+        return True
 
 class GeminiResource:
     def __init__(self, url, fully_qualified_parent_url=None, parent_hostname=None):
diff --git a/tests/gus/lib/test_gemini.py b/tests/gus/lib/test_gemini.py
@@ -1,6 +1,6 @@
 import pytest
 
-from gus.lib.gemini import GeminiResource
+from gus.lib.gemini import GeminiResource, GeminiRobotFileParser
 
 class TestGeminiResource:
     def test_extract_contained_resources(self):
@@ -72,3 +72,59 @@ text
     def test_is_root_like(self, test_url, expected_result):
         gr = GeminiResource(test_url)
         assert gr.is_root_like == expected_result
+
+
+class TestGeminiRobotFileParser:
+    def _get_parser(self, content):
+        dummy_url = "gemini://dummy/robots.txt"
+        rp = GeminiRobotFileParser(dummy_url)
+        rp.read_from_string(content)
+        return rp
+
+    def _assert_fetchable(self, rp, url="/", fetchable=True):
+        useragents = ["testbot", "genericbot", "*"]
+        assert rp.can_fetch_prioritized(useragents, url) == fetchable
+
+    def test_empty_robots(self):
+        rp = self._get_parser("")
+        self._assert_fetchable(rp)
+
+    def test_disallow_star(self):
+        rp = self._get_parser("""User-agent: *
+Disallow: /""")
+        self._assert_fetchable(rp, "/", False)
+
+    def test_allow_genericbot(self):
+        rp = self._get_parser("""User-agent: *
+Disallow: /
+
+User-agent: genericbot
+Allow: /""")
+        self._assert_fetchable(rp)
+
+    def test_allow_genericbot_but_disallow_testbot(self):
+        rp = self._get_parser("""User-agent: genericbot
+Allow: /
+
+User-agent: testbot
+Disallow: /""")
+        self._assert_fetchable(rp, "/", False)
+
+    def test_allow_star_but_disallow_genericbot(self):
+        rp = self._get_parser("""User-agent: *
+Allow: /
+
+User-agent: genericbot
+Disallow: /""")
+        self._assert_fetchable(rp, "/", False)
+
+    def test_allow_only_testbot(self):
+        rp = self._get_parser("""User-agent: *
+Disallow: /
+
+User-agent: genericbot
+Disallow: /
+
+User-agent: testbot
+Allow: /""")
+        self._assert_fetchable(rp)
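
A complementary sketch (again with a made-up capsule URL and robots.txt) of the case
the priority order exists for: a capsule that opts out of GUS specifically while
leaving other crawlers unrestricted.

from gus.lib.gemini import GeminiRobotFileParser

# hypothetical robots.txt that singles out "gus" while allowing everyone else
rp = GeminiRobotFileParser("gemini://capsule.example/robots.txt")
rp.read_from_string("""User-agent: gus
Disallow: /

User-agent: *
Allow: /""")

# the highest-priority matching section ("gus") wins, so the catch-all
# Allow is never consulted and the capsule is skipped
assert not rp.can_fetch_prioritized(["gus", "indexer", "*"], "gemini://capsule.example/")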