commit f6bd88672ea163d77bc3460ce460afb29b9c5468
parent 1ce3f6f92b203861022554d4eac806f32ecf16c7
Author: Hannu Hartikainen <hannu@hrtk.in>
Date: Sat, 17 Jul 2021 12:06:19 +0300
support prioritized robots.txt user-agents
Reimplement the can_fetch() logic of RobotFileParser as a new
can_fetch_prioritized() method that takes a prioritized list of
user-agents. Add unit tests for this functionality and set the
user-agents this crawler uses to ["gus", "indexer", "*"] (as they were
in the past, though the old lookup was buggy).
This was heavily inspired by the earlier discussion at
https://lists.sr.ht/~natpen/gus/%3C20210212070534.14511-1-rwagner%40rw-net.de%3E
Diffstat:
3 files changed, 102 insertions(+), 5 deletions(-)
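For context, a minimal sketch (not part of the commit; hostname and URL are
illustrative) of the stock RobotFileParser fallback the old single-user-agent
lookup ran into: a query for "indexer" falls straight back to the "*" section
and never consults a more specific "gus" section.

    # stock urllib behaviour, illustrative only
    from urllib.robotparser import RobotFileParser

    rp = RobotFileParser()
    rp.parse([
        "User-agent: *",
        "Disallow: /",
        "",
        "User-agent: gus",
        "Allow: /",
    ])

    # the capsule explicitly allows "gus", but a lookup for "indexer"
    # falls back to the "*" section, so a crawler asking as "indexer"
    # would be blocked here
    print(rp.can_fetch("indexer", "gemini://example.org/"))  # False
    print(rp.can_fetch("gus", "gemini://example.org/"))      # True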
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -364,10 +364,9 @@ def crawl_page(
    crawl_delay = None
    if robots_file is not None:
        logging.debug("Found robots.txt for %s", gr.normalized_url)
-        # only fetch if allowed for user-agents * and indexer
-        # RobotFileParser will return the higher level value (*) if
-        # no indexer section is found
-        can_fetch = robots_file.can_fetch("indexer", gr.normalized_url)
+        # only fetch if allowed for a matching user-agent:
+        # in priority order "gus" > "indexer" > "*"
+        can_fetch = robots_file.can_fetch_prioritized(["gus", "indexer", "*"], gr.normalized_url)

        # same approach as above - last value wins
        # crawl_delay = robots_file.crawl_delay("indexer")
diff --git a/gus/lib/gemini.py b/gus/lib/gemini.py
@@ -1,8 +1,11 @@
import re
from urllib.parse import (
+    quote,
    unquote,
    urljoin,
+    urlparse,
    urlsplit,
+    urlunparse,
    urlunsplit,
    uses_relative,
    uses_netloc,
@@ -78,6 +81,45 @@ class GeminiRobotFileParser(RobotFileParser):
        else:
            self.parse(response.content.splitlines())
+    def read_from_string(self, robots_txt):
+        """A utility method for writing tests"""
+        self.parse(robots_txt.splitlines())
+
+    def can_fetch_prioritized(self, useragents, url):
+        """Given a url and a prioritized list of user-agents, is fetching allowed?
+
+        The list is ordered highest priority first, e.g. ["ThisIndexerBot", "generic-indexer", "generic-bot", "*"].
+        """
+        if self.allow_all:
+            return True
+        if self.disallow_all:
+            return False
+
+        # until the robots.txt has been read, assume nothing is fetchable
+        if not self.last_checked:
+            return False
+
+        # normalize to a quoted path, the same way RobotFileParser.can_fetch does
+        parsed_url = urlparse(unquote(url))
+        url = urlunparse(('', '', parsed_url.path, parsed_url.params, parsed_url.query, parsed_url.fragment))
+        url = quote(url) or "/"
+
+        def useragent_allowed(useragent):
+            # allowance of the first entry matching this user-agent, or None
+            for entry in self.entries:
+                if entry.applies_to(useragent):
+                    return entry.allowance(url)
+            return None
+
+        # check user-agents in priority order; the first one with a matching entry decides
+        for ua in useragents:
+            allowed = useragent_allowed(ua)
+            if allowed is not None:
+                return allowed
+
+        # if none of the user-agents match, check the default ("*") entry
+        if self.default_entry:
+            return self.default_entry.allowance(url)
+
+        # if nothing matches, crawling is allowed
+        return True
class GeminiResource:
    def __init__(self, url, fully_qualified_parent_url=None, parent_hostname=None):
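A minimal usage sketch of the new method (not part of the commit; it assumes
the patched gus.lib.gemini is importable, and the hostname is illustrative),
mirroring the priority order crawl.py now passes in:

    from gus.lib.gemini import GeminiRobotFileParser

    rp = GeminiRobotFileParser("gemini://example.org/robots.txt")
    rp.read_from_string("\n".join([
        "User-agent: *",
        "Allow: /",
        "",
        "User-agent: indexer",
        "Disallow: /",
    ]))

    # "gus" has no section of its own, so the "indexer" section decides
    # before the default "*" section is ever consulted
    print(rp.can_fetch_prioritized(["gus", "indexer", "*"], "gemini://example.org/"))  # False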
diff --git a/tests/gus/lib/test_gemini.py b/tests/gus/lib/test_gemini.py
@@ -1,6 +1,6 @@
import pytest
-from gus.lib.gemini import GeminiResource
+from gus.lib.gemini import GeminiResource, GeminiRobotFileParser
class TestGeminiResource:
    def test_extract_contained_resources(self):
@@ -72,3 +72,59 @@ text
    def test_is_root_like(self, test_url, expected_result):
        gr = GeminiResource(test_url)
        assert gr.is_root_like == expected_result
+
+
+class TestGeminiRobotFileParser:
+    def _get_parser(self, content):
+        dummy_url = "gemini://dummy/robots.txt"
+        rp = GeminiRobotFileParser(dummy_url)
+        rp.read_from_string(content)
+        return rp
+
+    def _assert_fetchable(self, rp, url="/", fetchable=True):
+        useragents = ["testbot", "genericbot", "*"]
+        assert rp.can_fetch_prioritized(useragents, url) == fetchable
+
+    def test_empty_robots(self):
+        rp = self._get_parser("")
+        self._assert_fetchable(rp)
+
+    def test_disallow_star(self):
+        rp = self._get_parser("""User-agent: *
+Disallow: /""")
+        self._assert_fetchable(rp, "/", False)
+
+    def test_allow_genericbot(self):
+        rp = self._get_parser("""User-agent: *
+Disallow: /
+
+User-agent: genericbot
+Allow: /""")
+        self._assert_fetchable(rp)
+
+    def test_allow_genericbot_but_disallow_testbot(self):
+        rp = self._get_parser("""User-agent: genericbot
+Allow: /
+
+User-agent: testbot
+Disallow: /""")
+        self._assert_fetchable(rp, "/", False)
+
+    def test_allow_star_but_disallow_genericbot(self):
+        rp = self._get_parser("""User-agent: *
+Allow: /
+
+User-agent: genericbot
+Disallow: /""")
+        self._assert_fetchable(rp, "/", False)
+
+    def test_allow_only_testbot(self):
+        rp = self._get_parser("""User-agent: *
+Disallow: /
+
+User-agent: genericbot
+Disallow: /
+
+User-agent: testbot
+Allow: /""")
+        self._assert_fetchable(rp)