geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git

commit f6bd88672ea163d77bc3460ce460afb29b9c5468
parent 1ce3f6f92b203861022554d4eac806f32ecf16c7
Author: Hannu Hartikainen <hannu@hrtk.in>
Date:   Sat, 17 Jul 2021 12:06:19 +0300

support prioritized robots.txt user-agents

Reimplement the can_fetch() function of RobotFileParser such that it
prioritizes multiple user-agents. Add unit test for said functionality
and set the user-agents this crawler uses to ["gus", "indexer", "*"] (as
they were in the past, though with bugs).

This was heavily inspired by the earlier discussion at
https://lists.sr.ht/~natpen/gus/%3C20210212070534.14511-1-rwagner%40rw-net.de%3E
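
For illustration only, a minimal usage sketch of the new lookup (the capsule URL and
robots.txt content below are made up; read_from_string() and can_fetch_prioritized()
are the helpers added by this commit, and ["gus", "indexer", "*"] is the user-agent
list the crawler now uses):

from gus.lib.gemini import GeminiRobotFileParser

# hypothetical capsule that blocks everything by default but allows "indexer"
rp = GeminiRobotFileParser("gemini://example.capsule/robots.txt")
rp.read_from_string("""User-agent: *
Disallow: /

User-agent: indexer
Allow: /""")

# "gus" has no section of its own, so the next candidate "indexer" decides
# and the page may be fetched despite the catch-all Disallow
print(rp.can_fetch_prioritized(["gus", "indexer", "*"], "gemini://example.capsule/page"))  # True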

Diffstat:
M gus/crawl.py                 |  7 +++----
M gus/lib/gemini.py            | 42 ++++++++++++++++++++++++++++++++++++++++++
M tests/gus/lib/test_gemini.py | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
3 files changed, 102 insertions(+), 5 deletions(-)

diff --git a/gus/crawl.py b/gus/crawl.py
@@ -364,10 +364,9 @@ def crawl_page(
     crawl_delay = None
     if robots_file is not None:
         logging.debug("Found robots.txt for %s", gr.normalized_url)
-        # only fetch if allowed for user-agents * and indexer
-        # RobotFileParser will return the higher level value (*) if
-        # no indexer section is found
-        can_fetch = robots_file.can_fetch("indexer", gr.normalized_url)
+        # only fetch if allowed for a matching user-agent:
+        # in priority order "gus" > "indexer" > "*"
+        can_fetch = robots_file.can_fetch_prioritized(["gus", "indexer", "*"], gr.normalized_url)
 
         # same approach as above - last value wins
         # crawl_delay = robots_file.crawl_delay("indexer")
diff --git a/gus/lib/gemini.py b/gus/lib/gemini.py
@@ -1,8 +1,11 @@
 import re
 from urllib.parse import (
+    quote,
     unquote,
     urljoin,
+    urlparse,
     urlsplit,
+    urlunparse,
     urlunsplit,
     uses_relative,
     uses_netloc,
@@ -78,6 +81,45 @@ class GeminiRobotFileParser(RobotFileParser):
         else:
             self.parse(response.content.splitlines())
 
+    def read_from_string(self, robots_txt):
+        """An utility method for writing tests"""
+        self.parse(robots_txt.splitlines())
+
+    def can_fetch_prioritized(self, useragents, url):
+        """Given a url and prioritized list of user-agents, is fetching allowed?
+
+        Priority is with the highest priority first; eg. ["ThisIndexerBot", "generic-indexer", "generic-bot", "*"].
+        """
+        if self.allow_all:
+            return True
+        if self.disallow_all:
+            return False
+
+        if not self.last_checked:
+            return False
+
+        parsed_url = urlparse(unquote(url))
+        url = urlunparse(('','',parsed_url.path, parsed_url.params,parsed_url.query, parsed_url.fragment))
+        url = quote(url) or "/"
+
+        def useragent_allowed(useragent):
+            for entry in self.entries:
+                if entry.applies_to(useragent):
+                    return entry.allowance(url)
+            return None
+
+        # map user-agents to allowances; the first non-None will be the prioritized allowance
+        for ua in useragents:
+            allowed = useragent_allowed(ua)
+            if allowed is not None:
+                return allowed
+
+        # if none of the user-agents match, check default entry
+        if self.default_entry:
+            return self.default_entry.allowance(url)
+
+        # if nothing matches, crawling is allowed
+        return True
 
 class GeminiResource:
     def __init__(self, url, fully_qualified_parent_url=None, parent_hostname=None):
diff --git a/tests/gus/lib/test_gemini.py b/tests/gus/lib/test_gemini.py
@@ -1,6 +1,6 @@
 import pytest
 
-from gus.lib.gemini import GeminiResource
+from gus.lib.gemini import GeminiResource, GeminiRobotFileParser
 
 class TestGeminiResource:
     def test_extract_contained_resources(self):
@@ -72,3 +72,59 @@ text
     def test_is_root_like(self, test_url, expected_result):
         gr = GeminiResource(test_url)
         assert gr.is_root_like == expected_result
+
+
+class TestGeminiRobotFileParser:
+    def _get_parser(self, content):
+        dummy_url = "gemini://dummy/robots.txt"
+        rp = GeminiRobotFileParser(dummy_url)
+        rp.read_from_string(content)
+        return rp
+
+    def _assert_fetchable(self, rp, url="/", fetchable=True):
+        useragents = ["testbot", "genericbot", "*"]
+        assert rp.can_fetch_prioritized(useragents, url) == fetchable
+
+    def test_empty_robots(self):
+        rp = self._get_parser("")
+        self._assert_fetchable(rp)
+
+    def test_disallow_star(self):
+        rp = self._get_parser("""User-agent: *
+Disallow: /""")
+        self._assert_fetchable(rp, "/", False)
+
+    def test_allow_genericbot(self):
+        rp = self._get_parser("""User-agent: *
+Disallow: /
+
+User-agent: genericbot
+Allow: /""")
+        self._assert_fetchable(rp)
+
+    def test_allow_genericbot_but_disallow_testbot(self):
+        rp = self._get_parser("""User-agent: genericbot
+Allow: /
+
+User-agent: testbot
+Disallow: /""")
+        self._assert_fetchable(rp, "/", False)
+
+    def test_allow_star_but_disallow_genericbot(self):
+        rp = self._get_parser("""User-agent: *
+Allow: /
+
+User-agent: genericbot
+Disallow: /""")
+        self._assert_fetchable(rp, "/", False)
+
+    def test_allow_only_testbot(self):
+        rp = self._get_parser("""User-agent: *
+Disallow: /
+
+User-agent: genericbot
+Disallow: /
+
+User-agent: testbot
+Allow: /""")
+        self._assert_fetchable(rp)
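
A complementary sketch (again with a made-up capsule URL and robots.txt) of the case
the priority order exists for: a capsule that opts out of GUS specifically while
leaving other crawlers unrestricted.

from gus.lib.gemini import GeminiRobotFileParser

# hypothetical robots.txt that singles out "gus" while allowing everyone else
rp = GeminiRobotFileParser("gemini://capsule.example/robots.txt")
rp.read_from_string("""User-agent: gus
Disallow: /

User-agent: *
Allow: /""")

# the highest-priority matching section ("gus") wins, so the catch-all
# Allow is never consulted and the capsule is skipped
assert not rp.can_fetch_prioritized(["gus", "indexer", "*"], "gemini://capsule.example/")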