commit 8520ec533ce63a745c5dbb1bafc5c23722244f94
parent 134b7f6c482cdfda95eae80c5d83ae1712fbacea
Author: René Wagner <rwagner@rw-net.de>
Date: Mon, 22 Feb 2021 19:06:02 +0100
robots.txt sections "*" and "indexer" are honored
To keep the implementation simple, we no longer honor the "gus" section.
It is probably barely used anyway.
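For illustration only, a minimal sketch (not part of this patch) of the
RobotFileParser behaviour the new check in crawl.py relies on; the
robots.txt content and URLs are made up. When no "indexer" section exists,
both can_fetch() and crawl_delay() fall back to the "*" section, so querying
"indexer" alone covers both user-agents:

    from urllib.robotparser import RobotFileParser

    # Hypothetical robots.txt that only defines a "*" section.
    lines = [
        "User-agent: *",
        "Disallow: /private/",
        "Crawl-delay: 10",
    ]

    rp = RobotFileParser()
    rp.parse(lines)

    # There is no "indexer" section, so the "*" rules apply to it as well.
    print(rp.can_fetch("indexer", "gemini://example.org/private/page"))  # False
    print(rp.can_fetch("indexer", "gemini://example.org/docs/page"))     # True
    print(rp.crawl_delay("indexer"))                                     # 10

A second sketch after the handling-robots.md diff below shows the case where
an "indexer" section is present.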
Diffstat:
2 files changed, 4 insertions(+), 13 deletions(-)
diff --git a/docs/handling-robots.md b/docs/handling-robots.md
@@ -4,7 +4,6 @@ robots.txt is fetched for each (sub)domain before actually crawling the content.
GUS honors the following User-agents:
* indexer
-* gus
* *
## robots.txt caching
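As a complement to the list above, a minimal sketch (again with made-up
robots.txt content and URLs) of how RobotFileParser treats a file that does
define an "indexer" section: the matching section takes precedence over "*",
while every other agent still falls back to "*".

    from urllib.robotparser import RobotFileParser

    # Hypothetical robots.txt with both an "indexer" and a "*" section.
    lines = [
        "User-agent: indexer",
        "Disallow: /archive/",
        "",
        "User-agent: *",
        "Disallow: /",
    ]

    rp = RobotFileParser()
    rp.parse(lines)

    # The "indexer" section wins for the indexer user-agent ...
    print(rp.can_fetch("indexer", "gemini://example.org/archive/x"))  # False
    print(rp.can_fetch("indexer", "gemini://example.org/docs/x"))     # True

    # ... while any other agent is still governed by "*".
    print(rp.can_fetch("otherbot", "gemini://example.org/docs/x"))    # False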
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -491,19 +491,11 @@ def crawl_page(
crawl_delay = None
if robots_file is not None:
logging.debug("Found robots.txt for %s", gr.normalized_url)
- # only fetch if both user-agents are allowed to fetch
- # RobotFileParser will return the higher level value (*) if no specific
- # value is found, but has no understanding the "gus" is a more specific
- # form of an indexer
- logging.debug("can_fetch indexer: %s",robots_file.can_fetch("indexer", gr.normalized_url))
- logging.debug("can_fetch gus: %s",robots_file.can_fetch("gus", gr.normalized_url))
- can_fetch = (robots_file.can_fetch("indexer", gr.normalized_url) and
- robots_file.can_fetch("gus", gr.normalized_url))
-
- # same approach as above - last value wins
- crawl_delay = robots_file.crawl_delay("*")
+ # only fetch if the "indexer" user-agent is allowed to fetch the url
+ # RobotFileParser will return the higher level value (*) if
+ # no indexer section is found
+ can_fetch = robots_file.can_fetch("indexer", gr.normalized_url)
crawl_delay = robots_file.crawl_delay("indexer")
- crawl_delay = robots_file.crawl_delay("gus")
if not can_fetch:
logging.info(