commit 8520ec533ce63a745c5dbb1bafc5c23722244f94
parent 134b7f6c482cdfda95eae80c5d83ae1712fbacea
Author: René Wagner <rwagner@rw-net.de>
Date: Mon, 22 Feb 2021 19:06:02 +0100
robots.txt sections "*" and "indexer" are honored
To keep the implementation simple, we no longer honor the "gus" section.
It is probably barely used anyway.
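For illustration only, a minimal sketch (not part of this patch) of the
RobotFileParser behaviour the new check in crawl.py relies on; the
robots.txt content and URLs are made up. When no "indexer" section exists,
both can_fetch() and crawl_delay() fall back to the "*" section, so querying
"indexer" alone covers both user-agents:

    from urllib.robotparser import RobotFileParser

    # Hypothetical robots.txt that only defines a "*" section.
    lines = [
        "User-agent: *",
        "Disallow: /private/",
        "Crawl-delay: 10",
    ]

    rp = RobotFileParser()
    rp.parse(lines)

    # There is no "indexer" section, so the "*" rules apply to it as well.
    print(rp.can_fetch("indexer", "gemini://example.org/private/page"))  # False
    print(rp.can_fetch("indexer", "gemini://example.org/docs/page"))     # True
    print(rp.crawl_delay("indexer"))                                     # 10

A second sketch after the handling-robots.md diff below shows the case where
an "indexer" section is present.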
Diffstat:
2 files changed, 4 insertions(+), 13 deletions(-)
diff --git a/docs/handling-robots.md b/docs/handling-robots.md
@@ -4,7 +4,6 @@ robots.txt is fetched for each (sub)domain before actually crawling the content.
GUS honors the following User-agents:
* indexer
-* gus
* *
## robots.txt caching
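As a complement to the list above, a minimal sketch (again with made-up
robots.txt content and URLs) of how RobotFileParser treats a file that does
define an "indexer" section: the matching section takes precedence over "*",
while every other agent still falls back to "*".

    from urllib.robotparser import RobotFileParser

    # Hypothetical robots.txt with both an "indexer" and a "*" section.
    lines = [
        "User-agent: indexer",
        "Disallow: /archive/",
        "",
        "User-agent: *",
        "Disallow: /",
    ]

    rp = RobotFileParser()
    rp.parse(lines)

    # The "indexer" section wins for the indexer user-agent ...
    print(rp.can_fetch("indexer", "gemini://example.org/archive/x"))  # False
    print(rp.can_fetch("indexer", "gemini://example.org/docs/x"))     # True

    # ... while any other agent is still governed by "*".
    print(rp.can_fetch("otherbot", "gemini://example.org/docs/x"))    # False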
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -491,19 +491,11 @@ def crawl_page(
crawl_delay = None
if robots_file is not None:
logging.debug("Found robots.txt for %s", gr.normalized_url)
- # only fetch if both user-agents are allowed to fetch
- # RobotFileParser will return the higher level value (*) if no specific
- # value is found, but has no understanding the "gus" is a more specific
- # form of an indexer
- logging.debug("can_fetch indexer: %s",robots_file.can_fetch("indexer", gr.normalized_url))
- logging.debug("can_fetch gus: %s",robots_file.can_fetch("gus", gr.normalized_url))
- can_fetch = (robots_file.can_fetch("indexer", gr.normalized_url) and
- robots_file.can_fetch("gus", gr.normalized_url))
-
- # same approach as above - last value wins
- crawl_delay = robots_file.crawl_delay("*")
+ # only fetch if the "indexer" user-agent is allowed to fetch the url
+ # RobotFileParser will return the higher level value (*) if
+ # no indexer section is found
+ can_fetch = robots_file.can_fetch("indexer", gr.normalized_url)
crawl_delay = robots_file.crawl_delay("indexer")
- crawl_delay = robots_file.crawl_delay("gus")
if not can_fetch:
logging.info(