commit 134b7f6c482cdfda95eae80c5d83ae1712fbacea
parent 64748f085254199e38203f84844201ac88aa27e7
Author: René Wagner <rwagner@rw-net.de>
Date: Fri, 12 Feb 2021 08:05:34 +0100
correctly handle robots.txt
Honor the robots.txt entries for "indexer" and "gus" as well
as the default * section.
On a live instance, robot_file_map.p must be deleted after this
change has been applied so that all robots.txt files are refetched,
as previously only empty entries were stored.
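
For illustration, a minimal sketch of the failure mode, using the standard
library's RobotFileParser as a stand-in for GUS's GeminiRobotFileParser and a
hypothetical in-memory dict in place of the persisted robot_file_map:

```python
from urllib.robotparser import RobotFileParser  # stand-in; GUS uses GeminiRobotFileParser

robot_file_map = {}  # hypothetical stand-in for the cache persisted as robot_file_map.p

def fetch_robots_file(robot_host):
    # https URL used here only because the stdlib parser cannot fetch gemini://
    rp = RobotFileParser("https://%s/robots.txt" % robot_host)
    rp.read()
    return rp  # before this commit the function ended without a return, yielding None

def get_robots_file(robot_host):
    # with the missing return, every host ended up cached as None ("empty")
    if robot_host not in robot_file_map:
        robot_file_map[robot_host] = fetch_robots_file(robot_host)
    return robot_file_map[robot_host]
```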
Diffstat:
2 files changed, 26 insertions(+), 8 deletions(-)
diff --git a/docs/handling-robots.md b/docs/handling-robots.md
@@ -0,0 +1,14 @@
+# robots.txt handling
+
+robots.txt is fetched for each (sub)domain before actually crawling the content.
+
+GUS honors the following User-agents:
+* indexer
+* gus
+* *
+
+## robots.txt caching
+
+Every fetched robots.txt is cached in `index/robot_file_map.p`, even if it was empty or missing.
+
+To force a refetch of _all_ robots.txt files for _all_ capsules, simply delete the file named above and run a crawl.
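
A hedged sketch of what forcing a refetch could look like in practice; the `.p`
suffix suggests a pickled map keyed by host, but the exact on-disk structure is
an assumption here, not taken from the GUS code:

```python
import os
import pickle

CACHE_PATH = "index/robot_file_map.p"  # path taken from the doc above

def load_robot_file_map(path=CACHE_PATH):
    """Load the cached robots.txt map, or start fresh if it is missing."""
    if os.path.exists(path):
        with open(path, "rb") as f:
            return pickle.load(f)
    return {}

def force_refetch(path=CACHE_PATH):
    """Delete the cache so the next crawl refetches every robots.txt."""
    if os.path.exists(path):
        os.remove(path)
```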
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -438,6 +438,7 @@ def fetch_robots_file(robot_host):
)
rp = GeminiRobotFileParser(robot_url)
rp.read()
+ return rp
def get_robots_file(robot_host):
@@ -489,12 +490,15 @@ def crawl_page(
robots_file = get_robots_file(gr.normalized_host)
crawl_delay = None
if robots_file is not None:
- # keep overwriting the value of can_fetch with more specific user-agent values
- # last one should win, and if not present, RobotFileParser will just return
- # the higher level's value again
- can_fetch = robots_file.can_fetch("*", gr.normalized_url)
- can_fetch = robots_file.can_fetch("indexer", gr.normalized_url)
- can_fetch = robots_file.can_fetch("gus", gr.normalized_url)
+ logging.debug("Found robots.txt for %s", gr.normalized_url)
+ # only fetch if both user-agents are allowed to fetch:
+ # RobotFileParser falls back to the higher-level (*) value if no specific
+ # value is found, but has no understanding that "gus" is a more specific
+ # form of an indexer
+ logging.debug("can_fetch indexer: %s", robots_file.can_fetch("indexer", gr.normalized_url))
+ logging.debug("can_fetch gus: %s", robots_file.can_fetch("gus", gr.normalized_url))
+ can_fetch = (robots_file.can_fetch("indexer", gr.normalized_url) and
+ robots_file.can_fetch("gus", gr.normalized_url))
# same approach as above - last value wins
crawl_delay = robots_file.crawl_delay("*")
@@ -502,8 +506,8 @@ def crawl_page(
crawl_delay = robots_file.crawl_delay("gus")
if not can_fetch:
- logging.debug(
- "Blocked by robots files, skipping: %s",
+ logging.info(
+ "Blocked by robots.txt, skipping: %s",
gus.lib.logging.strip_control_chars(url),
)
return
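
Why combining the two checks with "and" matters: a minimal sketch using the
standard library's RobotFileParser, which the comment above refers to;
GeminiRobotFileParser is assumed to behave the same way for can_fetch():

```python
from urllib.robotparser import RobotFileParser

robots_txt = """\
User-agent: indexer
Disallow: /private/

User-agent: *
Disallow:
"""

rp = RobotFileParser()
rp.parse(robots_txt.splitlines())

url = "gemini://example.org/private/page.gmi"
print(rp.can_fetch("indexer", url))  # False: blocked by the indexer section
print(rp.can_fetch("gus", url))      # True: no gus section, falls back to "*"

# Old behaviour: the last can_fetch() call ("gus") overwrote the result,
# so the page was crawled despite the indexer rule.
# New behaviour: both checks must pass, so the page is correctly skipped.
can_fetch = rp.can_fetch("indexer", url) and rp.can_fetch("gus", url)
print(can_fetch)                     # False
```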