commit 44f6e6250611aba9dd3557eba7326b67d4c4249e
parent 0bbf43c49ba5cccbc26d346283d66a2651261a6d
Author: Natalie Pendragon <natpen@natpen.net>
Date: Fri, 5 Jun 2020 06:41:28 -0400

[crawl] Add custom crawl delays

And add the first one for alexschroeder's site, which still has a
robots.txt that doesn't parse properly.

Diffstat:
1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -102,7 +102,6 @@ EXCLUDED_URL_PREFIXES = [
"gemini://geddit.pitr.ca/c/",
# alexschroeder.ch b/c its robots.txt isn't working...
- "gemini://alexschroeder.ch/",
"gemini://alexschroeder.ch/map/",
"gemini://alexschroeder.ch/do/rc",
"gemini://alexschroeder.ch/do/rss",
@@ -120,6 +119,10 @@ EXCLUDED_URL_PATHS = [
"rss.xml",
]
+CRAWL_DELAYS = {
+ "alexschroeder.ch": 5000,
+}
+
def backup_old_index(index_dir, backup_dir):
last_index_modification_time = datetime.fromtimestamp(os.path.getmtime(index_dir))
@@ -261,7 +264,9 @@ def crawl(gemini_resource):
# Crawl delay
if gr.normalized_host in domain_hit_timings:
- if not crawl_delay:
+ if gr.normalized_host in CRAWL_DELAYS:
+ next_allowed_hit = domain_hit_timings[gr.normalized_host] + timedelta(milliseconds=CRAWL_DELAYS[gr.normalized_host])
+ elif not crawl_delay:
next_allowed_hit = domain_hit_timings[gr.normalized_host] + timedelta(milliseconds=500)
else:
next_allowed_hit = domain_hit_timings[gr.normalized_host] + timedelta(milliseconds=crawl_delay)
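
The new CRAWL_DELAYS lookup takes precedence over both the host's robots.txt
crawl-delay and the 500 ms default. Below is a minimal, self-contained sketch
of that precedence, pulled out of crawl() purely for illustration: the helper
names pick_crawl_delay and next_allowed_hit are hypothetical (the real code
keeps this logic inline), and delays are assumed to be in milliseconds, as in
the hunk above.

    from datetime import datetime, timedelta

    CRAWL_DELAYS = {
        # per-host overrides, in milliseconds
        "alexschroeder.ch": 5000,
    }

    # normalized host -> datetime of the most recent request to that host
    domain_hit_timings = {}


    def pick_crawl_delay(host, robots_crawl_delay=None):
        """Delay in milliseconds to wait before hitting `host` again.

        Precedence: hard-coded override in CRAWL_DELAYS, else the host's
        robots.txt crawl-delay (if any), else a 500 ms default. The falsy
        check mirrors the diff's `elif not crawl_delay` branch.
        """
        if host in CRAWL_DELAYS:
            return CRAWL_DELAYS[host]
        if not robots_crawl_delay:
            return 500
        return robots_crawl_delay


    def next_allowed_hit(host, robots_crawl_delay=None):
        """Earliest datetime at which `host` may be requested again."""
        last_hit = domain_hit_timings.get(host)
        if last_hit is None:
            # Host has never been hit; no waiting required.
            return datetime.now()
        delay_ms = pick_crawl_delay(host, robots_crawl_delay)
        return last_hit + timedelta(milliseconds=delay_ms)

With this shape, adding another misbehaving host is a one-line change to
CRAWL_DELAYS rather than another exclusion prefix, which is the design choice
the commit makes for alexschroeder.ch.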