commit 44f6e6250611aba9dd3557eba7326b67d4c4249e
parent 0bbf43c49ba5cccbc26d346283d66a2651261a6d
Author: Natalie Pendragon <natpen@natpen.net>
Date: Fri, 5 Jun 2020 06:41:28 -0400

[crawl] Add custom crawl delays

And add the first one for alexschroeder's site, which still has a
robots.txt that doesn't parse properly.

Diffstat:
1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -102,7 +102,6 @@ EXCLUDED_URL_PREFIXES = [
"gemini://geddit.pitr.ca/c/",
# alexschroeder.ch b/c its robots.txt isn't working...
- "gemini://alexschroeder.ch/",
"gemini://alexschroeder.ch/map/",
"gemini://alexschroeder.ch/do/rc",
"gemini://alexschroeder.ch/do/rss",
@@ -120,6 +119,10 @@ EXCLUDED_URL_PATHS = [
"rss.xml",
]
+CRAWL_DELAYS = {
+ "alexschroeder.ch": 5000,
+}
+
def backup_old_index(index_dir, backup_dir):
last_index_modification_time = datetime.fromtimestamp(os.path.getmtime(index_dir))
@@ -261,7 +264,9 @@ def crawl(gemini_resource):
# Crawl delay
if gr.normalized_host in domain_hit_timings:
- if not crawl_delay:
+ if gr.normalized_host in CRAWL_DELAYS:
+ next_allowed_hit = domain_hit_timings[gr.normalized_host] + timedelta(milliseconds=CRAWL_DELAYS[gr.normalized_host])
+ elif not crawl_delay:
next_allowed_hit = domain_hit_timings[gr.normalized_host] + timedelta(milliseconds=500)
else:
next_allowed_hit = domain_hit_timings[gr.normalized_host] + timedelta(milliseconds=crawl_delay)
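
The new CRAWL_DELAYS lookup takes precedence over both the host's robots.txt
crawl-delay and the 500 ms default. Below is a minimal, self-contained sketch
of that precedence, pulled out of crawl() purely for illustration: the helper
names pick_crawl_delay and next_allowed_hit are hypothetical (the real code
keeps this logic inline), and delays are assumed to be in milliseconds, as in
the hunk above.

    from datetime import datetime, timedelta

    CRAWL_DELAYS = {
        # per-host overrides, in milliseconds
        "alexschroeder.ch": 5000,
    }

    # normalized host -> datetime of the most recent request to that host
    domain_hit_timings = {}


    def pick_crawl_delay(host, robots_crawl_delay=None):
        """Delay in milliseconds to wait before hitting `host` again.

        Precedence: hard-coded override in CRAWL_DELAYS, else the host's
        robots.txt crawl-delay (if any), else a 500 ms default. The falsy
        check mirrors the diff's `elif not crawl_delay` branch.
        """
        if host in CRAWL_DELAYS:
            return CRAWL_DELAYS[host]
        if not robots_crawl_delay:
            return 500
        return robots_crawl_delay


    def next_allowed_hit(host, robots_crawl_delay=None):
        """Earliest datetime at which `host` may be requested again."""
        last_hit = domain_hit_timings.get(host)
        if last_hit is None:
            # Host has never been hit; no waiting required.
            return datetime.now()
        delay_ms = pick_crawl_delay(host, robots_crawl_delay)
        return last_hit + timedelta(milliseconds=delay_ms)

With this shape, adding another misbehaving host is a one-line change to
CRAWL_DELAYS rather than another exclusion prefix, which is the design choice
the commit makes for alexschroeder.ch.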