geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit 44f6e6250611aba9dd3557eba7326b67d4c4249e
parent 0bbf43c49ba5cccbc26d346283d66a2651261a6d
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Fri,  5 Jun 2020 06:41:28 -0400

[crawl] Add custom crawl delays

And add the first one for alexschroeder's site, which still has a
robots.txt that doesn't parse properly.

Diffstat:
M gus/crawl.py | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/gus/crawl.py b/gus/crawl.py
@@ -102,7 +102,6 @@ EXCLUDED_URL_PREFIXES = [
     "gemini://geddit.pitr.ca/c/",
 
     # alexschroeder.ch b/c its robots.txt isn't working...
-    "gemini://alexschroeder.ch/",
     "gemini://alexschroeder.ch/map/",
     "gemini://alexschroeder.ch/do/rc",
     "gemini://alexschroeder.ch/do/rss",
@@ -120,6 +119,10 @@ EXCLUDED_URL_PATHS = [
     "rss.xml",
 ]
 
+CRAWL_DELAYS = {
+    "alexschroeder.ch": 5000,
+}
+
 
 def backup_old_index(index_dir, backup_dir):
     last_index_modification_time = datetime.fromtimestamp(os.path.getmtime(index_dir))
@@ -261,7 +264,9 @@ def crawl(gemini_resource):
 
     # Crawl delay
     if gr.normalized_host in domain_hit_timings:
-        if not crawl_delay:
+        if gr.normalized_host in CRAWL_DELAYS:
+            next_allowed_hit = domain_hit_timings[gr.normalized_host] + timedelta(milliseconds=CRAWL_DELAYS[gr.normalized_host])
+        elif not crawl_delay:
             next_allowed_hit = domain_hit_timings[gr.normalized_host] + timedelta(milliseconds=500)
         else:
             next_allowed_hit = domain_hit_timings[gr.normalized_host] + timedelta(milliseconds=crawl_delay)
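
The change gives manually configured delays priority over both the robots.txt crawl delay and the 500 ms default. A minimal sketch of that precedence, reusing the names from the diff (CRAWL_DELAYS, domain_hit_timings, crawl_delay); the helper function itself is illustrative and not part of gus/crawl.py:

# Illustrative sketch, not code from this commit: computes when a host
# may next be hit, using the precedence introduced by the diff above.
from datetime import datetime, timedelta

CRAWL_DELAYS = {
    "alexschroeder.ch": 5000,  # milliseconds, manual override
}

def compute_next_allowed_hit(host, domain_hit_timings, crawl_delay=None):
    """Return the earliest datetime at which `host` may be crawled again."""
    last_hit = domain_hit_timings[host]
    if host in CRAWL_DELAYS:
        # Manual override wins, e.g. for hosts whose robots.txt doesn't parse.
        delay_ms = CRAWL_DELAYS[host]
    elif crawl_delay:
        # Delay advertised by the host's robots.txt.
        delay_ms = crawl_delay
    else:
        # Default politeness delay.
        delay_ms = 500
    return last_hit + timedelta(milliseconds=delay_ms)

# Example: with the override above, alexschroeder.ch is revisited no sooner
# than 5 seconds after its last hit, instead of the usual 500 ms.
timings = {"alexschroeder.ch": datetime.now()}
print(compute_next_allowed_hit("alexschroeder.ch", timings))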