geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit 62002f4b7181d0b5b4da761b03b4dea513213206
parent 3679863e4070931a32c9be7519707f2dbb4c25d5
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Thu, 16 Apr 2020 18:19:07 -0400

[crawl] fix crawl bug with robots.txt

Diffstat:
M gus/crawl.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gus/crawl.py b/gus/crawl.py
@@ -34,7 +34,7 @@ SEED_URLS = [
     "gemini://dump.royniang.com",
     "gemini://konpeito.media",
     "gemini://gemini.68kmentat.com",
-    # "gemini://envs.net",
+    "gemini://envs.net",
 ]
@@ -130,7 +130,7 @@ def index_content(response):
 def get_robots_file(url):
-    robot_host = url.split(":1965", 1)[0] + ":1965"
+    robot_host = 'gemini://' + urlparse(url, 'gemini').hostname
     if robot_host not in robot_file_map:
         print(f"Requesting robots.txt for {robot_host}")
         robot_url = robot_host + "/robots.txt"