commit 62002f4b7181d0b5b4da761b03b4dea513213206
parent 3679863e4070931a32c9be7519707f2dbb4c25d5
Author: Natalie Pendragon <natpen@natpen.net>
Date: Thu, 16 Apr 2020 18:19:07 -0400
[crawl] fix crawl bug with robots.txt
Diffstat:
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -34,7 +34,7 @@ SEED_URLS = [
"gemini://dump.royniang.com",
"gemini://konpeito.media",
"gemini://gemini.68kmentat.com",
- # "gemini://envs.net",
+ "gemini://envs.net",
]
@@ -130,7 +130,7 @@ def index_content(response):
def get_robots_file(url):
- robot_host = url.split(":1965", 1)[0] + ":1965"
+ robot_host = 'gemini://' + urlparse(url, 'gemini').hostname
if robot_host not in robot_file_map:
print(f"Requesting robots.txt for {robot_host}")
robot_url = robot_host + "/robots.txt"
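For context on the second hunk: the old line derived robot_host by splitting on the literal ":1965" port, which only works when every crawled URL carries that explicit port; a URL without it passes through split() unchanged and picks up a bogus ":1965" suffix after the path. The new line parses the URL (defaulting the scheme to "gemini") and keeps only the hostname. The sketch below is an illustration of that difference, not part of the commit, and the example URLs are hypothetical.

from urllib.parse import urlparse

def robot_host_old(url):
    # Old approach: assumes ":1965" always appears in the URL. When the
    # port is omitted, split() returns the whole URL unchanged and the
    # appended ":1965" lands after the path.
    return url.split(":1965", 1)[0] + ":1965"

def robot_host_new(url):
    # New approach: parse the URL with "gemini" as the default scheme
    # and rebuild the host from the parsed hostname (port dropped).
    return 'gemini://' + urlparse(url, 'gemini').hostname

# Hypothetical examples:
print(robot_host_old("gemini://envs.net/some/page"))        # gemini://envs.net/some/page:1965
print(robot_host_new("gemini://envs.net/some/page"))        # gemini://envs.net
print(robot_host_old("gemini://konpeito.media:1965/feed"))  # gemini://konpeito.media:1965
print(robot_host_new("gemini://konpeito.media:1965/feed"))  # gemini://konpeito.media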