commit 80ad98435233fab6ae7642bb603353d01b577f2b
parent b8f73c5617bd502d6d952b2714eee404d8132eca
Author: Natalie Pendragon <natpen@natpen.net>
Date: Fri, 21 Feb 2020 08:44:01 -0500
Respect robots.txt
Diffstat:
2 files changed, 41 insertions(+), 10 deletions(-)
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -1,6 +1,7 @@
import pathlib
import re
import shutil
+from urllib import robotparser
from urllib.parse import urlparse, urlunparse
import gusmobile as gemini
@@ -9,7 +10,7 @@ from whoosh.index import create_in
INDEX_DIR = "index"
-KNOWN_URLS = [
+SEED_URLS = [
"gemini://gemini.conman.org/",
"gemini://zaibatsu.circumlunar.space:1965",
"gemini://carcosa.net:1965",
@@ -72,10 +73,26 @@ def index_content(response):
print("INDEXING...")
index_writer.add_document(
url=response.url,
- content=response.content,
+ content=response.url + " " + response.content,
)
+def get_robots_file(url):
+ robot_host = url.split(":1965", 1)[0] + ":1965"
+ if robot_host not in robot_file_map:
+ print(f"Requesting robots.txt for {robot_host}")
+ robot_url = robot_host + "/robots.txt"
+ r = gemini.fetch(robot_url)
+ if r is not None and r.status.startswith("2"):
+ robot_file_parser = robotparser.RobotFileParser()
+ robot_file_parser.parse(r.content)
+ print(robot_file_parser.site_maps())
+ robot_file_map[robot_host] = robot_file_parser
+ else:
+ robot_file_map[robot_host] = None
+ return robot_file_map[robot_host]
+
+
def crawl_url(url):
u = urlparse(url, 'gemini')
url = urlunparse(u)
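The get_robots_file() helper above leans on urllib.robotparser. A self-contained sketch of that usage, with a made-up robots.txt body standing in for the fetched response; note that RobotFileParser.parse() expects an iterable of lines, so a fetched body is normally split first:

from urllib import robotparser

# Illustrative robots.txt body; in the crawler this text would come from gemini.fetch().
robots_txt = (
    "User-agent: gus\n"
    "Disallow: /private/\n"
    "\n"
    "User-agent: *\n"
    "Disallow: /tmp/\n"
)

parser = robotparser.RobotFileParser()
parser.parse(robots_txt.splitlines())  # parse() takes lines, not a single string

print(parser.can_fetch("gus", "gemini://example.org/private/x.gmi"))  # False
print(parser.can_fetch("gus", "gemini://example.org/docs/x.gmi"))     # True
print(parser.can_fetch("*", "gemini://example.org/tmp/x.gmi"))        # False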
@@ -85,6 +102,16 @@ def crawl_url(url):
print("--------------------------")
return
normalized_url = normalize_gemini_url(url)
+ if normalized_url.startswith("gemini://example.org"):
+ return
+ robots_file = get_robots_file(normalized_url)
+ if robots_file is not None:
+ can_fetch = robots_file.can_fetch("gus", normalized_url) and robots_file.can_fetch("*", normalized_url)
+ if not can_fetch:
+ print("ROBOTS SKIP : %s" % url)
+ print("--------------------------")
+ return
+
if normalized_url in visited_urls:
print("ALREADY SEEN : %s" % url)
print("--------------------------")
@@ -114,7 +141,7 @@ def crawl_url(url):
else:
# input, error, etc (all other statuses)
print("UNHANDLED : %s" % url)
- print("--------------------------")
+ print("--------------------------")
def main():
@@ -122,8 +149,10 @@ def main():
index_writer = create_index(INDEX_DIR)
global visited_urls
visited_urls = []
+ global robot_file_map
+ robot_file_map = {}
try:
- for url in KNOWN_URLS:
+ for url in SEED_URLS:
crawl_url(url)
finally:
index_writer.commit()
diff --git a/gus/serve.py b/gus/serve.py
@@ -32,11 +32,10 @@ def _render_footer():
def index(request):
data = _render_header()
data.extend([
- "Welcome to GUS, the Gemini search engine!",
+ "Welcome to GUS, a Gemini search engine!",
"",
"=> /search Search GUS",
- "=> /known-servers List of known Gemini servers",
- "=> /about About the indexer",
+ "=> /about About GUS",
"=> gemini://gemini.circumlunar.space Gemini Project information"
])
data.extend(_render_footer())
@@ -56,14 +55,17 @@ def index(request):
"directory such that a request for \"robots.txt\" will",
"fetch it.",
"",
- "GUS obeys User-agent of \"gus\" and \"*\"."
+ "GUS obeys User-agent of \"gus\" and \"*\".",
+ "",
+ "If you encounter problems with GUS, or have ideas",
+ "for its future, please email me at natpen@natpen.net.",
])
data.extend(_render_footer())
return Response(Status.SUCCESS, "text/gemini", "\n".join(data))
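To make the instructions above concrete, a capsule operator might serve a robots.txt like the following from the capsule root (the paths are only examples); because the crawler checks both the "gus" and "*" groups, a rule in either one is enough to exclude a path:

User-agent: gus
Disallow: /logs/

User-agent: *
Disallow: /cgi-bin/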
def _search_index(query):
- query = QueryParser("content", ix.schema).parse(query)
+ query = QueryParser("content", ix.schema).parse(query)
results = searcher.search(query)
return (
len(results),
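For context around the _search_index() tweak, a minimal sketch of the whoosh query path it sits in, assuming the "index" directory written by crawl.py and a schema with a stored url field:

from whoosh.index import open_dir
from whoosh.qparser import QueryParser

ix = open_dir("index")  # INDEX_DIR used by crawl.py
with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema).parse("gemini")
    results = searcher.search(query)
    for hit in results:
        print(hit["url"])  # assumes url is stored in the schema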
@@ -80,7 +82,7 @@ def _render_results(results):
data.append("=> {}".format(result[1]))
return data
-
+
def _render_results_header(query, num_results):
return [
"| You searched for: \"{}\"".format(query),