geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git

commit 80ad98435233fab6ae7642bb603353d01b577f2b
parent b8f73c5617bd502d6d952b2714eee404d8132eca
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Fri, 21 Feb 2020 08:44:01 -0500

Respect robots.txt

Diffstat:
M gus/crawl.py | 37 +++++++++++++++++++++++++++++++++----
M gus/serve.py | 14 ++++++++------
2 files changed, 41 insertions(+), 10 deletions(-)

diff --git a/gus/crawl.py b/gus/crawl.py
@@ -1,6 +1,7 @@
 import pathlib
 import re
 import shutil
+from urllib import robotparser
 from urllib.parse import urlparse, urlunparse
 
 import gusmobile as gemini
@@ -9,7 +10,7 @@ from whoosh.index import create_in
 
 INDEX_DIR = "index"
 
-KNOWN_URLS = [
+SEED_URLS = [
     "gemini://gemini.conman.org/",
     "gemini://zaibatsu.circumlunar.space:1965",
     "gemini://carcosa.net:1965",
@@ -72,10 +73,26 @@ def index_content(response):
     print("INDEXING...")
     index_writer.add_document(
         url=response.url,
-        content=response.content,
+        content=response.url + " " + response.content,
     )
 
 
+def get_robots_file(url):
+    robot_host = url.split(":1965", 1)[0] + ":1965"
+    if robot_host not in robot_file_map:
+        print(f"Requesting robots.txt for {robot_host}")
+        robot_url = robot_host + "/robots.txt"
+        r = gemini.fetch(robot_url)
+        if r is not None and r.status.startswith("2"):
+            robot_file_parser = robotparser.RobotFileParser()
+            robot_file_parser.parse(r.content)
+            print(robot_file_parser.site_maps())
+            robot_file_map[robot_host] = robot_file_parser
+        else:
+            robot_file_map[robot_host] = None
+    return robot_file_map[robot_host]
+
+
 def crawl_url(url):
     u = urlparse(url, 'gemini')
     url = urlunparse(u)
@@ -85,6 +102,16 @@ def crawl_url(url):
         print("--------------------------")
         return
     normalized_url = normalize_gemini_url(url)
+    if normalized_url.startswith("gemini://example.org"):
+        return
+    robots_file = get_robots_file(normalized_url)
+    if robots_file is not None:
+        can_fetch = robots_file.can_fetch("gus", normalized_url) and robots_file.can_fetch("*", normalized_url)
+        if not can_fetch:
+            print("ROBOTS SKIP : %s" % url)
+            print("--------------------------")
+            return
+
     if normalized_url in visited_urls:
         print("ALREADY SEEN : %s" % url)
         print("--------------------------")
@@ -114,7 +141,7 @@ def crawl_url(url):
     else:
         # input, error, etc (all other statuses)
         print("UNHANDLED : %s" % url)
-        print("--------------------------")
+        print("--------------------------")
 
 
 def main():
@@ -122,8 +149,10 @@ def main():
    index_writer = create_index(INDEX_DIR)
    global visited_urls
    visited_urls = []
+   global robot_file_map
+   robot_file_map = {}
    try:
-       for url in KNOWN_URLS:
+       for url in SEED_URLS:
            crawl_url(url)
    finally:
        index_writer.commit()
diff --git a/gus/serve.py b/gus/serve.py
@@ -32,11 +32,10 @@ def _render_footer():
 def index(request):
     data = _render_header()
     data.extend([
-        "Welcome to GUS, the Gemini search engine!",
+        "Welcome to GUS, a Gemini search engine!",
         "",
         "=> /search Search GUS",
-        "=> /known-servers List of known Gemini servers",
-        "=> /about About the indexer",
+        "=> /about About GUS",
         "=> gemini://gemini.circumlunar.space Gemini Project information"
     ])
     data.extend(_render_footer())
@@ -56,14 +55,17 @@ def index(request):
         "directory such that a request for \"robots.txt\" will",
         "fetch it.",
         "",
-        "GUS obeys User-agent of \"gus\" and \"*\"."
+        "GUS obeys User-agent of \"gus\" and \"*\".",
+        "",
+        "If you encounter problems with GUS, or have ideas",
+        "for its future, please email me at natpen@natpen.net.",
     ])
     data.extend(_render_footer())
     return Response(Status.SUCCESS, "text/gemini", "\n".join(data))
 
 
 def _search_index(query):
-    query = QueryParser("content", ix.schema).parse(query)
+    query = QueryParser("content", ix.schema).parse(query)
     results = searcher.search(query)
     return (
         len(results),
@@ -80,7 +82,7 @@ def _render_results(results):
         data.append("=> {}".format(result[1]))
     return data
 
-    
+
 def _render_results_header(query, num_results):
     return [
         "| You searched for: \"{}\"".format(query),