geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git

commit 80ad98435233fab6ae7642bb603353d01b577f2b
parent b8f73c5617bd502d6d952b2714eee404d8132eca
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Fri, 21 Feb 2020 08:44:01 -0500

Respect robots.txt

Diffstat:
M gus/crawl.py | 37 +++++++++++++++++++++++++++++++++----
M gus/serve.py | 14 ++++++++------
2 files changed, 41 insertions(+), 10 deletions(-)

diff --git a/gus/crawl.py b/gus/crawl.py
@@ -1,6 +1,7 @@
 import pathlib
 import re
 import shutil
+from urllib import robotparser
 from urllib.parse import urlparse, urlunparse
 
 import gusmobile as gemini
@@ -9,7 +10,7 @@ from whoosh.index import create_in
 
 INDEX_DIR = "index"
 
-KNOWN_URLS = [
+SEED_URLS = [
     "gemini://gemini.conman.org/",
     "gemini://zaibatsu.circumlunar.space:1965",
     "gemini://carcosa.net:1965",
@@ -72,10 +73,26 @@ def index_content(response):
     print("INDEXING...")
     index_writer.add_document(
         url=response.url,
-        content=response.content,
+        content=response.url + " " + response.content,
     )
 
 
+def get_robots_file(url):
+    robot_host = url.split(":1965", 1)[0] + ":1965"
+    if robot_host not in robot_file_map:
+        print(f"Requesting robots.txt for {robot_host}")
+        robot_url = robot_host + "/robots.txt"
+        r = gemini.fetch(robot_url)
+        if r is not None and r.status.startswith("2"):
+            robot_file_parser = robotparser.RobotFileParser()
+            robot_file_parser.parse(r.content)
+            print(robot_file_parser.site_maps())
+            robot_file_map[robot_host] = robot_file_parser
+        else:
+            robot_file_map[robot_host] = None
+    return robot_file_map[robot_host]
+
+
 def crawl_url(url):
     u = urlparse(url, 'gemini')
     url = urlunparse(u)
@@ -85,6 +102,16 @@ def crawl_url(url):
         print("--------------------------")
         return
     normalized_url = normalize_gemini_url(url)
+    if normalized_url.startswith("gemini://example.org"):
+        return
+    robots_file = get_robots_file(normalized_url)
+    if robots_file is not None:
+        can_fetch = robots_file.can_fetch("gus", normalized_url) and robots_file.can_fetch("*", normalized_url)
+        if not can_fetch:
+            print("ROBOTS SKIP : %s" % url)
+            print("--------------------------")
+            return
+
     if normalized_url in visited_urls:
         print("ALREADY SEEN : %s" % url)
         print("--------------------------")
@@ -114,7 +141,7 @@ def crawl_url(url):
     else:
         # input, error, etc (all other statuses)
         print("UNHANDLED : %s" % url)
-        print("--------------------------")
+        print("--------------------------")
 
 
 def main():
@@ -122,8 +149,10 @@ def main():
    index_writer = create_index(INDEX_DIR)
    global visited_urls
    visited_urls = []
+   global robot_file_map
+   robot_file_map = {}
    try:
-       for url in KNOWN_URLS:
+       for url in SEED_URLS:
            crawl_url(url)
    finally:
        index_writer.commit()
diff --git a/gus/serve.py b/gus/serve.py
@@ -32,11 +32,10 @@ def _render_footer():
 def index(request):
     data = _render_header()
     data.extend([
-        "Welcome to GUS, the Gemini search engine!",
+        "Welcome to GUS, a Gemini search engine!",
         "",
         "=> /search Search GUS",
-        "=> /known-servers List of known Gemini servers",
-        "=> /about About the indexer",
+        "=> /about About GUS",
         "=> gemini://gemini.circumlunar.space Gemini Project information"
     ])
     data.extend(_render_footer())
@@ -56,14 +55,17 @@ def index(request):
         "directory such that a request for \"robots.txt\" will",
         "fetch it.",
         "",
-        "GUS obeys User-agent of \"gus\" and \"*\"."
+        "GUS obeys User-agent of \"gus\" and \"*\".",
+        "",
+        "If you encounter problems with GUS, or have ideas",
+        "for its future, please email me at natpen@natpen.net.",
     ])
     data.extend(_render_footer())
     return Response(Status.SUCCESS, "text/gemini", "\n".join(data))
 
 
 def _search_index(query):
-    query = QueryParser("content", ix.schema).parse(query)
+    query = QueryParser("content", ix.schema).parse(query)
     results = searcher.search(query)
     return (
         len(results),
@@ -80,7 +82,7 @@ def _render_results(results):
         data.append("=> {}".format(result[1]))
     return data
 
-    
+
 def _render_results_header(query, num_results):
     return [
         "| You searched for: \"{}\"".format(query),