geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git

commit 9ffc427a6c46bafe30d57669b581118139b9e448
parent 8bcf71965e48a9954bf3efdba115b084a1062f5d
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Mon, 18 May 2020 08:57:27 -0400

[serve] Crawl and index seed requests immediately

Diffstat:
M README.md         |  3 +++
M gus/crawl.py      | 51 +++++++++++++++++++++++++++++++++++++--------------
M gus/lib/gemini.py | 14 ++++++++++++--
M gus/serve.py      | 15 +++++++++++++--
4 files changed, 65 insertions(+), 18 deletions(-)

diff --git a/README.md b/README.md
@@ -49,3 +49,6 @@ as this guide to [mailing list etiquette](https://man.sr.ht/lists.sr.ht/etiquett
   add a TODO to refactor the extract_gemini_links function to exclude any
   links found within such a block.
 - **track number of inbound links**
+
+
+TODO: note: statistics are broken with incremental crawls... specifically page count
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -173,7 +173,10 @@ def get_robots_file(robot_host):
     if robot_host not in robot_file_map:
         print("Requesting robots.txt for {}".format(robot_host))
         robot_url = urljoin("gemini://{}".format(robot_host), "/robots.txt")
-        r = gemini.fetch(robot_url)
+        try:
+            r = gemini.fetch(robot_url)
+        except Exception:
+            r = None
         if r is not None and r.status.startswith("2"):
             robot_file_parser = robotparser.RobotFileParser()
             robot_file_parser.parse(r.content)
@@ -190,7 +193,7 @@ def crawl(gemini_resource):
     gr = gemini_resource
     for url_prefix in EXCLUDED_URL_PREFIXES:
         if gr.normalized_url.startswith(url_prefix):
-            print("MANUAL EXCLUSION SKIP : %s" % gr.raw_url)
+            print("MANUAL EXCLUSION SKIP : %s" % gr.fully_qualified_url)
             print("--------------------------")
             return
@@ -203,11 +206,11 @@ def crawl(gemini_resource):
         crawl_delay = max(robots_file.crawl_delay("*"), crawl_delay or 0) if robots_file.crawl_delay("*") else crawl_delay
         crawl_delay = max(robots_file.crawl_delay("indexer"), crawl_delay or 0) if robots_file.crawl_delay("indexer") else crawl_delay
         if not can_fetch:
-            print("ROBOTS SKIP : %s" % gr.raw_url)
+            print("ROBOTS SKIP : %s" % gr.fully_qualified_url)
             print("--------------------------")
             return
     if gr.normalized_url in visited_urls:
-        print("ALREADY SEEN : %s" % gr.raw_url)
+        print("ALREADY SEEN : %s" % gr.fully_qualified_url)
         print("--------------------------")
         return
     else:
@@ -228,19 +231,19 @@ def crawl(gemini_resource):
     if r is None:
         # problem before getting a response
-        print("ERROR : %s" % gr.raw_url)
+        print("ERROR : %s" % gr.fully_qualified_url)
         print("--------------------------")
         crawl_statistics["broken_url_count"] += 1
     elif r.status.startswith("3"):
         # redirect status
-        print("REDIRECT : %s -> %s" % (gr.raw_url, r.url))
+        print("REDIRECT : %s -> %s" % (gr.fully_qualified_url, r.url))
         # NB: this pop is necessary because if the redirect is a change to the URL
         # structure of, essentially, the same URL (e.g., like the addition or removal
         # of a trailing slash), then the crawl of the redirect would think it had
         # already seen this resource in visited_urls' normalized source of truth.
         visited_urls.pop()
         crawl_statistics["redirect_count"] += 1
-        # if is_nontrivial_redirect(gr.raw_url, r.url):
+        # if is_nontrivial_redirect(gr.fully_qualified_url, r.url):
         #     crawl_statistics["redirect_nontrivial_count"] += 1
         redirect_resource = GeminiResource(r.url)
         crawl(redirect_resource)
@@ -267,7 +270,7 @@ def crawl(gemini_resource):
         print("--------------------------")
     else:
         # input, error, etc (all other statuses)
-        print("UNHANDLED : %s" % gr.raw_url)
+        print("UNHANDLED : %s" % gr.fully_qualified_url)
         print("--------------------------")
@@ -283,13 +286,17 @@ def persist_visited_urls(visited_urls):
     pickle.dump( visited_urls, open( "visited_urls.p", "wb" ) )


-def main():
-    args = parse_args()
-    if args.should_run_destructive:
+def run_crawl(should_run_destructive=False, seed_urls=[]):
+    # TODO: track failed domain/page attempts, and don't reattempt for 15mins
+    if should_run_destructive:
+        # TODO: backup previous pickle instead of deleting (should be storing
+        # all crawl state together somewhere)
+        if Path("visited_urls.p").is_file():
+            os.remove("visited_urls.p")
         create_index(INDEX_DIR)
     global visited_urls
-    visited_urls = [] if args.should_run_destructive else load_visited_urls()
+    visited_urls = [] if should_run_destructive else load_visited_urls()
     global robot_file_map
     robot_file_map = {}
     global domain_hit_timings
@@ -302,13 +309,15 @@ def main():
         "redirect_nontrivial_count": 0,
         "broken_url_count": 0,
     }
-    seed_resources = [GeminiResource(url) for url in SEED_URLS]
+    seed_urls.extend(SEED_URLS)
+    seed_resources = [GeminiResource(url) for url in seed_urls]
     for resource in seed_resources:
         crawl(resource)
     persist_visited_urls(visited_urls)

     index_statistics = compute_index_statistics("index")
+    # TODO: move this printing to the stats module
     print("Page count: {}".format(index_statistics["page_count"]))
     print("Domain count: {}".format(index_statistics["domain_count"]))
     print("Domains: {}".format(index_statistics["domains"]))
@@ -323,6 +332,11 @@ def main():
     persist_statistics(index_statistics, crawl_statistics, "statistics.csv")


+def main():
+    args = parse_args()
+    run_crawl(args.should_run_destructive, seed_urls=args.seed_urls)
+
+
 def parse_args():
     parser = argparse.ArgumentParser(description='Crawl Geminispace.')
     parser.add_argument(
@@ -330,9 +344,18 @@ def parse_args():
         "-d",
         dest="should_run_destructive",
         action="store_true",
+        default=False,
         help="create a fresh index and perform a full Geminispace crawl",
     )
-    parser.set_defaults(should_run_destructive=False)
+    parser.add_argument(
+        "--seeds",
+        "-s",
+        metavar="URL",
+        dest="seed_urls",
+        nargs="+",
+        default=[],
+        help="one or more URLs with which to extend the seeds of the crawl",
+    )
     args = parser.parse_args()
     return args
diff --git a/gus/lib/gemini.py b/gus/lib/gemini.py
@@ -31,7 +31,7 @@ def urlsplit_featureful(url, parent_resource=None):
             # process relative link
             if parent_resource is None:
                 return None
-            joined = urljoin(parent_resource.normalized_host, url)
+            joined = urljoin("gemini://{}".format(parent_resource.normalized_host), url)
             u = urlsplit(joined, 'gemini')
         else: # url does not start with /
             # could be: blah.com/test
@@ -59,6 +59,7 @@ class GeminiResource():
         self.is_valid = self.urlsplit is not None
         self._normalized_url = None
         self._normalized_host = None
+        self._fully_qualified_url = None
         self.contained_resources = None
@@ -78,14 +79,23 @@ class GeminiResource():
         return self._normalized_host

+    def _get_fully_qualified_url(self):
+        if self.urlsplit is None:
+            return None
+        if self._fully_qualified_url is None:
+            self._fully_qualified_url = self.raw_url if not self.raw_url.startswith("/") else self.normalized_url
+        return self._fully_qualified_url
+
+
     normalized_url = property(_get_normalized_url)
     normalized_host = property(_get_normalized_host)
+    fully_qualified_url = property(_get_fully_qualified_url)

     def fetch(self):
         # NB: this intentionally does NOT fetch the normalized URL, because that could
         # cause an infinite loop with, e.g., normalization stripping a trailing slash
         # and a server redirecting to the same URL _with_ a trailing slash.
-        response = gusmobile.fetch(self.raw_url)
+        response = gusmobile.fetch(self.fully_qualified_url)
         self.response = response
         return self.response
diff --git a/gus/serve.py b/gus/serve.py
@@ -6,7 +6,9 @@ from datetime import datetime
 import math
 import os
 import re
+from subprocess import call
 import sys
+import threading

 import jetforce
 from jetforce import Response, Status
@@ -14,6 +16,7 @@ from whoosh.index import open_dir
 from whoosh.qparser import MultifieldParser

 from gus.lib.index_statistics import load_last_statistics_from_file
+from gus.crawl import run_crawl

 last_statistics = load_last_statistics_from_file("statistics.csv")
 app = jetforce.JetforceApplication()
@@ -328,17 +331,25 @@ def search(request):
         return Response(Status.INPUT, "Search query")


+def crawl_seed_and_restart(seed_url):
+    run_crawl(should_run_destructive=False, seed_urls=[seed_url])
+    call(["sudo", "systemctl", "restart", "gus.service"])
+
+
 @app.route("/add-seed")
 def add_seed(request):
     data = _render_header()
     if request.query:
         with open("seed-requests.txt", "a") as seed_file:
             seed_file.write("{}\n".format(request.query))
+        crawl_thread = threading.Thread(name="crawl_thread",
+                                        target=crawl_seed_and_restart,
+                                        args=(request.query,))
+        crawl_thread.start()
         data.extend([
             "",
-            "Thank you for the addition! GUS will attempt to crawl the following URL the next time the index is built.",
+            "Thank you for the addition! GUS is crawling and indexing this URL ({}) now, and it will show up in GUS search results as soon as the indexing completes.".format(request.query),
             "",
-            request.query
         ])
         return Response(Status.SUCCESS, "text/gemini", "\n".join(data))
     else:
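For reference, a minimal sketch (not part of this commit) of how the two new entry points could be exercised; the seed URL and the module-style command below are illustrative assumptions, not taken from the diff:

    # Programmatic use, mirroring what serve.py's crawl_seed_and_restart() now does:
    # run a non-destructive crawl seeded with one extra URL (example URL is hypothetical).
    from gus.crawl import run_crawl

    run_crawl(should_run_destructive=False, seed_urls=["gemini://example.org/"])

    # Command-line equivalent via the new --seeds/-s flag, assuming gus/crawl.py
    # is run as a module with a __main__ guard that calls main():
    #   python -m gus.crawl --seeds gemini://example.org/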