commit 9ffc427a6c46bafe30d57669b581118139b9e448
parent 8bcf71965e48a9954bf3efdba115b084a1062f5d
Author: Natalie Pendragon <natpen@natpen.net>
Date: Mon, 18 May 2020 08:57:27 -0400
[serve] Crawl and index seed requests immediately
Diffstat:
4 files changed, 65 insertions(+), 18 deletions(-)
diff --git a/README.md b/README.md
@@ -49,3 +49,6 @@ as this guide to [mailing list etiquette](https://man.sr.ht/lists.sr.ht/etiquett
   add a TODO to refactor the extract_gemini_links function to
   exclude any links found within such a block.
 - **track number of inbound links**
+
+
+TODO: statistics are broken with incremental crawls (specifically, page count)
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -173,7 +173,10 @@ def get_robots_file(robot_host):
     if robot_host not in robot_file_map:
         print("Requesting robots.txt for {}".format(robot_host))
         robot_url = urljoin("gemini://{}".format(robot_host), "/robots.txt")
-        r = gemini.fetch(robot_url)
+        try:
+            r = gemini.fetch(robot_url)
+        except Exception:
+            r = None
         if r is not None and r.status.startswith("2"):
             robot_file_parser = robotparser.RobotFileParser()
             robot_file_parser.parse(r.content)
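A note on the pattern above: gemini.fetch can raise on connection failures, so a failed fetch is now treated the same as "no robots.txt". For context, a minimal sketch of how the parsed file gets consulted later in crawl(), using Python's stdlib urllib.robotparser and a hypothetical capsule:

    from urllib import robotparser

    rfp = robotparser.RobotFileParser()
    # parse() takes an iterable of robots.txt lines
    rfp.parse([
        "User-agent: indexer",
        "Crawl-delay: 10",
        "Disallow: /private/",
    ])
    rfp.can_fetch("indexer", "gemini://example.org/private/x")  # -> False
    rfp.crawl_delay("indexer")                                  # -> 10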
@@ -190,7 +193,7 @@ def crawl(gemini_resource):
     gr = gemini_resource
     for url_prefix in EXCLUDED_URL_PREFIXES:
         if gr.normalized_url.startswith(url_prefix):
-            print("MANUAL EXCLUSION SKIP : %s" % gr.raw_url)
+            print("MANUAL EXCLUSION SKIP : %s" % gr.fully_qualified_url)
             print("--------------------------")
             return
@@ -203,11 +206,11 @@ def crawl(gemini_resource):
         crawl_delay = max(robots_file.crawl_delay("*"), crawl_delay or 0) if robots_file.crawl_delay("*") else crawl_delay
         crawl_delay = max(robots_file.crawl_delay("indexer"), crawl_delay or 0) if robots_file.crawl_delay("indexer") else crawl_delay
     if not can_fetch:
-        print("ROBOTS SKIP : %s" % gr.raw_url)
+        print("ROBOTS SKIP : %s" % gr.fully_qualified_url)
         print("--------------------------")
         return
     if gr.normalized_url in visited_urls:
-        print("ALREADY SEEN : %s" % gr.raw_url)
+        print("ALREADY SEEN : %s" % gr.fully_qualified_url)
         print("--------------------------")
         return
     else:
@@ -228,19 +231,19 @@ def crawl(gemini_resource):
     if r is None:
         # problem before getting a response
-        print("ERROR : %s" % gr.raw_url)
+        print("ERROR : %s" % gr.fully_qualified_url)
         print("--------------------------")
         crawl_statistics["broken_url_count"] += 1
     elif r.status.startswith("3"):
         # redirect status
-        print("REDIRECT : %s -> %s" % (gr.raw_url, r.url))
+        print("REDIRECT : %s -> %s" % (gr.fully_qualified_url, r.url))
         # NB: this pop is necessary because if the redirect is a change to the URL
         # structure of, essentially, the same URL (e.g., the addition or removal
         # of a trailing slash), then the crawl of the redirect would think it had
         # already seen this resource in visited_urls' normalized source of truth.
         visited_urls.pop()
         crawl_statistics["redirect_count"] += 1
-        # if is_nontrivial_redirect(gr.raw_url, r.url):
+        # if is_nontrivial_redirect(gr.fully_qualified_url, r.url):
         #     crawl_statistics["redirect_nontrivial_count"] += 1
         redirect_resource = GeminiResource(r.url)
         crawl(redirect_resource)
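The is_nontrivial_redirect helper referenced in the commented-out lines is not part of this diff; if it were revived, a minimal sketch consistent with the NB comment above (a trailing-slash-only change is trivial) might look like:

    def is_nontrivial_redirect(url, redirect_url):
        # hypothetical sketch: treat a redirect as trivial when the two
        # URLs differ only by a trailing slash
        return url.rstrip("/") != redirect_url.rstrip("/")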
@@ -267,7 +270,7 @@ def crawl(gemini_resource):
         print("--------------------------")
     else:
         # input, error, etc (all other statuses)
-        print("UNHANDLED : %s" % gr.raw_url)
+        print("UNHANDLED : %s" % gr.fully_qualified_url)
         print("--------------------------")
@@ -283,13 +286,17 @@ def persist_visited_urls(visited_urls):
     pickle.dump( visited_urls, open( "visited_urls.p", "wb" ) )


-def main():
-    args = parse_args()
-    if args.should_run_destructive:
+def run_crawl(should_run_destructive=False, seed_urls=None):
+    # TODO: track failed domain/page attempts, and don't reattempt for 15mins
+    if should_run_destructive:
+        # TODO: back up the previous pickle instead of deleting it (all crawl
+        # state should be stored together somewhere)
+        if Path("visited_urls.p").is_file():
+            os.remove("visited_urls.p")
         create_index(INDEX_DIR)
     global visited_urls
-    visited_urls = [] if args.should_run_destructive else load_visited_urls()
+    visited_urls = [] if should_run_destructive else load_visited_urls()
     global robot_file_map
     robot_file_map = {}
     global domain_hit_timings
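With main() split apart like this, a crawl can be triggered from other code as well; gus/serve.py below does essentially the following (seed URL hypothetical). Note that a default of seed_urls=None, normalized inside the function, avoids the classic Python pitfall of mutating a shared default list across calls:

    from gus.crawl import run_crawl

    run_crawl(should_run_destructive=False, seed_urls=["gemini://example.org/"])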
@@ -302,13 +309,15 @@ def main():
         "redirect_nontrivial_count": 0,
         "broken_url_count": 0,
     }
-    seed_resources = [GeminiResource(url) for url in SEED_URLS]
+    seed_urls = list(seed_urls or []) + SEED_URLS
+    seed_resources = [GeminiResource(url) for url in seed_urls]
     for resource in seed_resources:
         crawl(resource)

     persist_visited_urls(visited_urls)

    index_statistics = compute_index_statistics("index")
+    # TODO: move this printing to the stats module
     print("Page count: {}".format(index_statistics["page_count"]))
     print("Domain count: {}".format(index_statistics["domain_count"]))
     print("Domains: {}".format(index_statistics["domains"]))
@@ -323,6 +332,11 @@ def main():
     persist_statistics(index_statistics, crawl_statistics, "statistics.csv")


+def main():
+    args = parse_args()
+    run_crawl(args.should_run_destructive, seed_urls=args.seed_urls)
+
+
 def parse_args():
     parser = argparse.ArgumentParser(description='Crawl Geminispace.')
     parser.add_argument(
@@ -330,9 +344,18 @@ def parse_args():
         "-d",
         dest="should_run_destructive",
         action="store_true",
+        default=False,
         help="create a fresh index and perform a full Geminispace crawl",
     )
-    parser.set_defaults(should_run_destructive=False)
+    parser.add_argument(
+        "--seeds",
+        "-s",
+        metavar="URL",
+        dest="seed_urls",
+        nargs="+",
+        default=[],
+        help="one or more URLs with which to extend the seeds of the crawl",
+    )

     args = parser.parse_args()
     return args
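For illustration, extending a non-destructive crawl with an extra seed from the command line might look like this (URL hypothetical; assumes the crawler is runnable as a module):

    python -m gus.crawl --seeds gemini://example.org/
    python -m gus.crawl -d    # destructive: fresh index, full Geminispace crawl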
diff --git a/gus/lib/gemini.py b/gus/lib/gemini.py
@@ -31,7 +31,7 @@ def urlsplit_featureful(url, parent_resource=None):
             # process relative link
             if parent_resource is None:
                 return None
-            joined = urljoin(parent_resource.normalized_host, url)
+            joined = urljoin("gemini://{}".format(parent_resource.normalized_host), url)
             u = urlsplit(joined, 'gemini')
         else: # url does not start with /
             # could be: blah.com/test
@@ -59,6 +59,7 @@ class GeminiResource():
         self.is_valid = self.urlsplit is not None
         self._normalized_url = None
         self._normalized_host = None
+        self._fully_qualified_url = None
         self.contained_resources = None
@@ -78,14 +79,23 @@ class GeminiResource():
         return self._normalized_host

+    def _get_fully_qualified_url(self):
+        if self.urlsplit is None:
+            return None
+        if self._fully_qualified_url is None:
+            self._fully_qualified_url = self.raw_url if not self.raw_url.startswith("/") else self.normalized_url
+        return self._fully_qualified_url
+
+
     normalized_url = property(_get_normalized_url)
     normalized_host = property(_get_normalized_host)
+    fully_qualified_url = property(_get_fully_qualified_url)

     def fetch(self):
         # NB: this intentionally does NOT fetch the normalized URL, because that could
         # cause an infinite loop with, e.g., normalization stripping a trailing slash
         # and a server redirecting to the same URL _with_ a trailing slash.
-        response = gusmobile.fetch(self.raw_url)
+        response = gusmobile.fetch(self.fully_qualified_url)
         self.response = response
         return self.response
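To make the intent concrete, a quick sketch of the two URL forms (values illustrative; normalization stripping a trailing slash is exactly the case the fetch() comment describes):

    r = GeminiResource("gemini://example.org/docs/")
    r.fully_qualified_url  # "gemini://example.org/docs/" -- raw URL, kept as-is
    r.normalized_url       # may strip the trailing slash, for deduplication

    # for a relative raw_url like "/docs/", fully_qualified_url falls back
    # to the absolute normalized_url instead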
diff --git a/gus/serve.py b/gus/serve.py
@@ -6,7 +6,9 @@ from datetime import datetime
 import math
 import os
 import re
+from subprocess import call
 import sys
+import threading

 import jetforce
 from jetforce import Response, Status
@@ -14,6 +16,7 @@ from whoosh.index import open_dir
 from whoosh.qparser import MultifieldParser

 from gus.lib.index_statistics import load_last_statistics_from_file
+from gus.crawl import run_crawl

 last_statistics = load_last_statistics_from_file("statistics.csv")
 app = jetforce.JetforceApplication()
@@ -328,17 +331,25 @@ def search(request):
     return Response(Status.INPUT, "Search query")


+def crawl_seed_and_restart(seed_url):
+    run_crawl(should_run_destructive=False, seed_urls=[seed_url])
+    call(["sudo", "systemctl", "restart", "gus.service"])
+
+
 @app.route("/add-seed")
 def add_seed(request):
     data = _render_header()
     if request.query:
         with open("seed-requests.txt", "a") as seed_file:
             seed_file.write("{}\n".format(request.query))
+        crawl_thread = threading.Thread(name="crawl_thread",
+                                        target=crawl_seed_and_restart,
+                                        args=(request.query,))
+        crawl_thread.start()
         data.extend([
             "",
-            "Thank you for the addition! GUS will attempt to crawl the following URL the next time the index is built.",
+            "Thank you for the addition! GUS is crawling and indexing this URL ({}) now, and it will show up in search results as soon as indexing completes.".format(request.query),
             "",
-            request.query
         ])
         return Response(Status.SUCCESS, "text/gemini", "\n".join(data))
     else:
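One caveat with the new background crawl: run_crawl mutates module-level state in gus.crawl (visited_urls, robot_file_map, domain_hit_timings) and writes to the on-disk index, so two overlapping /add-seed requests could race. A minimal sketch of one way to serialize them, using a hypothetical module-level lock (not part of this change):

    crawl_lock = threading.Lock()  # hypothetical guard

    def crawl_seed_and_restart(seed_url):
        # serialize crawls so concurrent seed requests don't interleave
        # writes to the shared crawl state and the index
        with crawl_lock:
            run_crawl(should_run_destructive=False, seed_urls=[seed_url])
        call(["sudo", "systemctl", "restart", "gus.service"])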