commit 9ffc427a6c46bafe30d57669b581118139b9e448
parent 8bcf71965e48a9954bf3efdba115b084a1062f5d
Author: Natalie Pendragon <natpen@natpen.net>
Date: Mon, 18 May 2020 08:57:27 -0400
[serve] Crawl and index seed requests immediately
Diffstat:
4 files changed, 65 insertions(+), 18 deletions(-)
diff --git a/README.md b/README.md
@@ -49,3 +49,6 @@ as this guide to [mailing list etiquette](https://man.sr.ht/lists.sr.ht/etiquett
   add a TODO to refactor the extract_gemini_links function to
   exclude any links found within such a block.
 - **track number of inbound links**
+
+
+TODO: statistics are broken with incremental crawls (specifically, page count)
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -173,7 +173,10 @@ def get_robots_file(robot_host):
     if robot_host not in robot_file_map:
         print("Requesting robots.txt for {}".format(robot_host))
         robot_url = urljoin("gemini://{}".format(robot_host), "/robots.txt")
-        r = gemini.fetch(robot_url)
+        try:
+            r = gemini.fetch(robot_url)
+        except Exception:
+            r = None
         if r is not None and r.status.startswith("2"):
             robot_file_parser = robotparser.RobotFileParser()
             robot_file_parser.parse(r.content)
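A note on the pattern above: gemini.fetch can raise on connection failures, so a failed fetch is now treated the same as "no robots.txt". For context, a minimal sketch of how the parsed file gets consulted later in crawl(), using Python's stdlib urllib.robotparser and a hypothetical capsule:

    from urllib import robotparser

    rfp = robotparser.RobotFileParser()
    # parse() takes an iterable of robots.txt lines
    rfp.parse([
        "User-agent: indexer",
        "Crawl-delay: 10",
        "Disallow: /private/",
    ])
    rfp.can_fetch("indexer", "gemini://example.org/private/x")  # -> False
    rfp.crawl_delay("indexer")                                  # -> 10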
@@ -190,7 +193,7 @@ def crawl(gemini_resource):
     gr = gemini_resource
     for url_prefix in EXCLUDED_URL_PREFIXES:
         if gr.normalized_url.startswith(url_prefix):
-            print("MANUAL EXCLUSION SKIP : %s" % gr.raw_url)
+            print("MANUAL EXCLUSION SKIP : %s" % gr.fully_qualified_url)
             print("--------------------------")
             return
@@ -203,11 +206,11 @@ def crawl(gemini_resource):
         crawl_delay = max(robots_file.crawl_delay("*"), crawl_delay or 0) if robots_file.crawl_delay("*") else crawl_delay
         crawl_delay = max(robots_file.crawl_delay("indexer"), crawl_delay or 0) if robots_file.crawl_delay("indexer") else crawl_delay
     if not can_fetch:
-        print("ROBOTS SKIP : %s" % gr.raw_url)
+        print("ROBOTS SKIP : %s" % gr.fully_qualified_url)
         print("--------------------------")
         return
     if gr.normalized_url in visited_urls:
-        print("ALREADY SEEN : %s" % gr.raw_url)
+        print("ALREADY SEEN : %s" % gr.fully_qualified_url)
         print("--------------------------")
         return
     else:
@@ -228,19 +231,19 @@ def crawl(gemini_resource):
     if r is None:
         # problem before getting a response
-        print("ERROR : %s" % gr.raw_url)
+        print("ERROR : %s" % gr.fully_qualified_url)
         print("--------------------------")
         crawl_statistics["broken_url_count"] += 1
     elif r.status.startswith("3"):
         # redirect status
-        print("REDIRECT : %s -> %s" % (gr.raw_url, r.url))
+        print("REDIRECT : %s -> %s" % (gr.fully_qualified_url, r.url))
         # NB: this pop is necessary because if the redirect is a change to the URL
         # structure of, essentially, the same URL (e.g., the addition or removal
         # of a trailing slash), then the crawl of the redirect would think it had
         # already seen this resource in visited_urls' normalized source of truth.
         visited_urls.pop()
         crawl_statistics["redirect_count"] += 1
-        # if is_nontrivial_redirect(gr.raw_url, r.url):
+        # if is_nontrivial_redirect(gr.fully_qualified_url, r.url):
         #     crawl_statistics["redirect_nontrivial_count"] += 1
         redirect_resource = GeminiResource(r.url)
         crawl(redirect_resource)
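The is_nontrivial_redirect helper referenced in the commented-out lines is not part of this diff; if it were revived, a minimal sketch consistent with the NB comment above (a trailing-slash-only change is trivial) might look like:

    def is_nontrivial_redirect(url, redirect_url):
        # hypothetical sketch: treat a redirect as trivial when the two
        # URLs differ only by a trailing slash
        return url.rstrip("/") != redirect_url.rstrip("/")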
@@ -267,7 +270,7 @@ def crawl(gemini_resource):
         print("--------------------------")
     else:
         # input, error, etc (all other statuses)
-        print("UNHANDLED : %s" % gr.raw_url)
+        print("UNHANDLED : %s" % gr.fully_qualified_url)
         print("--------------------------")
@@ -283,13 +286,17 @@ def persist_visited_urls(visited_urls):
     pickle.dump( visited_urls, open( "visited_urls.p", "wb" ) )


-def main():
-    args = parse_args()
-    if args.should_run_destructive:
+def run_crawl(should_run_destructive=False, seed_urls=None):
+    # TODO: track failed domain/page attempts, and don't reattempt for 15mins
+    if should_run_destructive:
+        # TODO: back up the previous pickle instead of deleting it (all crawl
+        # state should be stored together somewhere)
+        if Path("visited_urls.p").is_file():
+            os.remove("visited_urls.p")
         create_index(INDEX_DIR)
     global visited_urls
-    visited_urls = [] if args.should_run_destructive else load_visited_urls()
+    visited_urls = [] if should_run_destructive else load_visited_urls()
     global robot_file_map
     robot_file_map = {}
     global domain_hit_timings
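With main() split apart like this, a crawl can be triggered from other code as well; gus/serve.py below does essentially the following (seed URL hypothetical). Note that a default of seed_urls=None, normalized inside the function, avoids the classic Python pitfall of mutating a shared default list across calls:

    from gus.crawl import run_crawl

    run_crawl(should_run_destructive=False, seed_urls=["gemini://example.org/"])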
@@ -302,13 +309,15 @@ def main():
         "redirect_nontrivial_count": 0,
         "broken_url_count": 0,
     }
-    seed_resources = [GeminiResource(url) for url in SEED_URLS]
+    seed_urls = list(seed_urls or []) + SEED_URLS
+    seed_resources = [GeminiResource(url) for url in seed_urls]
     for resource in seed_resources:
         crawl(resource)

     persist_visited_urls(visited_urls)

    index_statistics = compute_index_statistics("index")
+    # TODO: move this printing to the stats module
     print("Page count: {}".format(index_statistics["page_count"]))
     print("Domain count: {}".format(index_statistics["domain_count"]))
     print("Domains: {}".format(index_statistics["domains"]))
@@ -323,6 +332,11 @@ def main():
     persist_statistics(index_statistics, crawl_statistics, "statistics.csv")


+def main():
+    args = parse_args()
+    run_crawl(args.should_run_destructive, seed_urls=args.seed_urls)
+
+
 def parse_args():
     parser = argparse.ArgumentParser(description='Crawl Geminispace.')
     parser.add_argument(
@@ -330,9 +344,18 @@ def parse_args():
         "-d",
         dest="should_run_destructive",
         action="store_true",
+        default=False,
         help="create a fresh index and perform a full Geminispace crawl",
     )
-    parser.set_defaults(should_run_destructive=False)
+    parser.add_argument(
+        "--seeds",
+        "-s",
+        metavar="URL",
+        dest="seed_urls",
+        nargs="+",
+        default=[],
+        help="one or more URLs with which to extend the seeds of the crawl",
+    )

     args = parser.parse_args()
     return args
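For illustration, extending a non-destructive crawl with an extra seed from the command line might look like this (URL hypothetical; assumes the crawler is runnable as a module):

    python -m gus.crawl --seeds gemini://example.org/
    python -m gus.crawl -d    # destructive: fresh index, full Geminispace crawl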
diff --git a/gus/lib/gemini.py b/gus/lib/gemini.py
@@ -31,7 +31,7 @@ def urlsplit_featureful(url, parent_resource=None):
             # process relative link
             if parent_resource is None:
                 return None
-            joined = urljoin(parent_resource.normalized_host, url)
+            joined = urljoin("gemini://{}".format(parent_resource.normalized_host), url)
             u = urlsplit(joined, 'gemini')
         else: # url does not start with /
             # could be: blah.com/test
@@ -59,6 +59,7 @@ class GeminiResource():
         self.is_valid = self.urlsplit is not None
         self._normalized_url = None
         self._normalized_host = None
+        self._fully_qualified_url = None
         self.contained_resources = None
@@ -78,14 +79,23 @@ class GeminiResource():
         return self._normalized_host

+    def _get_fully_qualified_url(self):
+        if self.urlsplit is None:
+            return None
+        if self._fully_qualified_url is None:
+            self._fully_qualified_url = self.raw_url if not self.raw_url.startswith("/") else self.normalized_url
+        return self._fully_qualified_url
+
+
     normalized_url = property(_get_normalized_url)
     normalized_host = property(_get_normalized_host)
+    fully_qualified_url = property(_get_fully_qualified_url)

     def fetch(self):
         # NB: this intentionally does NOT fetch the normalized URL, because that could
         # cause an infinite loop with, e.g., normalization stripping a trailing slash
         # and a server redirecting to the same URL _with_ a trailing slash.
-        response = gusmobile.fetch(self.raw_url)
+        response = gusmobile.fetch(self.fully_qualified_url)
         self.response = response
         return self.response
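To make the intent concrete, a quick sketch of the two URL forms (values illustrative; normalization stripping a trailing slash is exactly the case the fetch() comment describes):

    r = GeminiResource("gemini://example.org/docs/")
    r.fully_qualified_url  # "gemini://example.org/docs/" -- raw URL, kept as-is
    r.normalized_url       # may strip the trailing slash, for deduplication

    # for a relative raw_url like "/docs/", fully_qualified_url falls back
    # to the absolute normalized_url instead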
diff --git a/gus/serve.py b/gus/serve.py
@@ -6,7 +6,9 @@ from datetime import datetime
 import math
 import os
 import re
+from subprocess import call
 import sys
+import threading

 import jetforce
 from jetforce import Response, Status
@@ -14,6 +16,7 @@ from whoosh.index import open_dir
 from whoosh.qparser import MultifieldParser

 from gus.lib.index_statistics import load_last_statistics_from_file
+from gus.crawl import run_crawl

 last_statistics = load_last_statistics_from_file("statistics.csv")
 app = jetforce.JetforceApplication()
@@ -328,17 +331,25 @@ def search(request):
     return Response(Status.INPUT, "Search query")


+def crawl_seed_and_restart(seed_url):
+    run_crawl(should_run_destructive=False, seed_urls=[seed_url])
+    call(["sudo", "systemctl", "restart", "gus.service"])
+
+
 @app.route("/add-seed")
 def add_seed(request):
     data = _render_header()
     if request.query:
         with open("seed-requests.txt", "a") as seed_file:
             seed_file.write("{}\n".format(request.query))
+        crawl_thread = threading.Thread(name="crawl_thread",
+                                        target=crawl_seed_and_restart,
+                                        args=(request.query,))
+        crawl_thread.start()
         data.extend([
             "",
-            "Thank you for the addition! GUS will attempt to crawl the following URL the next time the index is built.",
+            "Thank you for the addition! GUS is crawling and indexing this URL ({}) now, and it will show up in search results as soon as indexing completes.".format(request.query),
             "",
-            request.query
         ])
         return Response(Status.SUCCESS, "text/gemini", "\n".join(data))
     else:
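One caveat with the new background crawl: run_crawl mutates module-level state in gus.crawl (visited_urls, robot_file_map, domain_hit_timings) and writes to the on-disk index, so two overlapping /add-seed requests could race. A minimal sketch of one way to serialize them, using a hypothetical module-level lock (not part of this change):

    crawl_lock = threading.Lock()  # hypothetical guard

    def crawl_seed_and_restart(seed_url):
        # serialize crawls so concurrent seed requests don't interleave
        # writes to the shared crawl state and the index
        with crawl_lock:
            run_crawl(should_run_destructive=False, seed_urls=[seed_url])
        call(["sudo", "systemctl", "restart", "gus.service"])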