commit 16eb249aa8f40774ef3613644ac8d7193d08f925
parent f4cbbc207f78f4be31259e6a0d5cc9859533c46d
Author: René Wagner <rwa@clttr.info>
Date: Fri, 10 Feb 2023 08:44:46 +0100
excludes, cosmetics, infra fixes
Diffstat:
5 files changed, 55 insertions(+), 40 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -150,3 +150,4 @@ dmypy.json
.viminfo
.vimrc
.profile
+crawl.lock
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -17,6 +17,7 @@ from . import constants
from gus.lib.db_model import init_db, Page, Link
from gus.lib.gemini import GeminiResource, GeminiRobotFileParser
import gus.lib.logging
+from gus.lib.logging import strip_control_chars
# hack: the built-in methods in urllib need to know the
# Gemini protocol exists
@@ -36,7 +37,7 @@ EXCLUDED_URL_PATTERN = re.compile(
def index_binary(resource, response):
logging.debug(
"Indexing binary for: %s",
- gus.lib.logging.strip_control_chars(resource.normalized_url),
+ strip_control_chars(resource.normalized_url),
)
doc = {
@@ -72,7 +73,7 @@ def index_binary(resource, response):
try:
page.save()
except:
- logging.error("Error adding page: %s", gus.lib.logging.strip_control_chars(resource.normalized_url))
+ logging.error("Error adding page: %s", strip_control_chars(resource.normalized_url))
return page
@@ -80,7 +81,7 @@ def index_binary(resource, response):
def index_redirect(resource, response):
logging.debug(
"Indexing redirect for: %s",
- gus.lib.logging.strip_control_chars(resource.normalized_url),
+ strip_control_chars(resource.normalized_url),
)
doc = {
@@ -113,7 +114,7 @@ def index_redirect(resource, response):
try:
page.save()
except:
- logging.error("Error adding page: %s", gus.lib.logging.strip_control_chars(resource.normalized_url))
+ logging.error("Error adding page: %s", strip_control_chars(resource.normalized_url))
return page
@@ -144,7 +145,7 @@ def index_error(resource, is_temporary, response):
try:
page.save()
except:
- logging.error("Error adding page: %s", gus.lib.logging.strip_control_chars(resource.normalized_url))
+ logging.error("Error adding page: %s", strip_control_chars(resource.normalized_url))
return page
@@ -152,7 +153,7 @@ def index_error(resource, is_temporary, response):
def index_prompt(resource, response):
logging.debug(
"Indexing prompt for: %s",
- gus.lib.logging.strip_control_chars(resource.normalized_url),
+ strip_control_chars(resource.normalized_url),
)
doc = {
@@ -189,7 +190,7 @@ def index_prompt(resource, response):
try:
page.save()
except:
- logging.error("Error adding page: %s", gus.lib.logging.strip_control_chars(resource.normalized_url))
+ logging.error("Error adding page: %s", strip_control_chars(resource.normalized_url))
return page
@@ -197,7 +198,7 @@ def index_prompt(resource, response):
def index_content(resource, response):
logging.debug(
"Indexing content for: %s",
- gus.lib.logging.strip_control_chars(resource.normalized_url),
+ strip_control_chars(resource.normalized_url),
)
doc = {
@@ -243,7 +244,7 @@ def index_content(resource, response):
try:
page.save()
except:
- logging.error("Error adding page: %s", gus.lib.logging.strip_control_chars(resource.normalized_url))
+ logging.error("Error adding page: %s", strip_control_chars(resource.normalized_url))
return page, is_different
@@ -263,7 +264,7 @@ def should_skip(resource):
if m:
should_skip = True
except:
- logging.error("Error checking for exclude of url: %s", gus.lib.logging.strip_control_chars(resource.raw_url))
+ logging.error("Error checking for exclude of url: %s", strip_control_chars(resource.raw_url))
should_skip = True
return should_skip
@@ -289,7 +290,7 @@ def index_links(from_resource, contained_resources):
fetchable_url=cr.fetchable_url,
domain=cr.normalized_host,
port=cr.urlsplit.port or 1965,
- first_seen_at=datetime.now()
+ first_seen_at=datetime.utcnow()
)
data.append(
{
@@ -307,7 +308,7 @@ def index_links(from_resource, contained_resources):
def fetch_robots_file(robot_host):
robot_url = urljoin("gemini://{}".format(robot_host), "/robots.txt")
logging.info(
- "Fetching robots file: %s", gus.lib.logging.strip_control_chars(robot_url)
+ "Fetching robots file: %s", strip_control_chars(robot_url)
)
rp = GeminiRobotFileParser(robot_url)
rp.read()
@@ -328,34 +329,34 @@ def crawl_page(
if not gemini_resource.is_valid:
logging.warn(
"Not a valid gemini resource, skipping: %s",
- gus.lib.logging.strip_control_chars(gemini_resource.url),
+ strip_control_chars(gemini_resource.url),
)
return
if max_crawl_depth >= 0 and current_depth > max_crawl_depth:
logging.warn(
- "Going too deep, skipping: %s", gus.lib.logging.strip_control_chars(url)
+ "Going too deep, skipping: %s", strip_control_chars(url)
)
return
if should_skip(gr):
logging.debug(
"URL is excluded, skipping: %s",
- gus.lib.logging.strip_control_chars(url),
+ strip_control_chars(url),
)
return
if gr.normalized_host in failure_count and failure_count[gr.normalized_host] > constants.MAXIMUM_FAILED_REQUEST_COUNT:
logging.debug(
- "Too many failed requests for host, skipping: %s", gus.lib.logging.strip_control_chars(url)
+ "Too many failed requests for host, skipping: %s", strip_control_chars(url)
)
return
existing_page = Page.get_or_none(url=gr.normalized_url)
if existing_page and existing_page.change_frequency is not None:
most_recent_crawl = existing_page.last_crawl_at
- if most_recent_crawl and datetime.now() < most_recent_crawl + timedelta(
+ if most_recent_crawl and datetime.utcnow() < most_recent_crawl + timedelta(
hours=existing_page.change_frequency):
logging.debug(
"Too soon to recrawl, skipping: %s",
- gus.lib.logging.strip_control_chars(gr.fetchable_url),
+ strip_control_chars(gr.fetchable_url),
)
return
@@ -374,7 +375,7 @@ def crawl_page(
if not can_fetch:
logging.debug(
"Blocked by robots.txt, skipping: %s",
- gus.lib.logging.strip_control_chars(url),
+ strip_control_chars(url),
)
return
@@ -392,22 +393,22 @@ def crawl_page(
next_allowed_hit = domain_hit_timings[gr.normalized_host] + timedelta(
milliseconds=crawl_delay
)
- sleep_duration = max((next_allowed_hit - datetime.now()).total_seconds(), 0)
+ sleep_duration = max((next_allowed_hit - datetime.utcnow()).total_seconds(), 0)
time.sleep(sleep_duration)
- domain_hit_timings[gr.normalized_host] = datetime.now()
+ domain_hit_timings[gr.normalized_host] = datetime.utcnow()
# Actually fetch!
- logging.info("Fetching resource: %s", gus.lib.logging.strip_control_chars(url))
+ logging.info("Fetching resource: %s", strip_control_chars(url))
if gr.fully_qualified_parent_url is not None:
logging.debug(
"with parent: %s",
- gus.lib.logging.strip_control_chars(gr.fully_qualified_parent_url),
+ strip_control_chars(gr.fully_qualified_parent_url),
)
response = gr.fetch()
if response is None:
# problem before getting a response
- logging.warn("Failed to fetch: %s", gus.lib.logging.strip_control_chars(url))
+ logging.warn("Failed to fetch: %s", strip_control_chars(url))
page = index_error(gr, True, None)
failure_count[gr.normalized_host] = failure_count[gr.normalized_host] + 1 if gr.normalized_host in failure_count else 1
@@ -419,7 +420,7 @@ def crawl_page(
# temporary error status
logging.debug(
"Got temporary error: %s: %s %s",
- gus.lib.logging.strip_control_chars(url),
+ strip_control_chars(url),
response.status,
response.error_message,
)
@@ -429,7 +430,7 @@ def crawl_page(
# permanent error status
logging.debug(
"Got permanent error: %s: %s %s",
- gus.lib.logging.strip_control_chars(url),
+ strip_control_chars(url),
response.status,
response.error_message,
)
@@ -439,14 +440,14 @@ def crawl_page(
# redirect status
logging.debug(
"Got redirected: %s: %s %s",
- gus.lib.logging.strip_control_chars(url),
+ strip_control_chars(url),
response.status,
response.url,
)
if len(redirect_chain) > constants.MAXIMUM_REDIRECT_CHAIN_LENGTH:
logging.info(
"Aborting, maximum redirect chain length reached: %s",
- gus.lib.logging.strip_control_chars(url),
+ strip_control_chars(url),
)
return
redirect_resource = GeminiResource(
@@ -455,7 +456,7 @@ def crawl_page(
if redirect_resource.fetchable_url == gr.fetchable_url:
logging.info(
"Aborting, redirecting to self: %s",
- gus.lib.logging.strip_control_chars(url),
+ strip_control_chars(url),
)
return
page = index_redirect(gr, response)
@@ -470,7 +471,7 @@ def crawl_page(
# input status
logging.debug(
"Input requested at: %s: %s %s",
- gus.lib.logging.strip_control_chars(url),
+ strip_control_chars(url),
response.status,
response.prompt,
)
@@ -479,7 +480,7 @@ def crawl_page(
# success status
logging.debug(
"Successful request: %s: %s %s",
- gus.lib.logging.strip_control_chars(url),
+ strip_control_chars(url),
response.status,
response.content_type,
)
@@ -488,13 +489,13 @@ def crawl_page(
if response.content_type != "text/gemini":
logging.debug(
"Content is not gemini text: %s: %s",
- gus.lib.logging.strip_control_chars(url),
+ strip_control_chars(url),
response.content_type,
)
else:
logging.debug(
"Got gemini text, extracting and crawling links: %s",
- gus.lib.logging.strip_control_chars(url),
+ strip_control_chars(url),
)
contained_resources = gr.extract_contained_resources(response.content)
index_links(gr, contained_resources)
@@ -508,7 +509,7 @@ def crawl_page(
else:
logging.warn(
"Got unhandled status: %s: %s",
- gus.lib.logging.strip_control_chars(url),
+ strip_control_chars(url),
response.status,
)
@@ -519,7 +520,7 @@ def load_expired_urls():
FROM page as p
WHERE datetime(last_crawl_at, REPLACE('fnord hours', 'fnord', change_frequency)) < datetime('now') OR last_crawl_at IS NULL""" )
return [page.url for page in expired_pages.execute()]
-# expired_pages = Page.select(Page.url).where(Page.last_crawl_at < (datetime.now() - timedelta(hours=Page.change_frequency)) & Page.last_crawl_at.is_null(True))
+# expired_pages = Page.select(Page.url).where(Page.last_crawl_at < (datetime.utcnow() - timedelta(hours=Page.change_frequency)) & Page.last_crawl_at.is_null(True))
# return expired_pages
def load_seed_request_urls():
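
The crawl.py hunks above do two things: they import strip_control_chars directly so the long gus.lib.logging.strip_control_chars() call sites shrink, and they move the crawl timing code from datetime.now() to datetime.utcnow() so recorded timestamps no longer depend on the server's local timezone. The real strip_control_chars lives in gus/lib/logging.py and is not part of this diff; the sketch below is only an assumption of what such a helper typically does (dropping Unicode control characters so crawled URLs cannot inject newlines or escape sequences into log lines).

# Assumed sketch of gus.lib.logging.strip_control_chars; the actual helper is
# defined in gus/lib/logging.py and is not shown in this commit.
import unicodedata

def strip_control_chars(text):
    """Drop Unicode control characters so crawled URLs cannot mangle log output."""
    return "".join(ch for ch in text if unicodedata.category(ch) != "Cc")

# Call sites after this change read, for example:
#   logging.info("Fetching resource: %s", strip_control_chars(url))

Note that datetime.utcnow(), like datetime.now(), returns a naive datetime; the switch only changes the reference clock used for first_seen_at, the recrawl-interval comparison, and the per-domain hit timings.
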
diff --git a/gus/excludes.py b/gus/excludes.py
@@ -144,6 +144,9 @@ EXCLUDED_URL_PREFIXES = [
"gemini://chat.mozz.us/stream",
"gemini://chat.mozz.us/submit",
+ # gempod
+ "gemini://rocketcaster.xyz/share/",
+
# gopher proxy
"gemini://80h.dev/agena/",
diff --git a/infra/rebuild_index.sh b/infra/rebuild_index.sh
@@ -1,8 +1,17 @@
-mkdir -p /home/gus/index.new/
+#!/bin/bash
+BASEDIR=${1}
+
+LOCKFILE=${BASEDIR}/crawl.lock
+
+touch ${LOCKFILE}
+
+mkdir -p ${BASEDIR}/index.new/
sudo systemctl stop gus
-cp /home/gus/index/gus.sqlite /home/gus/index.new/
+cp ${BASEDIR}/index/gus.sqlite ${BASEDIR}/index.new/
sudo systemctl start gus
-/home/gus/.poetry/bin/poetry run build_index -d
-mv /home/gus/index /home/gus/index.old
-mv /home/gus/index.new /home/gus/index
+${BASEDIR}/.poetry/bin/poetry run build_index -d
+mv ${BASEDIR}/index ${BASEDIR}/index.old
+mv ${BASEDIR}/index.new ${BASEDIR}/index
sudo systemctl restart gus
+
+rm ${LOCKFILE}
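
rebuild_index.sh now takes the GUS base directory as its first argument instead of hard-coding /home/gus, and it touches ${BASEDIR}/crawl.lock for the duration of the rebuild, removing it at the end (the same file just added to .gitignore, and already referenced as LOCKFILE by update_index.sh below). The sketch that follows is only an assumption of how a crawl run could honor that lock; the consumer side is not shown in this commit.

# Assumed consumer-side check for the crawl.lock created by rebuild_index.sh;
# whether the crawler actually guards itself this way is not part of this diff.
import sys
from pathlib import Path

def crawl_is_locked(basedir):
    return (Path(basedir) / "crawl.lock").exists()

if __name__ == "__main__":
    base = sys.argv[1] if len(sys.argv) > 1 else "/home/gus"
    if crawl_is_locked(base):
        print("crawl.lock present, index rebuild in progress; not crawling")
        sys.exit(1)
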
diff --git a/infra/update_index.sh b/infra/update_index.sh
@@ -1,3 +1,4 @@
+#!/bin/bash
BASEDIR=${1}
LOCKFILE=${BASEDIR}/crawl.lock