geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git

commit 16eb249aa8f40774ef3613644ac8d7193d08f925
parent f4cbbc207f78f4be31259e6a0d5cc9859533c46d
Author: René Wagner <rwa@clttr.info>
Date:   Fri, 10 Feb 2023 08:44:46 +0100

excludes, cosmetics, infra fixes

Diffstat:
M .gitignore             |  1+
M gus/crawl.py           | 71++++++++++++++++++++++++++++++++-----------------------------------
M gus/excludes.py        |  3+++
M infra/rebuild_index.sh | 19++++++++++++++-----
M infra/update_index.sh  |  1+
5 files changed, 55 insertions(+), 40 deletions(-)

diff --git a/.gitignore b/.gitignore
@@ -150,3 +150,4 @@ dmypy.json
 .viminfo
 .vimrc
 .profile
+crawl.lock
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -17,6 +17,7 @@ from . import constants
 from gus.lib.db_model import init_db, Page, Link
 from gus.lib.gemini import GeminiResource, GeminiRobotFileParser
 import gus.lib.logging
+from gus.lib.logging import strip_control_chars
 
 # hack: the built-in methods in urllib need to know the
 # Gemini protocol exists
@@ -36,7 +37,7 @@ EXCLUDED_URL_PATTERN = re.compile(
 def index_binary(resource, response):
     logging.debug(
         "Indexing binary for: %s",
-        gus.lib.logging.strip_control_chars(resource.normalized_url),
+        strip_control_chars(resource.normalized_url),
     )
 
     doc = {
@@ -72,7 +73,7 @@ def index_binary(resource, response):
     try:
         page.save()
     except:
-        logging.error("Error adding page: %s", gus.lib.logging.strip_control_chars(resource.normalized_url))
+        logging.error("Error adding page: %s", strip_control_chars(resource.normalized_url))
 
     return page
 
@@ -80,7 +81,7 @@ def index_binary(resource, response):
 def index_redirect(resource, response):
     logging.debug(
         "Indexing redirect for: %s",
-        gus.lib.logging.strip_control_chars(resource.normalized_url),
+        strip_control_chars(resource.normalized_url),
     )
 
     doc = {
@@ -113,7 +114,7 @@ def index_redirect(resource, response):
     try:
         page.save()
     except:
-        logging.error("Error adding page: %s", gus.lib.logging.strip_control_chars(resource.normalized_url))
+        logging.error("Error adding page: %s", strip_control_chars(resource.normalized_url))
 
     return page
 
@@ -144,7 +145,7 @@ def index_error(resource, is_temporary, response):
     try:
         page.save()
     except:
-        logging.error("Error adding page: %s", gus.lib.logging.strip_control_chars(resource.normalized_url))
+        logging.error("Error adding page: %s", strip_control_chars(resource.normalized_url))
 
     return page
 
@@ -152,7 +153,7 @@ def index_error(resource, is_temporary, response):
 def index_prompt(resource, response):
     logging.debug(
         "Indexing prompt for: %s",
-        gus.lib.logging.strip_control_chars(resource.normalized_url),
+        strip_control_chars(resource.normalized_url),
     )
 
     doc = {
@@ -189,7 +190,7 @@ def index_prompt(resource, response):
     try:
         page.save()
     except:
-        logging.error("Error adding page: %s", gus.lib.logging.strip_control_chars(resource.normalized_url))
+        logging.error("Error adding page: %s", strip_control_chars(resource.normalized_url))
 
     return page
 
@@ -197,7 +198,7 @@ def index_prompt(resource, response):
 def index_content(resource, response):
     logging.debug(
         "Indexing content for: %s",
-        gus.lib.logging.strip_control_chars(resource.normalized_url),
+        strip_control_chars(resource.normalized_url),
     )
 
     doc = {
@@ -243,7 +244,7 @@ def index_content(resource, response):
     try:
         page.save()
     except:
-        logging.error("Error adding page: %s", gus.lib.logging.strip_control_chars(resource.normalized_url))
+        logging.error("Error adding page: %s", strip_control_chars(resource.normalized_url))
 
     return page, is_different
 
@@ -263,7 +264,7 @@ def should_skip(resource):
             if m:
                 should_skip = True
     except:
-        logging.error("Error checking for exclude of url: %s", gus.lib.logging.strip_control_chars(resource.raw_url))
+        logging.error("Error checking for exclude of url: %s", strip_control_chars(resource.raw_url))
         should_skip = True
 
     return should_skip
@@ -289,7 +290,7 @@ def index_links(from_resource, contained_resources):
             fetchable_url=cr.fetchable_url,
             domain=cr.normalized_host,
             port=cr.urlsplit.port or 1965,
-            first_seen_at=datetime.now()
+            first_seen_at=datetime.utcnow()
         )
         data.append(
             {
@@ -307,7 +308,7 @@ def index_links(from_resource, contained_resources):
 def fetch_robots_file(robot_host):
     robot_url = urljoin("gemini://{}".format(robot_host), "/robots.txt")
     logging.info(
-        "Fetching robots file: %s", gus.lib.logging.strip_control_chars(robot_url)
+        "Fetching robots file: %s", strip_control_chars(robot_url)
     )
     rp = GeminiRobotFileParser(robot_url)
     rp.read()
@@ -328,34 +329,34 @@ def crawl_page(
     if not gemini_resource.is_valid:
         logging.warn(
             "Not a valid gemini resource, skipping: %s",
-            gus.lib.logging.strip_control_chars(gemini_resource.url),
+            strip_control_chars(gemini_resource.url),
         )
         return
     if max_crawl_depth >= 0 and current_depth > max_crawl_depth:
         logging.warn(
-            "Going too deep, skipping: %s", gus.lib.logging.strip_control_chars(url)
+            "Going too deep, skipping: %s", strip_control_chars(url)
        )
         return
     if should_skip(gr):
         logging.debug(
             "URL is excluded, skipping: %s",
-            gus.lib.logging.strip_control_chars(url),
+            strip_control_chars(url),
         )
         return
     if gr.normalized_host in failure_count and failure_count[gr.normalized_host] > constants.MAXIMUM_FAILED_REQUEST_COUNT:
         logging.debug(
-            "Too many failed requests for host, skipping: %s", gus.lib.logging.strip_control_chars(url)
+            "Too many failed requests for host, skipping: %s", strip_control_chars(url)
         )
         return
 
     existing_page = Page.get_or_none(url=gr.normalized_url)
     if existing_page and existing_page.change_frequency is not None:
         most_recent_crawl = existing_page.last_crawl_at
-        if most_recent_crawl and datetime.now() < most_recent_crawl + timedelta(
+        if most_recent_crawl and datetime.utcnow() < most_recent_crawl + timedelta(
             hours=existing_page.change_frequency):
             logging.debug(
                 "Too soon to recrawl, skipping: %s",
-                gus.lib.logging.strip_control_chars(gr.fetchable_url),
+                strip_control_chars(gr.fetchable_url),
             )
             return
 
@@ -374,7 +375,7 @@ def crawl_page(
     if not can_fetch:
         logging.debug(
             "Blocked by robots.txt, skipping: %s",
-            gus.lib.logging.strip_control_chars(url),
+            strip_control_chars(url),
         )
         return
 
@@ -392,22 +393,22 @@ def crawl_page(
         next_allowed_hit = domain_hit_timings[gr.normalized_host] + timedelta(
             milliseconds=crawl_delay
         )
-        sleep_duration = max((next_allowed_hit - datetime.now()).total_seconds(), 0)
+        sleep_duration = max((next_allowed_hit - datetime.utcnow()).total_seconds(), 0)
         time.sleep(sleep_duration)
-    domain_hit_timings[gr.normalized_host] = datetime.now()
+    domain_hit_timings[gr.normalized_host] = datetime.utcnow()
 
     # Actually fetch!
-    logging.info("Fetching resource: %s", gus.lib.logging.strip_control_chars(url))
+    logging.info("Fetching resource: %s", strip_control_chars(url))
     if gr.fully_qualified_parent_url is not None:
         logging.debug(
             "with parent: %s",
-            gus.lib.logging.strip_control_chars(gr.fully_qualified_parent_url),
+            strip_control_chars(gr.fully_qualified_parent_url),
         )
 
     response = gr.fetch()
     if response is None:
         # problem before getting a response
-        logging.warn("Failed to fetch: %s", gus.lib.logging.strip_control_chars(url))
+        logging.warn("Failed to fetch: %s", strip_control_chars(url))
         page = index_error(gr, True, None)
         failure_count[gr.normalized_host] = failure_count[gr.normalized_host] + 1 if gr.normalized_host in failure_count else 1
 
@@ -419,7 +420,7 @@ def crawl_page(
         # temporary error status
         logging.debug(
             "Got temporary error: %s: %s %s",
-            gus.lib.logging.strip_control_chars(url),
+            strip_control_chars(url),
             response.status,
             response.error_message,
         )
@@ -429,7 +430,7 @@ def crawl_page(
         # permanent error status
         logging.debug(
             "Got permanent error: %s: %s %s",
-            gus.lib.logging.strip_control_chars(url),
+            strip_control_chars(url),
             response.status,
             response.error_message,
         )
@@ -439,14 +440,14 @@ def crawl_page(
         # redirect status
         logging.debug(
             "Got redirected: %s: %s %s",
-            gus.lib.logging.strip_control_chars(url),
+            strip_control_chars(url),
             response.status,
             response.url,
         )
         if len(redirect_chain) > constants.MAXIMUM_REDIRECT_CHAIN_LENGTH:
             logging.info(
                 "Aborting, maximum redirect chain length reached: %s",
-                gus.lib.logging.strip_control_chars(url),
+                strip_control_chars(url),
             )
             return
         redirect_resource = GeminiResource(
@@ -455,7 +456,7 @@ def crawl_page(
         if redirect_resource.fetchable_url == gr.fetchable_url:
             logging.info(
                 "Aborting, redirecting to self: %s",
-                gus.lib.logging.strip_control_chars(url),
+                strip_control_chars(url),
             )
             return
         page = index_redirect(gr, response)
@@ -470,7 +471,7 @@ def crawl_page(
         # input status
         logging.debug(
             "Input requested at: %s: %s %s",
-            gus.lib.logging.strip_control_chars(url),
+            strip_control_chars(url),
             response.status,
             response.prompt,
         )
@@ -479,7 +480,7 @@ def crawl_page(
         # success status
         logging.debug(
             "Successful request: %s: %s %s",
-            gus.lib.logging.strip_control_chars(url),
+            strip_control_chars(url),
             response.status,
             response.content_type,
         )
@@ -488,13 +489,13 @@ def crawl_page(
         if response.content_type != "text/gemini":
             logging.debug(
                 "Content is not gemini text: %s: %s",
-                gus.lib.logging.strip_control_chars(url),
+                strip_control_chars(url),
                 response.content_type,
             )
         else:
             logging.debug(
                 "Got gemini text, extracting and crawling links: %s",
-                gus.lib.logging.strip_control_chars(url),
+                strip_control_chars(url),
             )
             contained_resources = gr.extract_contained_resources(response.content)
             index_links(gr, contained_resources)
@@ -508,7 +509,7 @@ def crawl_page(
     else:
         logging.warn(
             "Got unhandled status: %s: %s",
-            gus.lib.logging.strip_control_chars(url),
+            strip_control_chars(url),
             response.status,
         )
 
@@ -519,7 +520,7 @@ def load_expired_urls():
         FROM page as p WHERE datetime(last_crawl_at, REPLACE('fnord hours', 'fnord', change_frequency)) < datetime('now') OR last_crawl_at IS NULL"""
     )
     return [page.url for page in expired_pages.execute()]
-# expired_pages = Page.select(Page.url).where(Page.last_crawl_at < (datetime.now() - timedelta(hours=Page.change_frequency)) & Page.last_crawl_at.is_null(True))
+# expired_pages = Page.select(Page.url).where(Page.last_crawl_at < (datetime.utcnow() - timedelta(hours=Page.change_frequency)) & Page.last_crawl_at.is_null(True))
 # return expired_pages
 
 def load_seed_request_urls():
diff --git a/gus/excludes.py b/gus/excludes.py
@@ -144,6 +144,9 @@ EXCLUDED_URL_PREFIXES = [
     "gemini://chat.mozz.us/stream",
     "gemini://chat.mozz.us/submit",
 
+    # gempod
+    "gemini://rocketcaster.xyz/share/",
+
     # gopher proxy
     "gemini://80h.dev/agena/",
 
diff --git a/infra/rebuild_index.sh b/infra/rebuild_index.sh
@@ -1,8 +1,17 @@
-mkdir -p /home/gus/index.new/
+#!/bin/bash
+BASEDIR=${1}
+
+LOCKFILE=${BASEDIR}/crawl.lock
+
+touch ${LOCKFILE}
+
+mkdir -p ${BASEDIR}/index.new/
 sudo systemctl stop gus
-cp /home/gus/index/gus.sqlite /home/gus/index.new/
+cp ${BASEDIR}/index/gus.sqlite ${BASEDIR}/index.new/
 sudo systemctl start gus
-/home/gus/.poetry/bin/poetry run build_index -d
-mv /home/gus/index /home/gus/index.old
-mv /home/gus/index.new /home/gus/index
+${BASEDIR}/.poetry/bin/poetry run build_index -d
+mv ${BASEDIR}/index ${BASEDIR}/index.old
+mv ${BASEDIR}/index.new ${BASEDIR}/index
 sudo systemctl restart gus
+
+rm ${LOCKFILE}
diff --git a/infra/update_index.sh b/infra/update_index.sh
@@ -1,3 +1,4 @@
+#!/bin/bash
 BASEDIR=${1}
 
 LOCKFILE=${BASEDIR}/crawl.lock
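Note on the infra changes: rebuild_index.sh now takes BASEDIR as its first argument, touches ${BASEDIR}/crawl.lock before it copies and swaps the index, and removes the lock when the rebuild is done; update_index.sh already derives the same LOCKFILE path. The intent is that the periodic crawl and the index rebuild do not run concurrently. The check on the crawl side is not part of this diff; a minimal sketch of such a guard, assuming the same BASEDIR convention, could look like this (hypothetical, not the repository's actual script):

#!/bin/bash
# Hypothetical guard, not taken from the repository:
# skip this crawl run while an index rebuild holds the lock.
BASEDIR=${1}
LOCKFILE=${BASEDIR}/crawl.lock

if [ -e "${LOCKFILE}" ]; then
    echo "crawl.lock present, index rebuild in progress - skipping"
    exit 0
fi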