geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git

commit 16eb249aa8f40774ef3613644ac8d7193d08f925
parent f4cbbc207f78f4be31259e6a0d5cc9859533c46d
Author: René Wagner <rwa@clttr.info>
Date:   Fri, 10 Feb 2023 08:44:46 +0100

excludes, cosmetics, infra fixes

Diffstat:
M .gitignore             |  1+
M gus/crawl.py           | 71++++++++++++++++++++++++++++++++-----------------------------------
M gus/excludes.py        |  3+++
M infra/rebuild_index.sh | 19++++++++++++++-----
M infra/update_index.sh  |  1+
5 files changed, 55 insertions(+), 40 deletions(-)

diff --git a/.gitignore b/.gitignore
@@ -150,3 +150,4 @@ dmypy.json
 .viminfo
 .vimrc
 .profile
+crawl.lock
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -17,6 +17,7 @@ from . import constants
 from gus.lib.db_model import init_db, Page, Link
 from gus.lib.gemini import GeminiResource, GeminiRobotFileParser
 import gus.lib.logging
+from gus.lib.logging import strip_control_chars
 
 # hack: the built-in methods in urllib need to know the
 # Gemini protocol exists
@@ -36,7 +37,7 @@ EXCLUDED_URL_PATTERN = re.compile(
 def index_binary(resource, response):
     logging.debug(
         "Indexing binary for: %s",
-        gus.lib.logging.strip_control_chars(resource.normalized_url),
+        strip_control_chars(resource.normalized_url),
     )
 
     doc = {
@@ -72,7 +73,7 @@ def index_binary(resource, response):
     try:
         page.save()
     except:
-        logging.error("Error adding page: %s", gus.lib.logging.strip_control_chars(resource.normalized_url))
+        logging.error("Error adding page: %s", strip_control_chars(resource.normalized_url))
 
     return page
 
@@ -80,7 +81,7 @@ def index_binary(resource, response):
 def index_redirect(resource, response):
     logging.debug(
         "Indexing redirect for: %s",
-        gus.lib.logging.strip_control_chars(resource.normalized_url),
+        strip_control_chars(resource.normalized_url),
     )
 
     doc = {
@@ -113,7 +114,7 @@ def index_redirect(resource, response):
     try:
         page.save()
     except:
-        logging.error("Error adding page: %s", gus.lib.logging.strip_control_chars(resource.normalized_url))
+        logging.error("Error adding page: %s", strip_control_chars(resource.normalized_url))
 
     return page
 
@@ -144,7 +145,7 @@ def index_error(resource, is_temporary, response):
     try:
         page.save()
     except:
-        logging.error("Error adding page: %s", gus.lib.logging.strip_control_chars(resource.normalized_url))
+        logging.error("Error adding page: %s", strip_control_chars(resource.normalized_url))
 
     return page
 
@@ -152,7 +153,7 @@ def index_error(resource, is_temporary, response):
 def index_prompt(resource, response):
     logging.debug(
         "Indexing prompt for: %s",
-        gus.lib.logging.strip_control_chars(resource.normalized_url),
+        strip_control_chars(resource.normalized_url),
     )
 
     doc = {
@@ -189,7 +190,7 @@ def index_prompt(resource, response):
     try:
         page.save()
     except:
-        logging.error("Error adding page: %s", gus.lib.logging.strip_control_chars(resource.normalized_url))
+        logging.error("Error adding page: %s", strip_control_chars(resource.normalized_url))
 
     return page
 
@@ -197,7 +198,7 @@ def index_prompt(resource, response):
 def index_content(resource, response):
     logging.debug(
         "Indexing content for: %s",
-        gus.lib.logging.strip_control_chars(resource.normalized_url),
+        strip_control_chars(resource.normalized_url),
     )
 
     doc = {
@@ -243,7 +244,7 @@ def index_content(resource, response):
     try:
         page.save()
     except:
-        logging.error("Error adding page: %s", gus.lib.logging.strip_control_chars(resource.normalized_url))
+        logging.error("Error adding page: %s", strip_control_chars(resource.normalized_url))
 
     return page, is_different
 
@@ -263,7 +264,7 @@ def should_skip(resource):
             if m:
                 should_skip = True
     except:
-        logging.error("Error checking for exclude of url: %s", gus.lib.logging.strip_control_chars(resource.raw_url))
+        logging.error("Error checking for exclude of url: %s", strip_control_chars(resource.raw_url))
         should_skip = True
 
     return should_skip
@@ -289,7 +290,7 @@ def index_links(from_resource, contained_resources):
             fetchable_url=cr.fetchable_url,
             domain=cr.normalized_host,
             port=cr.urlsplit.port or 1965,
-            first_seen_at=datetime.now()
+            first_seen_at=datetime.utcnow()
         )
         data.append(
             {
@@ -307,7 +308,7 @@ def index_links(from_resource, contained_resources):
 def fetch_robots_file(robot_host):
     robot_url = urljoin("gemini://{}".format(robot_host), "/robots.txt")
     logging.info(
-        "Fetching robots file: %s", gus.lib.logging.strip_control_chars(robot_url)
+        "Fetching robots file: %s", strip_control_chars(robot_url)
     )
     rp = GeminiRobotFileParser(robot_url)
     rp.read()
@@ -328,34 +329,34 @@ def crawl_page(
     if not gemini_resource.is_valid:
         logging.warn(
             "Not a valid gemini resource, skipping: %s",
-            gus.lib.logging.strip_control_chars(gemini_resource.url),
+            strip_control_chars(gemini_resource.url),
         )
         return
     if max_crawl_depth >= 0 and current_depth > max_crawl_depth:
         logging.warn(
-            "Going too deep, skipping: %s", gus.lib.logging.strip_control_chars(url)
+            "Going too deep, skipping: %s", strip_control_chars(url)
        )
         return
     if should_skip(gr):
         logging.debug(
             "URL is excluded, skipping: %s",
-            gus.lib.logging.strip_control_chars(url),
+            strip_control_chars(url),
         )
         return
     if gr.normalized_host in failure_count and failure_count[gr.normalized_host] > constants.MAXIMUM_FAILED_REQUEST_COUNT:
         logging.debug(
-            "Too many failed requests for host, skipping: %s", gus.lib.logging.strip_control_chars(url)
+            "Too many failed requests for host, skipping: %s", strip_control_chars(url)
         )
         return
 
     existing_page = Page.get_or_none(url=gr.normalized_url)
     if existing_page and existing_page.change_frequency is not None:
         most_recent_crawl = existing_page.last_crawl_at
-        if most_recent_crawl and datetime.now() < most_recent_crawl + timedelta(
+        if most_recent_crawl and datetime.utcnow() < most_recent_crawl + timedelta(
             hours=existing_page.change_frequency):
             logging.debug(
                 "Too soon to recrawl, skipping: %s",
-                gus.lib.logging.strip_control_chars(gr.fetchable_url),
+                strip_control_chars(gr.fetchable_url),
             )
             return
 
@@ -374,7 +375,7 @@ def crawl_page(
     if not can_fetch:
         logging.debug(
             "Blocked by robots.txt, skipping: %s",
-            gus.lib.logging.strip_control_chars(url),
+            strip_control_chars(url),
         )
         return
 
@@ -392,22 +393,22 @@ def crawl_page(
         next_allowed_hit = domain_hit_timings[gr.normalized_host] + timedelta(
             milliseconds=crawl_delay
         )
-        sleep_duration = max((next_allowed_hit - datetime.now()).total_seconds(), 0)
+        sleep_duration = max((next_allowed_hit - datetime.utcnow()).total_seconds(), 0)
         time.sleep(sleep_duration)
-    domain_hit_timings[gr.normalized_host] = datetime.now()
+    domain_hit_timings[gr.normalized_host] = datetime.utcnow()
 
     # Actually fetch!
-    logging.info("Fetching resource: %s", gus.lib.logging.strip_control_chars(url))
+    logging.info("Fetching resource: %s", strip_control_chars(url))
     if gr.fully_qualified_parent_url is not None:
         logging.debug(
             "with parent: %s",
-            gus.lib.logging.strip_control_chars(gr.fully_qualified_parent_url),
+            strip_control_chars(gr.fully_qualified_parent_url),
         )
 
     response = gr.fetch()
     if response is None:
         # problem before getting a response
-        logging.warn("Failed to fetch: %s", gus.lib.logging.strip_control_chars(url))
+        logging.warn("Failed to fetch: %s", strip_control_chars(url))
         page = index_error(gr, True, None)
         failure_count[gr.normalized_host] = failure_count[gr.normalized_host] + 1 if gr.normalized_host in failure_count else 1
 
@@ -419,7 +420,7 @@ def crawl_page(
         # temporary error status
         logging.debug(
             "Got temporary error: %s: %s %s",
-            gus.lib.logging.strip_control_chars(url),
+            strip_control_chars(url),
             response.status,
             response.error_message,
         )
@@ -429,7 +430,7 @@ def crawl_page(
         # permanent error status
         logging.debug(
             "Got permanent error: %s: %s %s",
-            gus.lib.logging.strip_control_chars(url),
+            strip_control_chars(url),
             response.status,
             response.error_message,
         )
@@ -439,14 +440,14 @@ def crawl_page(
         # redirect status
         logging.debug(
             "Got redirected: %s: %s %s",
-            gus.lib.logging.strip_control_chars(url),
+            strip_control_chars(url),
             response.status,
             response.url,
         )
         if len(redirect_chain) > constants.MAXIMUM_REDIRECT_CHAIN_LENGTH:
             logging.info(
                 "Aborting, maximum redirect chain length reached: %s",
-                gus.lib.logging.strip_control_chars(url),
+                strip_control_chars(url),
             )
             return
         redirect_resource = GeminiResource(
@@ -455,7 +456,7 @@ def crawl_page(
         if redirect_resource.fetchable_url == gr.fetchable_url:
             logging.info(
                 "Aborting, redirecting to self: %s",
-                gus.lib.logging.strip_control_chars(url),
+                strip_control_chars(url),
             )
             return
         page = index_redirect(gr, response)
@@ -470,7 +471,7 @@ def crawl_page(
         # input status
         logging.debug(
             "Input requested at: %s: %s %s",
-            gus.lib.logging.strip_control_chars(url),
+            strip_control_chars(url),
             response.status,
             response.prompt,
         )
@@ -479,7 +480,7 @@ def crawl_page(
         # success status
         logging.debug(
             "Successful request: %s: %s %s",
-            gus.lib.logging.strip_control_chars(url),
+            strip_control_chars(url),
             response.status,
             response.content_type,
         )
@@ -488,13 +489,13 @@ def crawl_page(
         if response.content_type != "text/gemini":
             logging.debug(
                 "Content is not gemini text: %s: %s",
-                gus.lib.logging.strip_control_chars(url),
+                strip_control_chars(url),
                 response.content_type,
             )
         else:
             logging.debug(
                 "Got gemini text, extracting and crawling links: %s",
-                gus.lib.logging.strip_control_chars(url),
+                strip_control_chars(url),
             )
             contained_resources = gr.extract_contained_resources(response.content)
             index_links(gr, contained_resources)
@@ -508,7 +509,7 @@ def crawl_page(
     else:
         logging.warn(
             "Got unhandled status: %s: %s",
-            gus.lib.logging.strip_control_chars(url),
+            strip_control_chars(url),
             response.status,
         )
 
@@ -519,7 +520,7 @@ def load_expired_urls():
         FROM page as p WHERE datetime(last_crawl_at, REPLACE('fnord hours', 'fnord', change_frequency)) < datetime('now') OR last_crawl_at IS NULL"""
     )
     return [page.url for page in expired_pages.execute()]
-# expired_pages = Page.select(Page.url).where(Page.last_crawl_at < (datetime.now() - timedelta(hours=Page.change_frequency)) & Page.last_crawl_at.is_null(True))
+# expired_pages = Page.select(Page.url).where(Page.last_crawl_at < (datetime.utcnow() - timedelta(hours=Page.change_frequency)) & Page.last_crawl_at.is_null(True))
 # return expired_pages
 
 def load_seed_request_urls():
diff --git a/gus/excludes.py b/gus/excludes.py
@@ -144,6 +144,9 @@ EXCLUDED_URL_PREFIXES = [
     "gemini://chat.mozz.us/stream",
     "gemini://chat.mozz.us/submit",
 
+    # gempod
+    "gemini://rocketcaster.xyz/share/",
+
     # gopher proxy
     "gemini://80h.dev/agena/",
 
diff --git a/infra/rebuild_index.sh b/infra/rebuild_index.sh
@@ -1,8 +1,17 @@
-mkdir -p /home/gus/index.new/
+#!/bin/bash
+BASEDIR=${1}
+
+LOCKFILE=${BASEDIR}/crawl.lock
+
+touch ${LOCKFILE}
+
+mkdir -p ${BASEDIR}/index.new/
 sudo systemctl stop gus
-cp /home/gus/index/gus.sqlite /home/gus/index.new/
+cp ${BASEDIR}/index/gus.sqlite ${BASEDIR}/index.new/
 sudo systemctl start gus
-/home/gus/.poetry/bin/poetry run build_index -d
-mv /home/gus/index /home/gus/index.old
-mv /home/gus/index.new /home/gus/index
+${BASEDIR}/.poetry/bin/poetry run build_index -d
+mv ${BASEDIR}/index ${BASEDIR}/index.old
+mv ${BASEDIR}/index.new ${BASEDIR}/index
 sudo systemctl restart gus
+
+rm ${LOCKFILE}
diff --git a/infra/update_index.sh b/infra/update_index.sh
@@ -1,3 +1,4 @@
+#!/bin/bash
 BASEDIR=${1}
 
 LOCKFILE=${BASEDIR}/crawl.lock
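Note on the infra changes: rebuild_index.sh now takes BASEDIR as its first argument, touches ${BASEDIR}/crawl.lock before it copies and swaps the index, and removes the lock when the rebuild is done; update_index.sh already derives the same LOCKFILE path. The intent is that the periodic crawl and the index rebuild do not run concurrently. The check on the crawl side is not part of this diff; a minimal sketch of such a guard, assuming the same BASEDIR convention, could look like this (hypothetical, not the repository's actual script):

#!/bin/bash
# Hypothetical guard, not taken from the repository:
# skip this crawl run while an index rebuild holds the lock.
BASEDIR=${1}
LOCKFILE=${BASEDIR}/crawl.lock

if [ -e "${LOCKFILE}" ]; then
    echo "crawl.lock present, index rebuild in progress - skipping"
    exit 0
fi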