geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git

commit 511030699cd5280a0024885860e365137eb69200
parent 35ebd8b71ef112145b3ebc6604f8c2e4dd0365af
Author: René Wagner <rwa@clttr.info>
Date:   Wed,  7 Jun 2023 20:02:01 +0200

further simplify URI handling and always use lowercase host part
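
The change keys every page by its quoted fetchable URL, with the host part lowercased and the default port 1965 dropped, so the separate fetchable_url column becomes redundant. A minimal sketch of that normalization rule (illustrative only; the helper name and edge-case handling are assumptions, not the project's exact code):

    from urllib.parse import urlsplit, urlunsplit

    def normalize_gemini_url(url):
        # Lowercase the host; keep an explicit port only when it is not the default 1965.
        parts = urlsplit(url)
        netloc = parts.hostname.lower()
        if parts.port and parts.port != 1965:
            netloc = "{}:{}".format(netloc, parts.port)
        # Root URLs get a trailing slash; fragments are dropped.
        return urlunsplit((parts.scheme, netloc, parts.path or "/", parts.query, ""))

    assert normalize_gemini_url("gemini://Example.Org:1965") == "gemini://example.org/"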

Diffstat:
M gus/build_index.py | 1 -
M gus/crawl.py | 62 ++++++++++++++++++++++++++++----------------------------------
M gus/lib/db_model.py | 3 +--
M gus/lib/gemini.py | 60 +++++++++++++++++++++++++++---------------------------------
M serve/models.py | 7 +++----
M serve/templates/documentation/indexing.gmi | 2 +-
M serve/templates/known_feeds.gmi | 2 +-
M serve/templates/newest_pages.gmi | 2 +-
M serve/templates/search.gmi | 2 +-
D serve/templates/threads.gmi | 27 ---------------------------
M tests/gus/lib/test_gemini.py | 12 +++++++-----
11 files changed, 70 insertions(+), 110 deletions(-)

diff --git a/gus/build_index.py b/gus/build_index.py
@@ -53,7 +53,6 @@ AND l.is_cross_host_like == 1""",
     document = {
         "url_id": page.url,
         "url": page.url,
-        "fetchable_url": page.fetchable_url,
         "domain": page.domain,
         "port": page.port,
         "content_type": page.content_type,
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -37,12 +37,11 @@ EXCLUDED_URL_PATTERN = re.compile(

 def index_binary(resource, response):
     logging.debug(
         "Indexing binary for: %s",
-        strip_control_chars(resource.normalized_url),
+        strip_control_chars(resource.fetchable_url),
     )
     doc = {
-        "url": resource.normalized_url,
-        "fetchable_url": resource.fetchable_url,
+        "url": resource.fetchable_url,
         "domain": resource.normalized_host,
         "port": resource.urlsplit.port or 1965,
         "content_type": response.content_type,
@@ -56,7 +55,7 @@ def index_binary(resource, response):
         "last_status_message" : response.error_message,
         "first_seen_at" : datetime.utcnow()
     }
-    existing_page = Page.get_or_none(url=resource.normalized_url)
+    existing_page = Page.get_or_none(url=resource.fetchable_url)
     if existing_page:
         doc["id"] = existing_page.id
         if not (existing_page.first_seen_at is None):
@@ -73,7 +72,7 @@ def index_binary(resource, response):
     try:
         page.save()
     except:
-        logging.error("Error adding page: %s", strip_control_chars(resource.normalized_url))
+        logging.error("Error adding page: %s", strip_control_chars(resource.fetchable_url))

     return page

@@ -81,12 +80,11 @@ def index_binary(resource, response):

 def index_redirect(resource, response):
     logging.debug(
         "Indexing redirect for: %s",
-        strip_control_chars(resource.normalized_url),
+        strip_control_chars(resource.fetchable_url),
     )
     doc = {
-        "url": resource.normalized_url,
-        "fetchable_url": resource.fetchable_url,
+        "url": resource.fetchable_url,
         "domain": resource.normalized_host,
         "port": resource.urlsplit.port or 1965,
         "change_frequency": resource.get_default_change_frequency("redirect"),
@@ -97,7 +95,7 @@ def index_redirect(resource, response):
         "last_status_message" : response.error_message,
         "first_seen_at" : datetime.utcnow()
     }
-    existing_page = Page.get_or_none(url=resource.normalized_url)
+    existing_page = Page.get_or_none(url=resource.fetchable_url)
     if existing_page:
         doc["id"] = existing_page.id
         if not (existing_page.first_seen_at is None):
@@ -114,7 +112,7 @@ def index_redirect(resource, response):
     try:
         page.save()
     except:
-        logging.error("Error adding page: %s", strip_control_chars(resource.normalized_url))
+        logging.error("Error adding page: %s", strip_control_chars(resource.fetchable_url))

     return page

@@ -123,8 +121,7 @@ def index_error(resource, is_temporary, response):
     category = "temp_error" if is_temporary else "perm_error"
     default_change_frequency = resource.get_default_change_frequency(category)
     doc = {
-        "url": resource.normalized_url,
-        "fetchable_url": resource.fetchable_url,
+        "url": resource.fetchable_url,
         "domain": resource.normalized_host,
         "port": resource.urlsplit.port or 1965,
         "change_frequency": default_change_frequency,
@@ -132,7 +129,7 @@ def index_error(resource, is_temporary, response):
         "last_status" : None if response is None else response.status,
         "last_status_message" : None if response is None else response.error_message
     }
-    existing_page = Page.get_or_none(url=resource.normalized_url)
+    existing_page = Page.get_or_none(url=resource.fetchable_url)
     if existing_page:
         doc["id"] = existing_page.id
         existing_change_frequency = (
@@ -145,7 +142,7 @@ def index_error(resource, is_temporary, response):
     try:
         page.save()
     except:
-        logging.error("Error adding page: %s", strip_control_chars(resource.normalized_url))
+        logging.error("Error adding page: %s", strip_control_chars(resource.fetchable_url))

     return page

@@ -153,12 +150,11 @@ def index_error(resource, is_temporary, response):

 def index_prompt(resource, response):
     logging.debug(
         "Indexing prompt for: %s",
-        strip_control_chars(resource.normalized_url),
+        strip_control_chars(resource.fetchable_url),
     )
     doc = {
-        "url": resource.normalized_url,
-        "fetchable_url": resource.fetchable_url,
+        "url": resource.fetchable_url,
         "domain": resource.normalized_host,
         "port": resource.urlsplit.port or 1965,
         "content_type": "input",
@@ -172,7 +168,7 @@ def index_prompt(resource, response):
         "last_status_message" : response.error_message,
         "first_seen_at" : datetime.utcnow()
     }
-    existing_page = Page.get_or_none(url=resource.normalized_url)
+    existing_page = Page.get_or_none(url=resource.fetchable_url)
     if existing_page:
         doc["id"] = existing_page.id
         if not (existing_page.first_seen_at is None):
@@ -200,7 +196,7 @@ def index_prompt(resource, response):
         pagecontent = PageContent(**content)
         pagecontent.save()
     except:
-        logging.error("Error adding page: %s", strip_control_chars(resource.normalized_url))
+        logging.error("Error adding page: %s", strip_control_chars(resource.fetchable_url))

     return page

@@ -208,12 +204,11 @@

 def index_content(resource, response):
     logging.debug(
         "Storing content for: %s",
-        strip_control_chars(resource.normalized_url),
+        strip_control_chars(resource.fetchable_url),
     )
     doc = {
-        "url": resource.normalized_url,
-        "fetchable_url": resource.fetchable_url,
+        "url": resource.fetchable_url,
         "domain": resource.normalized_host,
         "port": resource.urlsplit.port or 1965,
         "content_type": response.content_type,
@@ -229,7 +224,7 @@ def index_content(resource, response):
     }
     if response.content_type == "text/gemini":
         doc["lang"] = (response.lang or "none",)
-    existing_page = Page.get_or_none(url=resource.normalized_url)
+    existing_page = Page.get_or_none(url=resource.fetchable_url)
     is_different = False
     if existing_page:
         doc["id"] = existing_page.id
@@ -266,7 +261,7 @@ def index_content(resource, response):
         pagecontent = PageContent(**content)
         pagecontent.save()
     except Exception as e:
-        logging.error("Error adding page %s: %s", strip_control_chars(resource.normalized_url), e)
+        logging.error("Error adding page %s: %s", strip_control_chars(resource.fetchable_url), e)

     return page, is_different

@@ -275,14 +270,14 @@ def should_skip(resource):
     should_skip = False
     try:
         for excluded_prefix in EXCLUDED_URL_PREFIXES:
-            if resource.normalized_url.startswith(excluded_prefix):
+            if resource.fetchable_url.startswith(excluded_prefix):
                 should_skip = True
                 break
         for excluded_path in EXCLUDED_URL_PATHS:
             if resource.urlsplit.path.lower().endswith(excluded_path):
                 should_skip = True
                 break
-        m = EXCLUDED_URL_PATTERN.match(resource.normalized_url)
+        m = EXCLUDED_URL_PATTERN.match(resource.fetchable_url)
         if m:
             should_skip = True
     except:
@@ -293,7 +288,7 @@


 def index_links(from_resource, contained_resources):
-    from_page, created = Page.get_or_create(url=from_resource.normalized_url)
+    from_page, created = Page.get_or_create(url=from_resource.fetchable_url)

     ## first delete all links that this page as had before
     ## than add new links
@@ -305,11 +300,10 @@ def index_links(from_resource, contained_resources):
     for cr in contained_resources:
         if should_skip(cr):
             continue
-        to_page = Page.get_or_none(url=cr.normalized_url)
+        to_page = Page.get_or_none(url=cr.fetchable_url)
         if not to_page:
             to_page = Page.create(
-                url=cr.normalized_url,
-                fetchable_url=cr.fetchable_url,
+                url=cr.fetchable_url,
                 domain=cr.normalized_host,
                 port=cr.urlsplit.port or 1965,
                 first_seen_at=datetime.utcnow()
@@ -371,7 +365,7 @@ def crawl_page(
         )
         return

-    existing_page = Page.get_or_none(url=gr.normalized_url)
+    existing_page = Page.get_or_none(url=gr.fetchable_url)
     if existing_page and existing_page.change_frequency is not None:
         most_recent_crawl = existing_page.last_crawl_at
         if most_recent_crawl and datetime.utcnow() < most_recent_crawl + timedelta(
@@ -386,10 +380,10 @@ def crawl_page(
     robots_file = get_robots_file(gr.normalized_host)
     crawl_delay = None
     if robots_file is not None:
-        logging.debug("Found robots.txt for %s", gr.normalized_url)
+        logging.debug("Found robots.txt for %s", gr.fetchable_url)
        # only fetch if allowed for a matching user-agent:
        # in priority order "gus" > "indexer" > "*"
-        can_fetch = robots_file.can_fetch_prioritized(["gus", "indexer", "*"], gr.normalized_url)
+        can_fetch = robots_file.can_fetch_prioritized(["gus", "indexer", "*"], gr.fetchable_url)

        # same approach as above - last value wins
        crawl_delay = robots_file.crawl_delay("indexer")
@@ -473,7 +467,7 @@ def crawl_page(
         )
         return
     redirect_resource = GeminiResource(
-        response.url, gr.normalized_url, gr.normalized_host
+        response.url, gr.fetchable_url, gr.normalized_host
     )
     if redirect_resource.fetchable_url == gr.fetchable_url:
         logging.info(
diff --git a/gus/lib/db_model.py b/gus/lib/db_model.py
@@ -35,7 +35,6 @@ class Page(Model):
     """

     url = TextField(unique=True, index=True)
-    fetchable_url = TextField(null=True)
     domain = TextField(null=True, index=True)
     port = IntegerField(null=True)
     content_type = TextField(null=True)
@@ -53,7 +52,7 @@ class Page(Model):
     first_seen_at = DateTimeField(null=True)

     class Meta:
         indexes=(
-            (('last_success_status', 'first_seen_at', 'indexed_at', 'domain', 'url', 'content_type', 'fetchable_url'), False),
+            (('last_success_status', 'first_seen_at', 'indexed_at', 'domain', 'url', 'content_type'), False),
             (('last_crawl_at', 'last_crawl_success_at'), False)
         )
diff --git a/gus/lib/gemini.py b/gus/lib/gemini.py
@@ -55,7 +55,7 @@
 AUTHOR_CONTENT_PATTERN = re.compile(
     r".*(by|author): ([\w\s\d]+)", flags=re.IGNORECASE | re.MULTILINE
 )
-TITLE_CONTENT_PATTERN = re.compile("^#\s(.*)$", flags=re.IGNORECASE | re.MULTILINE)
+TITLE_CONTENT_PATTERN = re.compile(r"^#\s(.*)$", flags=re.IGNORECASE | re.MULTILINE)
 TITLE_URL_PATTERN = re.compile(
     r".*/(\d{8}[-_]|\d{4}[-_]\d{2}[-_]\d{2}[-_])?([a-z0-9-_]+)(\.[a-z0-9]+)$",
     flags=re.IGNORECASE,
@@ -181,20 +181,18 @@ class GeminiResource:
         if not self.is_valid:
             return None
         if self._normalized_url is None:
-            (
-                self._normalized_url,
-                self._normalized_host,
-            ) = self._get_normalized_url_and_host()
+            url_normalized = unquote(self.fetchable_url)
+            self._normalized_url = url_normalized.replace(
+                self.urlsplit.hostname + ":1965",
+                self.urlsplit.hostname.lower(),
+                1)
         return self._normalized_url

     def _get_normalized_host(self):
         if not self.is_valid:
             return None
         if self._normalized_host is None:
-            (
-                self._normalized_url,
-                self._normalized_host,
-            ) = self._get_normalized_url_and_host()
+            self._normalized_host = self.urlsplit.hostname.lower()
         return self._normalized_host

     def _get_normalized_host_like(self):
@@ -231,7 +229,10 @@
                 url = url + "/"
         if self.urlsplit[4] != "":
             url = url.replace("#{}".format(self.urlsplit[4]), "")
-        self._fetchable_url = url
+
+        self._fetchable_url = url.replace(
+            self.urlsplit.hostname + ":1965",
+            self.urlsplit.hostname.lower(), 1)
         return self._fetchable_url

     def _get_is_root_like(self):
@@ -371,20 +372,6 @@ class GeminiResource:
         else:
             raise Exception.NameError("Unrecognized resource category")

-    # constructed from fetchable_url
-    # does not matter if quoted or unquoted so I choose arbitrarily to
-    # standardize on unquoting it.
-    normalized_url = property(_get_normalized_url)
-    normalized_host = property(_get_normalized_host)
-    # constructed from urlsplit or raw_url
-    # should be quoted.
-    fetchable_url = property(_get_fetchable_url)
-    # constructed from fetchable_url
-    # should be unquoted.
-    is_root_like = property(_get_is_root_like)
-    is_log_root_like = property(_get_is_log_root_like)
-    is_log_post_like = property(_get_is_log_post_like)
-    normalized_host_like = property(_get_normalized_host_like)

     def fetch(self):
         # NB: this intentionally does NOT fetch the normalized URL, because that could
@@ -392,15 +379,6 @@
         # and a server redirecting to the same URL _with_ a trailing slash.
         return gusmobile.fetch(self.fetchable_url)

-    def _get_normalized_url_and_host(self):
-        url_normalized = unquote(self.fetchable_url.rstrip("/"))
-        url_normalized = url_normalized.replace(
-            self.urlsplit.hostname.lower() + ":1965",
-            self.urlsplit.hostname.lower(),
-            1,
-        )
-        host_normalized = self.urlsplit.hostname.lower()
-        return url_normalized, host_normalized

     def extract_contained_resources(self, content):
         # this finds all gemini URLs within the content of a given GeminiResource and
@@ -428,3 +406,19 @@

         self.contained_resources = resources
         return self.contained_resources
+
+    # constructed from fetchable_url
+    # does not matter if quoted or unquoted so I choose arbitrarily to
+    # standardize on unquoting it.
+    #normalized_url = property(_get_normalized_url)
+    normalized_host = property(_get_normalized_host)
+    # constructed from urlsplit or raw_url
+    # should be quoted.
+    fetchable_url = property(_get_fetchable_url)
+    # constructed from fetchable_url
+    # should be unquoted.
+    is_root_like = property(_get_is_root_like)
+    is_log_root_like = property(_get_is_log_root_like)
+    is_log_post_like = property(_get_is_log_post_like)
+    normalized_host_like = property(_get_normalized_host_like)
+
diff --git a/serve/models.py b/serve/models.py
@@ -45,7 +45,7 @@ class GUS:
         self.newest_hosts = newest_hosts_query.execute()

         newest_pages_query = Page.raw(
-            """SELECT p.url, p.fetchable_url, p.first_seen_at FROM page as p
+            """SELECT p.url, p.first_seen_at FROM page as p
             WHERE last_success_status = 20
             AND first_seen_at IS NOT NULL
             ORDER BY first_seen_at DESC
@@ -53,7 +53,7 @@ class GUS:
         self.newest_pages = newest_pages_query.execute()

         feeds_query = Page.raw(
-            """SELECT DISTINCT p.url, p.fetchable_url
+            """SELECT DISTINCT p.url
             FROM page AS p
             WHERE last_success_status = 20
             AND (p.url LIKE '%atom.xml'
@@ -78,7 +78,6 @@ class GUS:
                 "score": result.score,
                 "indexed_at": result["indexed_at"],
                 "url": result["url"],
-                "fetchable_url": result["fetchable_url"],
                 "content_type": result["content_type"],
                 "charset": result["charset"] if "charset" in result else "none",
                 "size": result["size"] if "size" in result else 0,
@@ -124,7 +123,7 @@ class GUS:
             )
         else:
             link_text = "{} ({}, {})".format(
-                result["fetchable_url"][9:],
+                result["url"][9:],
                 result["content_type"],
                 bytes2human(result["size"], format="%(value).0f%(symbol)s"),
             )
diff --git a/serve/templates/documentation/indexing.gmi b/serve/templates/documentation/indexing.gmi
@@ -20,7 +20,7 @@ Please note that there are provisions in place for manually excluding content fr

 Currently, especially content of the following types is excluded:
 - mirrors of large websites like Wikipedia or the Go-docs (it's just to much to add it to the index in the current state)
-- mirrors of news sites from the common web (too big and to frequent changes)
+- mirrors of news sites from the common web (too big and too frequent changes)

 ### Indexing and Redirects
 geminispace.info checks for specific return codes like 31 PERMANENT REDIRECT and will save this information.
diff --git a/serve/templates/known_feeds.gmi b/serve/templates/known_feeds.gmi
@@ -6,7 +6,7 @@
 Below are the public feeds of which geminispace.info is aware. This list is auto-generated from the index, so if your feed is not showing up here, please use the link at the bottom of this page to submit a crawl request for your feed URL.

 {% for feed in known_feeds %}
-{{ "=> {} {}".format(feed.fetchable_url, feed.url) }}
+{{ "=> {} {}".format(feed.url, feed.url) }}
 {% endfor %}

 {% include 'fragments/footer.gmi' %}
diff --git a/serve/templates/newest_pages.gmi b/serve/templates/newest_pages.gmi
@@ -6,7 +6,7 @@
 Here are the fifty most recently discovered Gemini pages by geminispace.info.

 {% for page in newest_pages %}
-{{ "=> {} {}: {}".format(page.fetchable_url, page.first_seen_at.strftime('%Y-%m-%d'), page.url) }}
+{{ "=> {} {}: {}".format(page.url, page.first_seen_at.strftime('%Y-%m-%d'), page.url) }}
 {% endfor %}

 {% include 'fragments/footer.gmi' %}
diff --git a/serve/templates/search.gmi b/serve/templates/search.gmi
@@ -12,7 +12,7 @@
 "{{ query }}"

 {% for result in results %}
-=> {{ result["fetchable_url"] }} {{ result["link_text"] }}
+=> {{ result["url"] }} {{ result["link_text"] }}
 {% if result["backlink_count"] > 1 %}
 => /backlinks?{{ result["url"][9:] | urlencode }} {{ result["backlink_count"] }} cross-capsule backlinks
 {% elif result["backlink_count"] > 0 %}
diff --git a/serve/templates/threads.gmi b/serve/templates/threads.gmi
@@ -1,27 +0,0 @@
-{% include 'fragments/header.gmi' %}
-
-
-## Threads
-
-{% if sort == "recency" %}
-### Sort
-Most recent (current)
-=> /threads?length Switch to longest
-{% elif sort == "length" %}
-### Sort
-Longest (current)
-=> /threads?recency Switch to most recent
-{% endif %}
-
-{% for date in threads %}
-### {{ date["date"] | datetimeformat("%Y, %b %d") }}
-
-{% for thread in date["threads"] %}
-{% for member in thread["members"] %}
-=> {{ member["fetchable_url"] }} [{{ member["first_seen"] | datetimeformat("%b %d") }}] {{member["address"] | threadaddressformat }} {{ member["friendly_author"] }} - {{ member["friendly_title"] }}
-{% endfor %}
-
-{% endfor %}
-
-{% endfor %}
-{% include 'fragments/footer.gmi' %}
diff --git a/tests/gus/lib/test_gemini.py b/tests/gus/lib/test_gemini.py
@@ -43,23 +43,25 @@ text

         assert resources[1].raw_url == "other"

     @pytest.mark.parametrize("test_input,expected_result", [
-        (["gemini://gus.guru", None, None], [True, "gemini://gus.guru", "gus.guru", "gemini://gus.guru/"]),
+        (["gemini://gus.guru", None, None], [True, "gemini://gus.guru/", "gus.guru", "gemini://gus.guru/"]),
         (["/bar", "gemini://gus.guru/foo", None], [False, None, None, None]),
         (["/bar", "gemini://gus.guru/foo/", None], [False, None, None, None]),
         (["/bar", "gemini://gus.guru/foo", "gus.guru"], [True, "gemini://gus.guru/bar", "gus.guru", "gemini://gus.guru/bar"]),
         (["/bar", "gemini://gus.guru/foo/", "gus.guru"], [True, "gemini://gus.guru/bar", "gus.guru", "gemini://gus.guru/bar"]),
         (["bar", "gemini://gus.guru/foo", "gus.guru"], [True, "gemini://gus.guru/bar", "gus.guru", "gemini://gus.guru/bar"]),
-        (["bar", "gemini://gus.guru/foo/", "gus.guru"], [True, "gemini://gus.guru/foo/bar", "gus.guru", "gemini://gus.guru/foo/bar"]),
-        (["//foo.com", None, None], [True, "gemini://foo.com", "foo.com", "gemini://foo.com/"]),
+        (["bar/", "gemini://gus.guru/foo/", "gus.guru"], [True, "gemini://gus.guru/foo/bar/", "gus.guru", "gemini://gus.guru/foo/bar/"]),
+        (["//foo.com", None, None], [True, "gemini://foo.com/", "foo.com", "gemini://foo.com/"]),
         (["gemini://gem.splatt9990.com/index.gmi", None, None], [True, "gemini://gem.splatt9990.com/index.gmi", "gem.splatt9990.com", "gemini://gem.splatt9990.com/index.gmi"]
         ),
-        (["gemini://gem.splatt9990.com:1965/index.gmi", None, None], [True, "gemini://gem.splatt9990.com/index.gmi", "gem.splatt9990.com", "gemini://gem.splatt9990.com:1965/index.gmi"]),
+        (["gemini://gem.splatt9990.com:1965/index.gmi", None, None], [True, "gemini://gem.splatt9990.com/index.gmi", "gem.splatt9990.com", "gemini://gem.splatt9990.com/index.gmi"]),
         (["gemini://gem.splatt9990.com:1966/index.gmi", None, None], [True, "gemini://gem.splatt9990.com:1966/index.gmi", "gem.splatt9990.com", "gemini://gem.splatt9990.com:1966/index.gmi"]),
+        (["gemini://michaelnordmeyer.com", None, None], [True, "gemini://michaelnordmeyer.com/", "michaelnordmeyer.com", "gemini://michaelnordmeyer.com/"]),
+        (["log.gmi", "gemini://michaelnordmeyer.com/", None], [True, "gemini://michaelnordmeyer.com/log.gmi", "michaelnordmeyer.com", "gemini://michaelnordmeyer.com/log.gmi"]),
+        (["Log.gmi", "gemini://michaelnordmeyer.com/", None], [True, "gemini://michaelnordmeyer.com/Log.gmi", "michaelnordmeyer.com", "gemini://michaelnordmeyer.com/Log.gmi"]),
     ])
     def test_url_parsing(self, test_input, expected_result):
         gr = GeminiResource(test_input[0], test_input[1], test_input[2])
         assert gr.is_valid == expected_result[0]
-        assert gr.normalized_url == expected_result[1]
         assert gr.normalized_host == expected_result[2]
         assert gr.fetchable_url == expected_result[3]