commit 511030699cd5280a0024885860e365137eb69200
parent 35ebd8b71ef112145b3ebc6604f8c2e4dd0365af
Author: René Wagner <rwa@clttr.info>
Date: Wed, 7 Jun 2023 20:02:01 +0200
further simplify URI handling and always use lowercase host part
Diffstat:
11 files changed, 70 insertions(+), 110 deletions(-)
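In short: the separate normalized_url is retired, fetchable_url becomes the single canonical URL stored in the url column, and normalization now amounts to lowercasing the host part and dropping an explicit default port :1965. A minimal standalone sketch of that normalization (illustrative only, not taken from the patch; it uses urllib.parse and ignores query strings and fragments, which GeminiResource handles separately):

from urllib.parse import urlsplit

def normalize_gemini_url(url):
    # Hypothetical helper mirroring the behaviour this commit converges on:
    # lowercase the host and drop an explicit default port 1965.
    parts = urlsplit(url)
    host = (parts.hostname or "").lower()
    netloc = host if parts.port in (None, 1965) else "{}:{}".format(host, parts.port)
    path = parts.path if parts.path else "/"
    return "{}://{}{}".format(parts.scheme, netloc, path)

# normalize_gemini_url("gemini://Gem.Splatt9990.com:1965/index.gmi")
#   -> "gemini://gem.splatt9990.com/index.gmi"
# normalize_gemini_url("gemini://gus.guru")
#   -> "gemini://gus.guru/"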
diff --git a/gus/build_index.py b/gus/build_index.py
@@ -53,7 +53,6 @@ AND l.is_cross_host_like == 1""",
document = {
"url_id": page.url,
"url": page.url,
- "fetchable_url": page.fetchable_url,
"domain": page.domain,
"port": page.port,
"content_type": page.content_type,
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -37,12 +37,11 @@ EXCLUDED_URL_PATTERN = re.compile(
def index_binary(resource, response):
logging.debug(
"Indexing binary for: %s",
- strip_control_chars(resource.normalized_url),
+ strip_control_chars(resource.fetchable_url),
)
doc = {
- "url": resource.normalized_url,
- "fetchable_url": resource.fetchable_url,
+ "url": resource.fetchable_url,
"domain": resource.normalized_host,
"port": resource.urlsplit.port or 1965,
"content_type": response.content_type,
@@ -56,7 +55,7 @@ def index_binary(resource, response):
"last_status_message" : response.error_message,
"first_seen_at" : datetime.utcnow()
}
- existing_page = Page.get_or_none(url=resource.normalized_url)
+ existing_page = Page.get_or_none(url=resource.fetchable_url)
if existing_page:
doc["id"] = existing_page.id
if not (existing_page.first_seen_at is None):
@@ -73,7 +72,7 @@ def index_binary(resource, response):
try:
page.save()
except:
- logging.error("Error adding page: %s", strip_control_chars(resource.normalized_url))
+ logging.error("Error adding page: %s", strip_control_chars(resource.fetchable_url))
return page
@@ -81,12 +80,11 @@ def index_binary(resource, response):
def index_redirect(resource, response):
logging.debug(
"Indexing redirect for: %s",
- strip_control_chars(resource.normalized_url),
+ strip_control_chars(resource.fetchable_url),
)
doc = {
- "url": resource.normalized_url,
- "fetchable_url": resource.fetchable_url,
+ "url": resource.fetchable_url,
"domain": resource.normalized_host,
"port": resource.urlsplit.port or 1965,
"change_frequency": resource.get_default_change_frequency("redirect"),
@@ -97,7 +95,7 @@ def index_redirect(resource, response):
"last_status_message" : response.error_message,
"first_seen_at" : datetime.utcnow()
}
- existing_page = Page.get_or_none(url=resource.normalized_url)
+ existing_page = Page.get_or_none(url=resource.fetchable_url)
if existing_page:
doc["id"] = existing_page.id
if not (existing_page.first_seen_at is None):
@@ -114,7 +112,7 @@ def index_redirect(resource, response):
try:
page.save()
except:
- logging.error("Error adding page: %s", strip_control_chars(resource.normalized_url))
+ logging.error("Error adding page: %s", strip_control_chars(resource.fetchable_url))
return page
@@ -123,8 +121,7 @@ def index_error(resource, is_temporary, response):
category = "temp_error" if is_temporary else "perm_error"
default_change_frequency = resource.get_default_change_frequency(category)
doc = {
- "url": resource.normalized_url,
- "fetchable_url": resource.fetchable_url,
+ "url": resource.fetchable_url,
"domain": resource.normalized_host,
"port": resource.urlsplit.port or 1965,
"change_frequency": default_change_frequency,
@@ -132,7 +129,7 @@ def index_error(resource, is_temporary, response):
"last_status" : None if response is None else response.status,
"last_status_message" : None if response is None else response.error_message
}
- existing_page = Page.get_or_none(url=resource.normalized_url)
+ existing_page = Page.get_or_none(url=resource.fetchable_url)
if existing_page:
doc["id"] = existing_page.id
existing_change_frequency = (
@@ -145,7 +142,7 @@ def index_error(resource, is_temporary, response):
try:
page.save()
except:
- logging.error("Error adding page: %s", strip_control_chars(resource.normalized_url))
+ logging.error("Error adding page: %s", strip_control_chars(resource.fetchable_url))
return page
@@ -153,12 +150,11 @@ def index_error(resource, is_temporary, response):
def index_prompt(resource, response):
logging.debug(
"Indexing prompt for: %s",
- strip_control_chars(resource.normalized_url),
+ strip_control_chars(resource.fetchable_url),
)
doc = {
- "url": resource.normalized_url,
- "fetchable_url": resource.fetchable_url,
+ "url": resource.fetchable_url,
"domain": resource.normalized_host,
"port": resource.urlsplit.port or 1965,
"content_type": "input",
@@ -172,7 +168,7 @@ def index_prompt(resource, response):
"last_status_message" : response.error_message,
"first_seen_at" : datetime.utcnow()
}
- existing_page = Page.get_or_none(url=resource.normalized_url)
+ existing_page = Page.get_or_none(url=resource.fetchable_url)
if existing_page:
doc["id"] = existing_page.id
if not (existing_page.first_seen_at is None):
@@ -200,7 +196,7 @@ def index_prompt(resource, response):
pagecontent = PageContent(**content)
pagecontent.save()
except:
- logging.error("Error adding page: %s", strip_control_chars(resource.normalized_url))
+ logging.error("Error adding page: %s", strip_control_chars(resource.fetchable_url))
return page
@@ -208,12 +204,11 @@ def index_prompt(resource, response):
def index_content(resource, response):
logging.debug(
"Storing content for: %s",
- strip_control_chars(resource.normalized_url),
+ strip_control_chars(resource.fetchable_url),
)
doc = {
- "url": resource.normalized_url,
- "fetchable_url": resource.fetchable_url,
+ "url": resource.fetchable_url,
"domain": resource.normalized_host,
"port": resource.urlsplit.port or 1965,
"content_type": response.content_type,
@@ -229,7 +224,7 @@ def index_content(resource, response):
}
if response.content_type == "text/gemini":
doc["lang"] = (response.lang or "none",)
- existing_page = Page.get_or_none(url=resource.normalized_url)
+ existing_page = Page.get_or_none(url=resource.fetchable_url)
is_different = False
if existing_page:
doc["id"] = existing_page.id
@@ -266,7 +261,7 @@ def index_content(resource, response):
pagecontent = PageContent(**content)
pagecontent.save()
except Exception as e:
- logging.error("Error adding page %s: %s", strip_control_chars(resource.normalized_url), e)
+ logging.error("Error adding page %s: %s", strip_control_chars(resource.fetchable_url), e)
return page, is_different
@@ -275,14 +270,14 @@ def should_skip(resource):
should_skip = False
try:
for excluded_prefix in EXCLUDED_URL_PREFIXES:
- if resource.normalized_url.startswith(excluded_prefix):
+ if resource.fetchable_url.startswith(excluded_prefix):
should_skip = True
break
for excluded_path in EXCLUDED_URL_PATHS:
if resource.urlsplit.path.lower().endswith(excluded_path):
should_skip = True
break
- m = EXCLUDED_URL_PATTERN.match(resource.normalized_url)
+ m = EXCLUDED_URL_PATTERN.match(resource.fetchable_url)
if m:
should_skip = True
except:
@@ -293,7 +288,7 @@ def should_skip(resource):
def index_links(from_resource, contained_resources):
- from_page, created = Page.get_or_create(url=from_resource.normalized_url)
+ from_page, created = Page.get_or_create(url=from_resource.fetchable_url)
## first delete all links that this page has had before
## then add new links
@@ -305,11 +300,10 @@ def index_links(from_resource, contained_resources):
for cr in contained_resources:
if should_skip(cr):
continue
- to_page = Page.get_or_none(url=cr.normalized_url)
+ to_page = Page.get_or_none(url=cr.fetchable_url)
if not to_page:
to_page = Page.create(
- url=cr.normalized_url,
- fetchable_url=cr.fetchable_url,
+ url=cr.fetchable_url,
domain=cr.normalized_host,
port=cr.urlsplit.port or 1965,
first_seen_at=datetime.utcnow()
@@ -371,7 +365,7 @@ def crawl_page(
)
return
- existing_page = Page.get_or_none(url=gr.normalized_url)
+ existing_page = Page.get_or_none(url=gr.fetchable_url)
if existing_page and existing_page.change_frequency is not None:
most_recent_crawl = existing_page.last_crawl_at
if most_recent_crawl and datetime.utcnow() < most_recent_crawl + timedelta(
@@ -386,10 +380,10 @@ def crawl_page(
robots_file = get_robots_file(gr.normalized_host)
crawl_delay = None
if robots_file is not None:
- logging.debug("Found robots.txt for %s", gr.normalized_url)
+ logging.debug("Found robots.txt for %s", gr.fetchable_url)
# only fetch if allowed for a matching user-agent:
# in priority order "gus" > "indexer" > "*"
- can_fetch = robots_file.can_fetch_prioritized(["gus", "indexer", "*"], gr.normalized_url)
+ can_fetch = robots_file.can_fetch_prioritized(["gus", "indexer", "*"], gr.fetchable_url)
# same approach as above - last value wins
crawl_delay = robots_file.crawl_delay("indexer")
@@ -473,7 +467,7 @@ def crawl_page(
)
return
redirect_resource = GeminiResource(
- response.url, gr.normalized_url, gr.normalized_host
+ response.url, gr.fetchable_url, gr.normalized_host
)
if redirect_resource.fetchable_url == gr.fetchable_url:
logging.info(
diff --git a/gus/lib/db_model.py b/gus/lib/db_model.py
@@ -35,7 +35,6 @@ class Page(Model):
"""
url = TextField(unique=True, index=True)
- fetchable_url = TextField(null=True)
domain = TextField(null=True, index=True)
port = IntegerField(null=True)
content_type = TextField(null=True)
@@ -53,7 +52,7 @@ class Page(Model):
first_seen_at = DateTimeField(null=True)
class Meta:
indexes=(
- (('last_success_status', 'first_seen_at', 'indexed_at', 'domain', 'url', 'content_type', 'fetchable_url'), False),
+ (('last_success_status', 'first_seen_at', 'indexed_at', 'domain', 'url', 'content_type'), False),
(('last_crawl_at', 'last_crawl_success_at'), False)
)
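The fetchable_url column disappears from the Page model, but the patch itself ships no schema migration. A sketch of what that migration could look like, assuming the index lives in SQLite and uses peewee's playhouse migrator (the table and column names come from this diff; the database path and everything else is hypothetical):

from peewee import SqliteDatabase
from playhouse.migrate import SqliteMigrator, migrate

db = SqliteDatabase("gus.sqlite3")  # hypothetical path
migrator = SqliteMigrator(db)

with db.atomic():
    # NOTE: the pre-existing composite index still references fetchable_url;
    # its auto-generated name depends on the peewee version, so dropping it
    # is left out of this sketch.
    migrate(
        migrator.drop_column("page", "fetchable_url"),
        migrator.add_index("page", ("last_success_status", "first_seen_at",
                                    "indexed_at", "domain", "url", "content_type"),
                           unique=False),
    )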
diff --git a/gus/lib/gemini.py b/gus/lib/gemini.py
@@ -55,7 +55,7 @@ AUTHOR_CONTENT_PATTERN = re.compile(
r".*(by|author): ([\w\s\d]+)", flags=re.IGNORECASE | re.MULTILINE
)
-TITLE_CONTENT_PATTERN = re.compile("^#\s(.*)$", flags=re.IGNORECASE | re.MULTILINE)
+TITLE_CONTENT_PATTERN = re.compile(r"^#\s(.*)$", flags=re.IGNORECASE | re.MULTILINE)
TITLE_URL_PATTERN = re.compile(
r".*/(\d{8}[-_]|\d{4}[-_]\d{2}[-_]\d{2}[-_])?([a-z0-9-_]+)(\.[a-z0-9]+)$",
flags=re.IGNORECASE,
@@ -181,20 +181,18 @@ class GeminiResource:
if not self.is_valid:
return None
if self._normalized_url is None:
- (
- self._normalized_url,
- self._normalized_host,
- ) = self._get_normalized_url_and_host()
+ url_normalized = unquote(self.fetchable_url)
+ self._normalized_url = url_normalized.replace(
+ self.urlsplit.hostname + ":1965",
+ self.urlsplit.hostname.lower(),
+ 1)
return self._normalized_url
def _get_normalized_host(self):
if not self.is_valid:
return None
if self._normalized_host is None:
- (
- self._normalized_url,
- self._normalized_host,
- ) = self._get_normalized_url_and_host()
+ self._normalized_host = self.urlsplit.hostname.lower()
return self._normalized_host
def _get_normalized_host_like(self):
@@ -231,7 +229,10 @@ class GeminiResource:
url = url + "/"
if self.urlsplit[4] != "":
url = url.replace("#{}".format(self.urlsplit[4]), "")
- self._fetchable_url = url
+
+ self._fetchable_url = url.replace(
+ self.urlsplit.hostname + ":1965",
+ self.urlsplit.hostname.lower(), 1)
return self._fetchable_url
def _get_is_root_like(self):
@@ -371,20 +372,6 @@ class GeminiResource:
else:
raise Exception.NameError("Unrecognized resource category")
- # constructed from fetchable_url
- # does not matter if quoted or unquoted so I choose arbitrarily to
- # standardize on unquoting it.
- normalized_url = property(_get_normalized_url)
- normalized_host = property(_get_normalized_host)
- # constructed from urlsplit or raw_url
- # should be quoted.
- fetchable_url = property(_get_fetchable_url)
- # constructed from fetchable_url
- # should be unquoted.
- is_root_like = property(_get_is_root_like)
- is_log_root_like = property(_get_is_log_root_like)
- is_log_post_like = property(_get_is_log_post_like)
- normalized_host_like = property(_get_normalized_host_like)
def fetch(self):
# NB: this intentionally does NOT fetch the normalized URL, because that could
@@ -392,15 +379,6 @@ class GeminiResource:
# and a server redirecting to the same URL _with_ a trailing slash.
return gusmobile.fetch(self.fetchable_url)
- def _get_normalized_url_and_host(self):
- url_normalized = unquote(self.fetchable_url.rstrip("/"))
- url_normalized = url_normalized.replace(
- self.urlsplit.hostname.lower() + ":1965",
- self.urlsplit.hostname.lower(),
- 1,
- )
- host_normalized = self.urlsplit.hostname.lower()
- return url_normalized, host_normalized
def extract_contained_resources(self, content):
# this finds all gemini URLs within the content of a given GeminiResource and
@@ -428,3 +406,19 @@ class GeminiResource:
self.contained_resources = resources
return self.contained_resources
+
+ # constructed from fetchable_url
+ # does not matter if quoted or unquoted so I choose arbitrarily to
+ # standardize on unquoting it.
+ #normalized_url = property(_get_normalized_url)
+ normalized_host = property(_get_normalized_host)
+ # constructed from urlsplit or raw_url
+ # should be quoted.
+ fetchable_url = property(_get_fetchable_url)
+ # constructed from fetchable_url
+ # should be unquoted.
+ is_root_like = property(_get_is_root_like)
+ is_log_root_like = property(_get_is_log_root_like)
+ is_log_post_like = property(_get_is_log_post_like)
+ normalized_host_like = property(_get_normalized_host_like)
+
diff --git a/serve/models.py b/serve/models.py
@@ -45,7 +45,7 @@ class GUS:
self.newest_hosts = newest_hosts_query.execute()
newest_pages_query = Page.raw(
- """SELECT p.url, p.fetchable_url, p.first_seen_at FROM page as p
+ """SELECT p.url, p.first_seen_at FROM page as p
WHERE last_success_status = 20
AND first_seen_at IS NOT NULL
ORDER BY first_seen_at DESC
@@ -53,7 +53,7 @@ class GUS:
self.newest_pages = newest_pages_query.execute()
feeds_query = Page.raw(
- """SELECT DISTINCT p.url, p.fetchable_url
+ """SELECT DISTINCT p.url
FROM page AS p
WHERE last_success_status = 20
AND (p.url LIKE '%atom.xml'
@@ -78,7 +78,6 @@ class GUS:
"score": result.score,
"indexed_at": result["indexed_at"],
"url": result["url"],
- "fetchable_url": result["fetchable_url"],
"content_type": result["content_type"],
"charset": result["charset"] if "charset" in result else "none",
"size": result["size"] if "size" in result else 0,
@@ -124,7 +123,7 @@ class GUS:
)
else:
link_text = "{} ({}, {})".format(
- result["fetchable_url"][9:],
+ result["url"][9:],
result["content_type"],
bytes2human(result["size"], format="%(value).0f%(symbol)s"),
)
diff --git a/serve/templates/documentation/indexing.gmi b/serve/templates/documentation/indexing.gmi
@@ -20,7 +20,7 @@ Please note that there are provisions in place for manually excluding content fr
Currently, especially content of the following types is excluded:
- mirrors of large websites like Wikipedia or the Go-docs (it's just too much to add it to the index in the current state)
-- mirrors of news sites from the common web (too big and to frequent changes)
+- mirrors of news sites from the common web (too big and too frequent changes)
### Indexing and Redirects
geminispace.info checks for specific return codes like 31 PERMANENT REDIRECT and will save this information.
diff --git a/serve/templates/known_feeds.gmi b/serve/templates/known_feeds.gmi
@@ -6,7 +6,7 @@
Below are the public feeds of which geminispace.info is aware. This list is auto-generated from the index, so if your feed is not showing up here, please use the link at the bottom of this page to submit a crawl request for your feed URL.
{% for feed in known_feeds %}
-{{ "=> {} {}".format(feed.fetchable_url, feed.url) }}
+{{ "=> {} {}".format(feed.url, feed.url) }}
{% endfor %}
{% include 'fragments/footer.gmi' %}
diff --git a/serve/templates/newest_pages.gmi b/serve/templates/newest_pages.gmi
@@ -6,7 +6,7 @@
Here are the fifty most recently discovered Gemini pages by geminispace.info.
{% for page in newest_pages %}
-{{ "=> {} {}: {}".format(page.fetchable_url, page.first_seen_at.strftime('%Y-%m-%d'), page.url) }}
+{{ "=> {} {}: {}".format(page.url, page.first_seen_at.strftime('%Y-%m-%d'), page.url) }}
{% endfor %}
{% include 'fragments/footer.gmi' %}
diff --git a/serve/templates/search.gmi b/serve/templates/search.gmi
@@ -12,7 +12,7 @@
"{{ query }}"
{% for result in results %}
-=> {{ result["fetchable_url"] }} {{ result["link_text"] }}
+=> {{ result["url"] }} {{ result["link_text"] }}
{% if result["backlink_count"] > 1 %}
=> /backlinks?{{ result["url"][9:] | urlencode }} {{ result["backlink_count"] }} cross-capsule backlinks
{% elif result["backlink_count"] > 0 %}
diff --git a/serve/templates/threads.gmi b/serve/templates/threads.gmi
@@ -1,27 +0,0 @@
-{% include 'fragments/header.gmi' %}
-
-
-## Threads
-
-{% if sort == "recency" %}
-### Sort
-Most recent (current)
-=> /threads?length Switch to longest
-{% elif sort == "length" %}
-### Sort
-Longest (current)
-=> /threads?recency Switch to most recent
-{% endif %}
-
-{% for date in threads %}
-### {{ date["date"] | datetimeformat("%Y, %b %d") }}
-
-{% for thread in date["threads"] %}
-{% for member in thread["members"] %}
-=> {{ member["fetchable_url"] }} [{{ member["first_seen"] | datetimeformat("%b %d") }}] {{member["address"] | threadaddressformat }} {{ member["friendly_author"] }} - {{ member["friendly_title"] }}
-{% endfor %}
-
-{% endfor %}
-
-{% endfor %}
-{% include 'fragments/footer.gmi' %}
diff --git a/tests/gus/lib/test_gemini.py b/tests/gus/lib/test_gemini.py
@@ -43,23 +43,25 @@ text
assert resources[1].raw_url == "other"
@pytest.mark.parametrize("test_input,expected_result", [
- (["gemini://gus.guru", None, None], [True, "gemini://gus.guru", "gus.guru", "gemini://gus.guru/"]),
+ (["gemini://gus.guru", None, None], [True, "gemini://gus.guru/", "gus.guru", "gemini://gus.guru/"]),
(["/bar", "gemini://gus.guru/foo", None], [False, None, None, None]),
(["/bar", "gemini://gus.guru/foo/", None], [False, None, None, None]),
(["/bar", "gemini://gus.guru/foo", "gus.guru"], [True, "gemini://gus.guru/bar", "gus.guru", "gemini://gus.guru/bar"]),
(["/bar", "gemini://gus.guru/foo/", "gus.guru"], [True, "gemini://gus.guru/bar", "gus.guru", "gemini://gus.guru/bar"]),
(["bar", "gemini://gus.guru/foo", "gus.guru"], [True, "gemini://gus.guru/bar", "gus.guru", "gemini://gus.guru/bar"]),
- (["bar", "gemini://gus.guru/foo/", "gus.guru"], [True, "gemini://gus.guru/foo/bar", "gus.guru", "gemini://gus.guru/foo/bar"]),
- (["//foo.com", None, None], [True, "gemini://foo.com", "foo.com", "gemini://foo.com/"]),
+ (["bar/", "gemini://gus.guru/foo/", "gus.guru"], [True, "gemini://gus.guru/foo/bar/", "gus.guru", "gemini://gus.guru/foo/bar/"]),
+ (["//foo.com", None, None], [True, "gemini://foo.com/", "foo.com", "gemini://foo.com/"]),
(["gemini://gem.splatt9990.com/index.gmi", None, None], [True, "gemini://gem.splatt9990.com/index.gmi", "gem.splatt9990.com", "gemini://gem.splatt9990.com/index.gmi"] ),
- (["gemini://gem.splatt9990.com:1965/index.gmi", None, None], [True, "gemini://gem.splatt9990.com/index.gmi", "gem.splatt9990.com", "gemini://gem.splatt9990.com:1965/index.gmi"]),
+ (["gemini://gem.splatt9990.com:1965/index.gmi", None, None], [True, "gemini://gem.splatt9990.com/index.gmi", "gem.splatt9990.com", "gemini://gem.splatt9990.com/index.gmi"]),
(["gemini://gem.splatt9990.com:1966/index.gmi", None, None], [True, "gemini://gem.splatt9990.com:1966/index.gmi", "gem.splatt9990.com", "gemini://gem.splatt9990.com:1966/index.gmi"]),
+ (["gemini://michaelnordmeyer.com", None, None], [True, "gemini://michaelnordmeyer.com/", "michaelnordmeyer.com", "gemini://michaelnordmeyer.com/"]),
+ (["log.gmi", "gemini://michaelnordmeyer.com/", None], [True, "gemini://michaelnordmeyer.com/log.gmi", "michaelnordmeyer.com", "gemini://michaelnordmeyer.com/log.gmi"]),
+ (["Log.gmi", "gemini://michaelnordmeyer.com/", None], [True, "gemini://michaelnordmeyer.com/Log.gmi", "michaelnordmeyer.com", "gemini://michaelnordmeyer.com/Log.gmi"]),
])
def test_url_parsing(self, test_input, expected_result):
gr = GeminiResource(test_input[0], test_input[1], test_input[2])
assert gr.is_valid == expected_result[0]
- assert gr.normalized_url == expected_result[1]
assert gr.normalized_host == expected_result[2]
assert gr.fetchable_url == expected_result[3]
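For orientation, the behaviour the updated tests pin down, shown as an illustrative session (values are taken from the parametrized cases above; only the import path is assumed from the repository layout):

from gus.lib.gemini import GeminiResource

gr = GeminiResource("gemini://gem.splatt9990.com:1965/index.gmi", None, None)
gr.is_valid         # True
gr.fetchable_url    # "gemini://gem.splatt9990.com/index.gmi" -- default port dropped
gr.normalized_host  # "gem.splatt9990.com"

# relative links resolve against the parent URL, and path case is preserved
GeminiResource("Log.gmi", "gemini://michaelnordmeyer.com/", None).fetchable_url
# -> "gemini://michaelnordmeyer.com/Log.gmi"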