commit cfaf86dc24b3390a2a58d4c2c1cb9519abca67ef
parent 16eb249aa8f40774ef3613644ac8d7193d08f925
Author: René Wagner <rwa@clttr.info>
Date: Sat, 11 Feb 2023 10:37:28 +0100
show list of excluded uris
Diffstat:
4 files changed, 31 insertions(+), 1 deletion(-)
diff --git a/serve/templates/documentation/filters.gmi b/serve/templates/documentation/filters.gmi
@@ -0,0 +1,14 @@
+{% include 'fragments/header.gmi' %}
+
+
+## Filtered URIs
+
+Below is a list of filters that admins decided to put in place.
+URIs that match any of these filters will not be crawled or indexed by geminispace.info.
+
+{% for filter in filters %}
+{{ "* {}".format(filter) }}
+{% endfor %}
+
+=> indexing back to documentation
+{% include 'fragments/footer.gmi' %}
diff --git a/serve/templates/documentation/indexing.gmi b/serve/templates/documentation/indexing.gmi
@@ -16,6 +16,7 @@ geminispace.info does not crawl capsules behind Onion links.
Textual pages over 10 MB in size will not be indexed.
Please note that there are provisions in place for manually excluding content from indexing, which maintainers will typically use to exclude pages and domains that cause issues with index relevance or crawl success. GUS ends up crawling weird protocol experiments, proofs of concepts, and whatever other bizarre bits of technical creativity folks put up in Geminispace, so it is a continual effort to keep the index healthy. Please don't take it personally if your content ends up excluded, and I promise we are continually working to make GUS indexing more resilient and scalable!
+=> filters list of filtered URIs
Currently, especially content of the following types is excluded:
- mirrors of large websites like Wikipedia or the Go-docs (it's just too much to add to the index in the current state)
diff --git a/serve/templates/news.gmi b/serve/templates/news.gmi
@@ -2,6 +2,10 @@
## News
+### 2023-02-10
+We now provide a list of URIs that are currently excluded from crawling & indexing. This should improve transparency about what geminispace.info is doing. At the moment no reason is given as to why a specific exclude is in place. We might add this in the future.
+=> documentation/filters list of excluded URIs
+
### 2023-01-29 updated TLS certificate
geminispace.info now uses an updated certificate that uses X.509 Version 3.
I hope this improves compatibility with clients, as the previously used X.509 v1 seems to be moving out of support in some implementations.
@@ -35,7 +39,7 @@ At the moment i have no motivation to put the required efforts into option 1 and
### 2022-08-22 donations welcome
We've set up a way to send donations to help cover the ongoing costs of running geminispace.info.
-=> ./about more information can be found on our About page
+=> about more information can be found on our About page
### 2022-08-18 duplicate results
Due to a small glitch in the crawler we had duplicate results in the dataset for a few weeks.
diff --git a/serve/views.py b/serve/views.py
@@ -15,6 +15,7 @@ from .models import (
GUS,
process_seed_request,
)
+from gus.excludes import EXCLUDED_URL_PREFIXES, EXCLUDED_URL_PATHS
TEMPLATE_DIR = os.path.join(os.path.dirname(__file__), "templates")
@@ -182,6 +183,16 @@ def documentation_indexing(request):
return Response(Status.SUCCESS, "text/gemini", body)
+@app.route("/documentation/filters", strict_trailing_slash=False)
+def documentation_filters(request):  # Render documentation/filters.gmi and return it as a successful text/gemini response.
+ body = render_template(
+ "documentation/filters.gmi",
+ index_modification_time=gus.statistics["index_modification_time"],  # presumably consumed by the included header/footer fragments - verify
+ filters=EXCLUDED_URL_PREFIXES  # NOTE(review): only the prefix list is passed; EXCLUDED_URL_PATHS is imported alongside it but is not rendered on this page - confirm this is intended
+ )
+ return Response(Status.SUCCESS, "text/gemini", body)
+
+
@app.route("/documentation/backlinks", strict_trailing_slash=False)
def documentation_backlinks(request):
body = render_template(