commit e8d416471813553b1e75ebaefe9bf8ea0dfa9b33
parent 7fa7a7d0fa5923ca93b7b22640228febf4faa6cf
Author: René Wagner <rwa@clttr.info>
Date: Sat, 4 Sep 2021 09:03:14 +0200
do not add every single domain to the statistics file
Diffstat:
5 files changed, 18 insertions(+), 11 deletions(-)
diff --git a/gus/excludes.py b/gus/excludes.py
@@ -175,7 +175,6 @@ EXCLUDED_URL_PREFIXES = [
# docs - not our business
"gemini://cfdocs.wetterberg.nu/",
"gemini://godocs.io",
- "gemini://hellomouse.net/user-pages/handicraftsman/ietf/",
]
EXCLUDED_URL_PATHS = [
diff --git a/gus/lib/index_statistics.py b/gus/lib/index_statistics.py
@@ -48,7 +48,7 @@ ORDER BY 2 desc""").dicts())
"domain_count": domain_count,
"content_type_frequencies": content_type_frequencies,
"charset_frequencies": charset_frequencies,
- "domains": sorted(domains),
+ "domains": "",
}
@@ -68,9 +68,6 @@ def log_index_statistics(index_statistics, crawl_statistics=None):
logging.info('Number of broken URLs encountered while crawling: %d',
crawl_statistics['broken_url_count'])
- logging.debug('Domains indexed: %s',
- index_statistics['domains'])
-
for entry in index_statistics['content_type_frequencies']:
logging.info('Number of type "%s" resources indexed: %s',
entry['content_type'], entry['count'])
@@ -94,7 +91,7 @@ def serialize_statistics_line(index_statistics, crawl_statistics, was_destructiv
crawl_statistics["redirect_count"] if crawl_statistics else 0,
crawl_statistics["redirect_nontrivial_count"] if crawl_statistics else 0,
crawl_statistics["broken_url_count"] if crawl_statistics else 0,
- "|".join(index_statistics["domains"]),
+ "",
"|".join("{}:{}".format(entry["content_type"], entry["count"]) for entry in index_statistics["content_type_frequencies"]),
"|".join("{}:{}".format(entry["charset"], entry["count"]) for entry in index_statistics["charset_frequencies"]),
)
@@ -127,7 +124,6 @@ def deserialize_statistics_line(line):
redirect_count = line_parts[4]
redirect_nontrivial_count = line_parts[5]
broken_url_count = line_parts[6]
- domains = [domain for domain in line_parts[7].split("|")]
content_type_frequencies = [(entry.split(":")[0], entry.split(":")[1]) for entry in line_parts[8].split("|")]
charset_frequencies = [(entry.split(":")[0], entry.split(":")[1]) for entry in line_parts[9].split("|")]
@@ -138,7 +134,7 @@ def deserialize_statistics_line(line):
"redirect_count": redirect_count,
"redirect_nontrivial_count": redirect_nontrivial_count,
"broken_url_count": broken_url_count,
- "domains": domains,
+ "domains": "",
"content_type_frequencies": content_type_frequencies,
"charset_frequencies": charset_frequencies,
}
diff --git a/serve/models.py b/serve/models.py
@@ -97,6 +97,18 @@ OR p.content_type IN ('application/atom+xml', 'application/rss+xml'))
AND p.last_crawl_success_at IS NOT NULL"""
)
return feeds_query.execute()
+
+ def get_hosts(self):
+ hosts_query = Page.raw(
+ """
+ SELECT DISTINCT p.domain
+ FROM page AS p
+ WHERE last_crawl_success_at IS NOT NULL
+ AND last_status = 20
+ ORDER BY p.domain
+ """
+ )
+ return hosts_query.execute()
def get_newest_hosts(self):
newest_hosts_query = Page.raw(
@@ -105,6 +117,7 @@ AND p.last_crawl_success_at IS NOT NULL"""
FROM page AS p
WHERE last_crawl_success_at IS NOT NULL
AND first_seen_at IS NOT NULL
+ AND last_status = 20
GROUP BY p.domain
ORDER BY first_seen_at DESC
LIMIT 10
diff --git a/serve/templates/known_hosts.gmi b/serve/templates/known_hosts.gmi
@@ -9,7 +9,7 @@ Below are the hosts in Geminispace of which geminispace.info is aware. Note that
{% endif %}
{% for host in known_hosts %}
-{{ "=> gemini://{} {}".format(host, host) }}
+{{ "=> gemini://{} {}".format(host.domain, host.domain) }}
{% endfor %}
{% include 'fragments/footer.gmi' %}
diff --git a/serve/views.py b/serve/views.py
@@ -109,8 +109,7 @@ def statistics(request):
def known_hosts(request):
body = render_template(
"known_hosts.gmi",
- # TODO: remove this `sorted` after the next index generation
- known_hosts=sorted(gus.statistics["domains"]),
+ known_hosts=gus.get_hosts(),
index_modification_time=gus.statistics["index_modification_time"]
)
return Response(Status.SUCCESS, "text/gemini", body)