geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit e8d416471813553b1e75ebaefe9bf8ea0dfa9b33
parent 7fa7a7d0fa5923ca93b7b22640228febf4faa6cf
Author: René Wagner <rwa@clttr.info>
Date:   Sat,  4 Sep 2021 09:03:14 +0200

do not add every single domain to the statistics file

Diffstat:
M gus/excludes.py                 |  1 -
M gus/lib/index_statistics.py     | 10 +++-------
M serve/models.py                 | 13 +++++++++++++
M serve/templates/known_hosts.gmi |  2 +-
M serve/views.py                  |  3 +--
5 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/gus/excludes.py b/gus/excludes.py
@@ -175,7 +175,6 @@ EXCLUDED_URL_PREFIXES = [
     # docs - not our business
     "gemini://cfdocs.wetterberg.nu/",
     "gemini://godocs.io",
-    "gemini://hellomouse.net/user-pages/handicraftsman/ietf/",
 ]
 
 EXCLUDED_URL_PATHS = [
diff --git a/gus/lib/index_statistics.py b/gus/lib/index_statistics.py
@@ -48,7 +48,7 @@ ORDER BY 2 desc""").dicts())
         "domain_count": domain_count,
         "content_type_frequencies": content_type_frequencies,
         "charset_frequencies": charset_frequencies,
-        "domains": sorted(domains),
+        "domains": "",
     }
 
 
@@ -68,9 +68,6 @@ def log_index_statistics(index_statistics, crawl_statistics=None):
         logging.info('Number of broken URLs encountered while crawling: %d',
                      crawl_statistics['broken_url_count'])
 
-    logging.debug('Domains indexed: %s',
-                  index_statistics['domains'])
-
     for entry in index_statistics['content_type_frequencies']:
         logging.info('Number of type "%s" resources indexed: %s',
                      entry['content_type'], entry['count'])
@@ -94,7 +91,7 @@ def serialize_statistics_line(index_statistics, crawl_statistics, was_destructiv
         crawl_statistics["redirect_count"] if crawl_statistics else 0,
         crawl_statistics["redirect_nontrivial_count"] if crawl_statistics else 0,
         crawl_statistics["broken_url_count"] if crawl_statistics else 0,
-        "|".join(index_statistics["domains"]),
+        "",
         "|".join("{}:{}".format(entry["content_type"], entry["count"]) for entry in index_statistics["content_type_frequencies"]),
         "|".join("{}:{}".format(entry["charset"], entry["count"]) for entry in index_statistics["charset_frequencies"]),
     )
@@ -127,7 +124,6 @@ def deserialize_statistics_line(line):
     redirect_count = line_parts[4]
     redirect_nontrivial_count = line_parts[5]
     broken_url_count = line_parts[6]
-    domains = [domain for domain in line_parts[7].split("|")]
     content_type_frequencies = [(entry.split(":")[0], entry.split(":")[1]) for entry in line_parts[8].split("|")]
     charset_frequencies = [(entry.split(":")[0], entry.split(":")[1]) for entry in line_parts[9].split("|")]
 
@@ -138,7 +134,7 @@ def deserialize_statistics_line(line):
         "redirect_count": redirect_count,
         "redirect_nontrivial_count": redirect_nontrivial_count,
         "broken_url_count": broken_url_count,
-        "domains": domains,
+        "domains": "",
         "content_type_frequencies": content_type_frequencies,
         "charset_frequencies": charset_frequencies,
     }
diff --git a/serve/models.py b/serve/models.py
@@ -97,6 +97,18 @@ OR p.content_type IN ('application/atom+xml', 'application/rss+xml'))
 AND p.last_crawl_success_at IS NOT NULL"""
         )
         return feeds_query.execute()
+
+    def get_hosts(self):
+        hosts_query = Page.raw(
+            """
+            SELECT DISTINCT p.domain
+            FROM page AS p
+            WHERE last_crawl_success_at IS NOT NULL
+            AND last_status = 20
+            ORDER BY p.domain
+            """
+        )
+        return hosts_query.execute()
 
     def get_newest_hosts(self):
         newest_hosts_query = Page.raw(
@@ -105,6 +117,7 @@ AND p.last_crawl_success_at IS NOT NULL"""
             FROM page AS p
             WHERE last_crawl_success_at IS NOT NULL
             AND first_seen_at IS NOT NULL
+            AND last_status = 20
             GROUP BY p.domain
             ORDER BY first_seen_at DESC
             LIMIT 10
diff --git a/serve/templates/known_hosts.gmi b/serve/templates/known_hosts.gmi
@@ -9,7 +9,7 @@ Below are the hosts in Geminispace of which geminispace.info is aware. Note that
 {% endif %}
 
 {% for host in known_hosts %}
-{{ "=> gemini://{} {}".format(host, host) }}
+{{ "=> gemini://{} {}".format(host.domain, host.domain) }}
 {% endfor %}
 
 {% include 'fragments/footer.gmi' %}
diff --git a/serve/views.py b/serve/views.py
@@ -109,8 +109,7 @@ def statistics(request):
 def known_hosts(request):
     body = render_template(
         "known_hosts.gmi",
-        # TODO: remove this `sorted` after the next index generation
-        known_hosts=sorted(gus.statistics["domains"]),
+        known_hosts=gus.get_hosts(),
         index_modification_time=gus.statistics["index_modification_time"]
     )
     return Response(Status.SUCCESS, "text/gemini", body)