commit 6871082e39a76cc1ac8a641f50572b318cf0440e
parent 99eadfccc51fc7bd71b66c0966f2e2e1cb6a7694
Author: Natalie Pendragon <natpen@natpen.net>
Date: Mon, 16 Nov 2020 08:44:31 -0500
Take exclusions into account when generating statistics
This will ensure accuracy of the statistics - it's relatively common
that index-excluded content ends up in the database, so this will make
sure the db-based calculations are generally more harmonious with the
index-based calculations/searches.
Note that it's not perfect, since I didn't address the calculations by
content_type/charset/etc. Those are a bit trickier to fix, so I will
have to think a bit more about the best way to deal with that. I
suspect it might warrant a bit of rearchitecting how exclusions work
generally. One idea I currently have for that is to keep the exclusion
list in the database, instead of in code like it currently is - that
would allow for inner joining against an exclusion table in db
queries, which would be really convenient.
Also, this commit removes the superfluous query for getting
domain_count - it's more performant just to count the list of domains
that were already constructed from the previous query.
Diffstat:
1 file changed, 10 insertions(+), 5 deletions(-)
diff --git a/gus/lib/index_statistics.py b/gus/lib/index_statistics.py
@@ -2,6 +2,8 @@ import logging
from datetime import datetime
from peewee import fn, SQL
+
+from gus.crawl import EXCLUDED_URL_PREFIXES
from gus.lib.db_model import Page, Crawl
@@ -23,12 +25,15 @@ GROUP BY p.normalized_url""")
s = d.domain
if d.port != 1965:
s = f"{d.domain}:{d.port}"
+ should_skip = False
+ for excluded_prefix in EXCLUDED_URL_PREFIXES:
+ if f"gemini://{s}".startswith(excluded_prefix):
+ should_skip = True
+ break
+ if should_skip:
+ continue
domains.append(s)
- domain_count = (Page
- .select(Page.domain, Page.port)
- .where(Page.id.in_(valid_page_ids))
- .distinct()
- .count())
+ domain_count = len(domains)
content_type_frequencies = (Page
.select(Page.content_type, fn.Count(Page.content_type).alias("count"))
.where(Page.id.in_(valid_page_ids))