commit 7ce66303c32c12552db2dedc7e0ab536f85e0915
parent 87d92bbfb37a77d905812d036b79c6f717c5137a
Author: René Wagner <rwa@clttr.info>
Date: Tue, 13 Jul 2021 09:21:06 +0200
remove raw data from excluded capsules
first part of #24
Diffstat:
2 files changed, 10 insertions(+), 1 deletion(-)
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -517,6 +517,15 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
global max_crawl_depth
max_crawl_depth = 500
+ for excluded_prefix in EXCLUDED_URL_PREFIXES:
+ q = Page.delete().where(Page.url.startswith(excluded_prefix))
+ try:
+ count = q.execute()
+ if count > 0:
+ logging.info("Deleted %d rows for excluded URI %s", count, excluded_prefix)
+ except Exception as e:
+ logging.error("Failed to delete rows for %s: %s", excluded_prefix, e)
+
global failure_count
failure_count = {}
expired_resources = [GeminiResource(url) for url in load_expired_urls()]
diff --git a/gus/excludes.py b/gus/excludes.py
@@ -140,7 +140,7 @@ EXCLUDED_URL_PREFIXES = [
"gemini://gemini.cabestan.tk/hn",
"gemini://hn.filiuspatris.net/",
"gemini://schmittstefan.de/de/nachrichten/",
- "gemini://gmi.noulin.net/mobile"
+ "gemini://gmi.noulin.net/mobile",
# wikipedia proxy
"gemini://wp.pitr.ca/",
"gemini://wp.glv.one/",