geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit 7ce66303c32c12552db2dedc7e0ab536f85e0915
parent 87d92bbfb37a77d905812d036b79c6f717c5137a
Author: René Wagner <rwa@clttr.info>
Date:   Tue, 13 Jul 2021 09:21:06 +0200

remove raw data from excluded capsules

first part of #24

Diffstat:
M gus/crawl.py    | 9 +++++++++
M gus/excludes.py | 2 +-
2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/gus/crawl.py b/gus/crawl.py
@@ -517,6 +517,15 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
     global max_crawl_depth
     max_crawl_depth = 500
 
+    for excluded_prefix in EXCLUDED_URL_PREFIXES:
+        q = Page.delete().where(Page.url.startswith(excluded_prefix))
+        try:
+            count = q.execute()
+            if count > 0:
+                logging.info("Deleted %d rows for excluded URI %s", count, excluded_prefix)
+        except Exception as e:
+            logging.error("Failed to delete rows for %s: %s", excluded_prefix, e)
+
     global failure_count
     failure_count = {}
     expired_resources = [GeminiResource(url) for url in load_expired_urls()]
diff --git a/gus/excludes.py b/gus/excludes.py
@@ -140,7 +140,7 @@ EXCLUDED_URL_PREFIXES = [
     "gemini://gemini.cabestan.tk/hn",
     "gemini://hn.filiuspatris.net/",
     "gemini://schmittstefan.de/de/nachrichten/",
-    "gemini://gmi.noulin.net/mobile"
+    "gemini://gmi.noulin.net/mobile",
     # wikipedia proxy
     "gemini://wp.pitr.ca/",
     "gemini://wp.glv.one/",