commit ffbf174790574623380b494ca921f385fb464340
parent cbe22de43a084b7b2c288d5867cf24bbb0ce8414
Author: René Wagner <rwa@clttr.info>
Date: Fri, 23 Jul 2021 13:11:09 +0200
implemented deletion of outdated data
- pages that never had a successful crawl
- pages whose last successful crawl was more than 30 days ago
closes #24
Diffstat:
1 file changed, 17 insertions(+), 1 deletion(-)
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -546,10 +546,26 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
try:
count = q.execute()
if count > 0:
- logging.info("Deleted %d rows for excluded URI %s", count, excluded_prefix)
+ logging.warn("Deleted %d rows for excluded URI %s", count, excluded_prefix)
except Exception as e:
logging.error("Failed to delete rows for %s: %s", excluded_prefix, e)
+ q = Page.delete().where(Page.last_crawl_success_at.is_null(True) & Page.last_crawl_at.is_null(False))
+ try:
+ count = q.execute()
+ if count > 0:
+ logging.warn("Deleted %d rows without successfull crawl", count)
+ except Exception as e:
+ logging.error("Failed to delete rows without successfull crawl: %s", e)
+
+ q = Page.delete().where((Page.last_crawl_at > Page.last_crawl_success_at) & (Page.last_crawl_success_at < (datetime.now() + timedelta(days=-30))))
+ try:
+ count = q.execute()
+ if count > 0:
+ logging.warn("Deleted %d rows with outdated successful crawl", count)
+ except Exception as e:
+ logging.error("Failed to delete rows with outdated successful crawl: %s", e)
+
global failure_count
failure_count = {}
expired_resources = [GeminiResource(url) for url in load_expired_urls()]