geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit ffbf174790574623380b494ca921f385fb464340
parent cbe22de43a084b7b2c288d5867cf24bbb0ce8414
Author: René Wagner <rwa@clttr.info>
Date:   Fri, 23 Jul 2021 13:11:09 +0200

implemented deletion of outdated data

- pages that never had any successful crawl
- pages with the last successful crawl more than 30 days ago

closes #24

Diffstat:
Mgus/crawl.py | 18+++++++++++++++++-
1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/gus/crawl.py b/gus/crawl.py
@@ -546,10 +546,26 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
         try:
             count = q.execute()
             if count > 0:
-                logging.info("Deleted %d rows for excluded URI %s", count, excluded_prefix)
+                logging.warn("Deleted %d rows for excluded URI %s", count, excluded_prefix)
         except Exception as e:
             logging.error("Failed to delete rows for %s: %s", excluded_prefix, e)
 
+    q = Page.delete().where(Page.last_crawl_success_at.is_null(True) & Page.last_crawl_at.is_null(False))
+    try:
+        count = q.execute()
+        if count > 0:
+            logging.warn("Deleted %d rows without successfull crawl", count)
+    except Exception as e:
+        logging.error("Failed to delete rows without successfull crawl: %s", e)
+
+    q = Page.delete().where((Page.last_crawl_at > Page.last_crawl_success_at) & (Page.last_crawl_success_at < (datetime.now() + timedelta(days=-30))))
+    try:
+        count = q.execute()
+        if count > 0:
+            logging.warn("Deleted %d rows with outdated successful crawl", count)
+    except Exception as e:
+        logging.error("Failed to delete rows with outdated successful crawl: %s", e)
+
     global failure_count
     failure_count = {}
     expired_resources = [GeminiResource(url) for url in load_expired_urls()]