geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git

commit 35ebd8b71ef112145b3ebc6604f8c2e4dd0365af
parent faf87301d736167b8b8e6f14cf4fcf2e0308b5cc
Author: René Wagner <rwa@clttr.info>
Date:   Sun,  4 Jun 2023 19:41:54 +0200

fix some issues when deleting outdated pages

Diffstat:
M gus/build_index.py | 39 ++++++++++++++++++---------------------
1 file changed, 18 insertions(+), 21 deletions(-)

diff --git a/gus/build_index.py b/gus/build_index.py
@@ -92,19 +92,30 @@ def build_index(should_run_destructive=False):
 
     # delete pages that never successfull crawled
     count=0
-    q = Page.select(Page.url).where(Page.last_crawl_success_at.is_null(True) & Page.last_crawl_at.is_null(False))
+    q = Page.select().where(Page.last_crawl_success_at.is_null(True) & Page.last_crawl_at.is_null(False))
     for page in q.iterator():
         try:
             index.delete_by_term("url_id", page.url)
-            page.delete_instance()
-            count+=1
+            count += page.delete_instance()
         except Exception as e:
-            logging.error("Failed to delete row %s with outdated successful crawl: %s", page.id, e)
+            logging.error("Failed to delete row %s with outdated successful crawl: %s", page.url, e)
     logging.warn("Deleted %d rows without successfull crawl", count)
 
-    # delete entry domain that has no page with a recent successfull crawl
+    # delete pages with last crawl success older than 30 days which have been recrawled since than
+    # this avoids deletion of files that have a change_frequency longer than our timeout
+    count=0
+    q = Page.select().where((Page.last_crawl_at > Page.last_crawl_success_at) & (Page.last_crawl_success_at < (datetime.now() + timedelta(days=-30))))
+    for page in q.iterator():
+        try:
+            index.delete_by_term("url_id", page.url)
+            count += page.delete_instance()
+        except Exception as e:
+            logging.error("Failed to delete row %s with outdated successful crawl: %s", page.id, e)
+    logging.warn("Deleted %d rows with outdated successful crawl", count)
+
+    # delete entire domain that has no page with a recent successfull crawl
     last_valid_timestamp = datetime.now() - timedelta(days = 30)
-    outdated_domains_query = Page.select(Page.domain, fn.MAX(Page.last_crawl_at).alias("last_crawl_at"), fn.MAX(Page.last_crawl_success_at).alias("last_crawl_success_at")).group_by(Page.domain)
+    outdated_domains_query = Page.select(Page.domain, fn.MAX(Page.last_crawl_at).alias("last_crawl_at"), fn.MAX(Page.last_crawl_success_at).alias("last_crawl_success_at")).where(Page.last_crawl_at.is_null(False) & Page.last_crawl_success_at.is_null(False)).group_by(Page.domain)
     domains = outdated_domains_query.execute()
     for del_domain in domains:
         try:
@@ -117,21 +128,7 @@ def build_index(should_run_destructive=False):
                 index.delete_by_term("url_id", outdated_page.url)
                 outdated_page.delete_instance()
         except Exception as e:
-            logging.error("Failed to delete domains with outdated successful crawl: %s", e)
-
-    # delete pages with last crawl success older than 30 days which have been recrawled since than
-    # this avoids deletion of files that have a change_frequency longer than our timeout
-    count=0
-    q = Page.select(Page.url).where((Page.last_crawl_at > Page.last_crawl_success_at) & (Page.last_crawl_success_at < (datetime.now() + timedelta(days=-30))))
-    for page in q.iterator():
-        try:
-            index.delete_by_term("url_id", page.url)
-            page.delete_instance()
-            count+=1
-        except Exception as e:
-            logging.error("Failed to delete row %s with outdated successful crawl: %s", page.id, e)
-
-    logging.warn("Deleted %d rows with outdated successful crawl", count)
+            logging.error("Failed to delete domain %s (last crawl: %s, last crawl success: %s) with outdated successful crawl: %s", del_domain.domain, del_domain.last_crawl_at, del_domain.last_crawl_success_at, e)
 
     if (should_run_destructive):
         pages = Page.raw(
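
For context on the changes above: in peewee, Model.delete_instance() returns the number of rows removed, which is why the commit can replace the separate count+=1 with count += page.delete_instance(); and switching from Page.select(Page.url) to Page.select() gives each instance its primary key, which delete_instance() needs to target the right row. The following standalone sketch (not part of the repository; it uses an in-memory SQLite database and a simplified stand-in for the real Page model) illustrates that behaviour:

# Standalone sketch, not taken from gus/build_index.py: demonstrates the peewee
# behaviour the commit relies on, using an in-memory SQLite database and a
# simplified Page model assumed here for illustration only.
from datetime import datetime

from peewee import SqliteDatabase, Model, TextField, DateTimeField

db = SqliteDatabase(":memory:")

class Page(Model):
    url = TextField()
    last_crawl_at = DateTimeField(null=True)
    last_crawl_success_at = DateTimeField(null=True)

    class Meta:
        database = db

db.connect()
db.create_tables([Page])
# one page that was crawled but never crawled successfully
Page.create(url="gemini://example.org/", last_crawl_at=datetime.now(),
            last_crawl_success_at=None)

count = 0
# Select full rows (not just Page.url) so each instance carries its primary key
# and delete_instance() can delete the matching row.
q = Page.select().where(Page.last_crawl_success_at.is_null(True)
                        & Page.last_crawl_at.is_null(False))
for page in q.iterator():
    count += page.delete_instance()  # returns the number of rows deleted

print(f"deleted {count} row(s)")  # prints: deleted 1 row(s)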