commit 085471dc9ab17d935871867cdbc717c6b2d1b8fb
parent c7d06895353747e61ab72c85f2f3446abbed4919
Author: René Wagner <rwa@clttr.info>
Date: Tue, 18 Apr 2023 20:07:24 +0200
fix page deletion
Diffstat:
1 file changed, 3 insertions(+), 5 deletions(-)
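The hunk below changes where the "stale domain" comparison happens. The old query filtered pages with last_crawl_at > last_crawl_success_at before grouping, so the per-domain MAX values were computed only over pages whose most recent crawl had failed; a domain with one recently successful page and one long-failing page could still look outdated and get its pages purged. The new query aggregates over all pages of a domain first, and only then requires both that the newest success is older than 30 days and that the domain has been attempted since that success. A minimal, self-contained sketch of the two behaviours (hypothetical in-memory rows standing in for Page, names invented for illustration, not part of the commit):

    from datetime import datetime, timedelta

    now = datetime.now()
    cutoff = now - timedelta(days=30)

    # (domain, last_crawl_at, last_crawl_success_at) -- hypothetical example rows
    pages = [
        ("example.org", now - timedelta(days=1), now - timedelta(days=40)),  # last crawl failed
        ("example.org", now - timedelta(days=2), now - timedelta(days=2)),   # crawled successfully 2 days ago
    ]

    def aggregate(rows):
        # per-domain MAX(last_crawl_at), MAX(last_crawl_success_at), like GROUP BY domain
        by_domain = {}
        for domain, crawl, success in rows:
            best = by_domain.setdefault(domain, [crawl, success])
            best[0] = max(best[0], crawl)
            best[1] = max(best[1], success)
        return by_domain

    def old_logic(rows):
        # old query: WHERE last_crawl_at > last_crawl_success_at, then GROUP BY domain
        failed_only = [r for r in rows if r[1] > r[2]]
        return [d for d, (crawl, success) in aggregate(failed_only).items()
                if success < cutoff]

    def new_logic(rows):
        # new query: GROUP BY domain over all pages, then compare the aggregates
        return [d for d, (crawl, success) in aggregate(rows).items()
                if success < cutoff and crawl > success]

    print(old_logic(pages))  # ['example.org'] -- domain purged despite a recent success
    print(new_logic(pages))  # []              -- domain kept
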
diff --git a/gus/build_index.py b/gus/build_index.py
@@ -100,17 +100,15 @@ def build_index(should_run_destructive=False):
count+=1
except Exception as e:
logging.error("Failed to delete row %s with outdated successful crawl: %s", page.id, e)
-
logging.warn("Deleted %d rows without successfull crawl", count)
- # delete pages with last crawl success older than 30 days which have been recrawled since then
- # this avoids deletion of files that have a change_frequency longer than our timeout
+ # delete all pages of a domain that has no page with a recent successful crawl
last_valid_timestamp = datetime.now() - timedelta(days = 30)
- outdated_domains_query = Page.select(Page.domain, fn.MAX(Page.last_crawl_at).alias("last_crawl_at"), fn.MAX(Page.last_crawl_success_at).alias("last_crawl_success_at")).where(Page.last_crawl_at > Page.last_crawl_success_at).group_by(Page.domain)
+ outdated_domains_query = Page.select(Page.domain, fn.MAX(Page.last_crawl_at).alias("last_crawl_at"), fn.MAX(Page.last_crawl_success_at).alias("last_crawl_success_at")).group_by(Page.domain)
domains = outdated_domains_query.execute()
for del_domain in domains:
try:
- if (del_domain.last_crawl_success_at < last_valid_timestamp):
+ if (del_domain.last_crawl_success_at < last_valid_timestamp and del_domain.last_crawl_at > del_domain.last_crawl_success_at):
logging.warn("Deleting pages for domain: %s, last crawl: %s, last crawl success: %s", del_domain.domain, del_domain.last_crawl_at, del_domain.last_crawl_success_at)
outdated_pages_query = Page.select(Page.url).where(Page.domain == del_domain.domain)
for outdated_page in outdated_pages_query.iterator():