commit 35ebd8b71ef112145b3ebc6604f8c2e4dd0365af
parent faf87301d736167b8b8e6f14cf4fcf2e0308b5cc
Author: René Wagner <rwa@clttr.info>
Date: Sun, 4 Jun 2023 19:41:54 +0200
fix some issues when deleting outdated pages

- select full Page rows instead of only the url column
- count deletions via the return value of delete_instance()
- run the stale-page cleanup before the per-domain cleanup
- only consider domains that have actually been crawled when looking for outdated domains
- include more context in the error logs
Diffstat:
1 file changed, 18 insertions(+), 21 deletions(-)
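
For context, a minimal, self-contained sketch of the stale-page cleanup pass that this patch introduces, assuming the Page peewee model and the search index object from gus/build_index.py; the wrapper function delete_stale_pages and its days parameter are illustrative only and not part of the commit.

import logging
from datetime import datetime, timedelta

# Assumed to be provided by gus/build_index.py:
#   Page  - peewee model with url, last_crawl_at and last_crawl_success_at
#   index - search index exposing delete_by_term("url_id", <url>)

def delete_stale_pages(index, days=30):
    # Pages whose last successful crawl is older than `days` but which
    # have been recrawled (unsuccessfully) since then.
    cutoff = datetime.now() - timedelta(days=days)
    count = 0
    # Select full rows (as the patch does) so every field of the page
    # is available to the deletion and the error log.
    q = Page.select().where(
        (Page.last_crawl_at > Page.last_crawl_success_at)
        & (Page.last_crawl_success_at < cutoff)
    )
    for page in q.iterator():
        try:
            index.delete_by_term("url_id", page.url)
            # delete_instance() returns the number of rows removed,
            # so it doubles as the counter increment.
            count += page.delete_instance()
        except Exception as e:
            logging.error("Failed to delete row %s with outdated successful crawl: %s", page.id, e)
    logging.warning("Deleted %d rows with outdated successful crawl", count)
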
diff --git a/gus/build_index.py b/gus/build_index.py
@@ -92,19 +92,30 @@ def build_index(should_run_destructive=False):
# delete pages that were never successfully crawled
count=0
- q = Page.select(Page.url).where(Page.last_crawl_success_at.is_null(True) & Page.last_crawl_at.is_null(False))
+ q = Page.select().where(Page.last_crawl_success_at.is_null(True) & Page.last_crawl_at.is_null(False))
for page in q.iterator():
try:
index.delete_by_term("url_id", page.url)
- page.delete_instance()
- count+=1
+ count += page.delete_instance()
except Exception as e:
- logging.error("Failed to delete row %s with outdated successful crawl: %s", page.id, e)
+ logging.error("Failed to delete row %s with outdated successful crawl: %s", page.url, e)
logging.warn("Deleted %d rows without successful crawl", count)
- # delete entry domain that has no page with a recent successfull crawl
+ # delete pages whose last successful crawl is older than 30 days but which have been recrawled since then
+ # this avoids deletion of files that have a change_frequency longer than our timeout
+ count=0
+ q = Page.select().where((Page.last_crawl_at > Page.last_crawl_success_at) & (Page.last_crawl_success_at < (datetime.now() + timedelta(days=-30))))
+ for page in q.iterator():
+ try:
+ index.delete_by_term("url_id", page.url)
+ count += page.delete_instance()
+ except Exception as e:
+ logging.error("Failed to delete row %s with outdated successful crawl: %s", page.id, e)
+ logging.warn("Deleted %d rows with outdated successful crawl", count)
+
+ # delete entire domains that have no page with a recent successful crawl
last_valid_timestamp = datetime.now() - timedelta(days = 30)
- outdated_domains_query = Page.select(Page.domain, fn.MAX(Page.last_crawl_at).alias("last_crawl_at"), fn.MAX(Page.last_crawl_success_at).alias("last_crawl_success_at")).group_by(Page.domain)
+ outdated_domains_query = Page.select(Page.domain, fn.MAX(Page.last_crawl_at).alias("last_crawl_at"), fn.MAX(Page.last_crawl_success_at).alias("last_crawl_success_at")).where(Page.last_crawl_at.is_null(False) & Page.last_crawl_success_at.is_null(False)).group_by(Page.domain)
domains = outdated_domains_query.execute()
for del_domain in domains:
try:
@@ -117,21 +128,7 @@ def build_index(should_run_destructive=False):
index.delete_by_term("url_id", outdated_page.url)
outdated_page.delete_instance()
except Exception as e:
- logging.error("Failed to delete domains with outdated successful crawl: %s", e)
-
- # delete pages with last crawl success older than 30 days which have been recrawled since than
- # this avoids deletion of files that have a change_frequency longer than our timeout
- count=0
- q = Page.select(Page.url).where((Page.last_crawl_at > Page.last_crawl_success_at) & (Page.last_crawl_success_at < (datetime.now() + timedelta(days=-30))))
- for page in q.iterator():
- try:
- index.delete_by_term("url_id", page.url)
- page.delete_instance()
- count+=1
- except Exception as e:
- logging.error("Failed to delete row %s with outdated successful crawl: %s", page.id, e)
-
- logging.warn("Deleted %d rows with outdated successful crawl", count)
+ logging.error("Failed to delete domain %s (last crawl: %s, last crawl success: %s) with outdated successful crawl: %s", del_domain.domain, del_domain.last_crawl_at, del_domain.last_crawl_success_at, e)
if (should_run_destructive):
pages = Page.raw(