geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git

commit 085471dc9ab17d935871867cdbc717c6b2d1b8fb
parent c7d06895353747e61ab72c85f2f3446abbed4919
Author: René Wagner <rwa@clttr.info>
Date:   Tue, 18 Apr 2023 20:07:24 +0200

fix page deletion

Diffstat:
M gus/build_index.py | 8 +++-----
1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/gus/build_index.py b/gus/build_index.py
@@ -100,17 +100,15 @@ def build_index(should_run_destructive=False):
             count+=1
         except Exception as e:
             logging.error("Failed to delete row %s with outdated successful crawl: %s", page.id, e)
-    logging.warn("Deleted %d rows without successfull crawl", count)
 
-    # delete pages with last crawl success older than 30 days which have been recrawled since than
-    # this avoids deletion of files that have a change_frequency longer than our timeout
+    # delete entry domain that has no page with a recent successfull crawl
     last_valid_timestamp = datetime.now() - timedelta(days = 30)
-    outdated_domains_query = Page.select(Page.domain, fn.MAX(Page.last_crawl_at).alias("last_crawl_at"), fn.MAX(Page.last_crawl_success_at).alias("last_crawl_success_at")).where(Page.last_crawl_at > Page.last_crawl_success_at).group_by(Page.domain)
+    outdated_domains_query = Page.select(Page.domain, fn.MAX(Page.last_crawl_at).alias("last_crawl_at"), fn.MAX(Page.last_crawl_success_at).alias("last_crawl_success_at")).group_by(Page.domain)
     domains = outdated_domains_query.execute()
     for del_domain in domains:
         try:
-            if (del_domain.last_crawl_success_at < last_valid_timestamp):
+            if (del_domain.last_crawl_success_at < last_valid_timestamp and del_domain.last_crawl_at > del_domain.last_crawl_success_at):
                 logging.warn("Deleting pages for domain: %s, last crawl: %s, last crawl success: %s", del_domain.domain, del_domain.last_crawl_at, del_domain.last_crawl_success_at)
                 outdated_pages_query = Page.select(Page.url).where(Page.domain == del_domain.domain)
                 for outdated_page in outdated_pages_query.iterator():
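
The gist of the fix: the old query applied WHERE Page.last_crawl_at > Page.last_crawl_success_at before GROUP BY, so the per-domain maxima were computed only over pages whose latest crawl had failed, and a domain could be purged even though other pages on it had been crawled successfully within the last 30 days. The new query aggregates over all pages of a domain first and applies both conditions to the aggregated values.

Below is a minimal, self-contained sketch of the corrected logic, for illustration only. It uses peewee with an in-memory SQLite database; the Page model, field set, and sample data are simplified stand-ins, not the project's actual schema.

    # sketch.py - illustrates the corrected domain-deletion check.
    # Simplified model and made-up data; not the real geminispace.info schema.
    from datetime import datetime, timedelta

    from peewee import SqliteDatabase, Model, CharField, DateTimeField, fn

    db = SqliteDatabase(":memory:")

    class Page(Model):
        url = CharField()
        domain = CharField()
        last_crawl_at = DateTimeField()
        last_crawl_success_at = DateTimeField()

        class Meta:
            database = db

    db.create_tables([Page])

    now = datetime.now()
    # dead.example: recrawled recently, but the last success was 60 days ago.
    Page.create(url="gemini://dead.example/", domain="dead.example",
                last_crawl_at=now, last_crawl_success_at=now - timedelta(days=60))
    # alive.example: the most recent crawl succeeded.
    Page.create(url="gemini://alive.example/", domain="alive.example",
                last_crawl_at=now, last_crawl_success_at=now)

    last_valid_timestamp = datetime.now() - timedelta(days=30)

    # Aggregate per domain first, with no row filter before GROUP BY ...
    outdated_domains_query = (Page
        .select(Page.domain,
                fn.MAX(Page.last_crawl_at).alias("last_crawl_at"),
                fn.MAX(Page.last_crawl_success_at).alias("last_crawl_success_at"))
        .group_by(Page.domain))
    domains = outdated_domains_query.execute()

    # ... then test both conditions on the aggregated values: the newest
    # success is stale AND a crawl newer than that success exists (it failed).
    for del_domain in domains:
        if (del_domain.last_crawl_success_at < last_valid_timestamp
                and del_domain.last_crawl_at > del_domain.last_crawl_success_at):
            print("would delete pages for domain:", del_domain.domain)

Running the sketch reports only dead.example: its last successful crawl is 60 days old and a later crawl failed, while alive.example is kept because its most recent crawl succeeded.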