geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git

commit 83fe8ef7ac639c7c48d02dd4ff8caa30379f6bfa
parent 84c79072c9ed0f803755cfa337936bbd4e35695a
Author: René Wagner <rwa@clttr.info>
Date:   Tue, 23 Aug 2022 16:47:26 +0200

properly implement deletion of capsules with outdated crawls

We now look at the whole domain and check whether any page of that
domain has been successfully crawled in the last 30 days.
If not, all pages of the domain are removed from the raw data and
the search index.

closes #35
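
In peewee terms, the cleanup described above boils down to the rough sketch below. It is not part of the commit; it assumes the Page model, the index writer with its delete_by_term() method as used in gus/build_index.py, and a hypothetical helper name delete_outdated_domains. The authoritative version is in the diff that follows.

    from datetime import datetime, timedelta

    from peewee import fn

    from gus.lib.db_model import Page


    def delete_outdated_domains(index, days=30):
        cutoff = datetime.now() - timedelta(days=days)
        # one row per domain: its latest crawl attempt and latest successful crawl
        stale = (Page
                 .select(Page.domain,
                         fn.MAX(Page.last_crawl_at).alias("last_crawl_at"),
                         fn.MAX(Page.last_crawl_success_at).alias("last_crawl_success_at"))
                 .where(Page.last_crawl_at > Page.last_crawl_success_at)
                 .group_by(Page.domain))
        for domain in stale:
            if domain.last_crawl_success_at < cutoff:
                # no page of this domain succeeded within the window:
                # drop the whole domain from the raw data and the search index
                for page in Page.select().where(Page.domain == domain.domain).iterator():
                    index.delete_by_term("url_id", page.url)
                    page.delete_instance()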

Diffstat:
M gus/build_index.py     | 27 ++++++++++++++++-----------
M infra/rebuild_index.sh | 19 ++++++++++++-------
2 files changed, 28 insertions(+), 18 deletions(-)

diff --git a/gus/build_index.py b/gus/build_index.py
@@ -3,11 +3,11 @@ import logging

 from datetime import datetime, timedelta
 from urllib.parse import uses_relative, uses_netloc

+from peewee import fn
 from . import constants
 from gus.crawl import should_skip
 from gus.excludes import EXCLUDED_URL_PREFIXES
-from . import constants
 from gus.lib.db_model import init_db, Page
 from gus.lib.gemini import GeminiResource
 from gus.lib.index_statistics import (
@@ -101,14 +101,21 @@ def build_index(should_run_destructive=False):

     # delete pages with last crawl success older than 30 days which have been recrawled since then
     # this avoids deletion of files that have a change_frequency longer than our timeout
-    #q = Page.select().where((Page.last_crawl_at > Page.last_crawl_success_at) & (Page.last_crawl_success_at < (datetime.now() + timedelta(days=-30))))
-    #try:
-    #    domains = q.execute()
-    #    for del_domain in domains:
-    #        logging.warn("Deleting pages for domain: %s", del_domain.domain)
-    #        # Page.delete().where(Page.domain = domain)
-    #except Exception as e:
-    #    logging.error("Failed to delete domains with outdated successful crawl: %s", e)
+    last_valid_timestamp = datetime.now() - timedelta(days=30)
+    outdated_domains_query = Page.select(Page.domain, fn.MAX(Page.last_crawl_at).alias("last_crawl_at"), fn.MAX(Page.last_crawl_success_at).alias("last_crawl_success_at")).where(Page.last_crawl_at > Page.last_crawl_success_at).group_by(Page.domain)
+    domains = outdated_domains_query.execute()
+    for del_domain in domains:
+        try:
+            if del_domain.last_crawl_success_at < last_valid_timestamp:
+                logging.warn("Deleting pages for domain: %s, last crawl: %s, last crawl success: %s", del_domain.domain, del_domain.last_crawl_at, del_domain.last_crawl_success_at)
+                outdated_pages_query = Page.select(Page.url).where(Page.domain == del_domain.domain)
+                for outdated_page in outdated_pages_query.iterator():
+                    # we need to delete every single page, as "delete_by_term" does not work on
+                    # fields that are parsed by a stemmer, like the "domain" text field is
+                    index.delete_by_term("url_id", outdated_page.url)
+                    outdated_page.delete_instance()
+        except Exception as e:
+            logging.error("Failed to delete domains with outdated successful crawl: %s", e)

     # delete pages with last crawl success older than 30 days which have been recrawled since then
     # this avoids deletion of files that have a change_frequency longer than our timeout
@@ -149,8 +156,6 @@ OR (p.content_type LIKE 'text/%' AND p.size <= ?))""", constants.MAXIMUM_TEXT_PA

     try:
         logging.info("Commiting search index...")
         index.close()
-        logging.info("Updating raw data...")
-
     except Exception as e:
         logging.error('Closing of index failed: %s', e);

diff --git a/infra/rebuild_index.sh b/infra/rebuild_index.sh
@@ -1,7 +1,12 @@
-sudo systemctl stop gus
-cp -r /home/gus/index /home/gus/index.new
-sudo systemctl start gus
-/home/gus/.poetry/bin/poetry run build_index -d
-rm -rf /home/gus/index.old
-mv /home/gus/index /home/gus/index.old
-mv /home/gus/index.new /home/gus/index
+if [ `date +%d` == "01" ]
+then
+    /home/gus/.poetry/bin/poetry run build_index
+else
+    sudo systemctl stop gus
+    cp -r /home/gus/index /home/gus/index.new
+    sudo systemctl start gus
+    /home/gus/.poetry/bin/poetry run build_index -d
+    rm -rf /home/gus/index.old
+    mv /home/gus/index /home/gus/index.old
+    mv /home/gus/index.new /home/gus/index
+fi