commit e2ec756bbdcae99d21e7f7157f9e6fb306981dc3
parent ca95ee48b9c2bfb317202c2500e00b178c9b6666
Author: René Wagner <rwa@clttr.info>
Date: Thu, 26 May 2022 19:42:23 +0200
move data deletion to indexing
Diffstat:
3 files changed, 41 insertions(+), 36 deletions(-)
diff --git a/gus/build_index.py b/gus/build_index.py
@@ -86,6 +86,44 @@ def build_index(should_run_destructive=False):
db = init_db(f"{index_dir}/{constants.DB_FILENAME}")
index = search.Index(index_dir, should_run_destructive)
+ # delete pages that were never crawled successfully
+ count = 0
+ q = Page.select().where(Page.last_crawl_success_at.is_null(True) & Page.last_crawl_at.is_null(False))
+ for page in q.iterator():
+ try:
+ index.delete_by_term("url_id", page.url)
+ page.delete_instance()
+ count += 1
+ except Exception as e:
+ logging.error("Failed to delete row %s without successful crawl: %s", page.id, e)
+
+ logging.warn("Deleted %d rows without successful crawl", count)
+
+ # delete pages whose last successful crawl is more than 30 days old and which have been recrawled since then
+ # this avoids deleting files that have a change_frequency longer than our timeout
+ count = 0
+ q = Page.select().where((Page.last_crawl_at > Page.last_crawl_success_at) & (Page.last_crawl_success_at < (datetime.now() - timedelta(days=30))))
+ for page in q.iterator():
+ try:
+ index.delete_by_term("url_id", page.url)
+ page.delete_instance()
+ count += 1
+ except Exception as e:
+ logging.error("Failed to delete row %s with outdated successful crawl: %s", page.id, e)
+
+ logging.warn("Deleted %d rows with outdated successful crawl", count)
+
if (should_run_destructive):
pages = Page.raw(
"""SELECT p.* FROM page AS p
@@ -105,17 +143,14 @@ OR (p.content_type LIKE 'text/%' AND p.size <= ?))""", constants.MAXIMUM_TEXT_PA
for page in pages.iterator():
index_page(index, page)
+ page.indexed_at = datetime.utcnow()
+ page.save()
try:
logging.info("Commiting search index...")
index.close()
logging.info("Updating raw data...")
- timestamp = datetime.utcnow()
- for page in pages.iterator():
- page.indexed_at = timestamp;
- page.save()
-
except Exception as e:
logging.error('Closing of index failed: %s', e);
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -545,36 +545,6 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
global max_crawl_depth
max_crawl_depth = 500
- # delete pages that never successfull crawled
- q = Page.delete().where(Page.last_crawl_success_at.is_null(True) & Page.last_crawl_at.is_null(False))
- try:
- count = q.execute()
- if count > 0:
- logging.warn("Deleted %d rows without successfull crawl", count)
- except Exception as e:
- logging.error("Failed to delete rows without successfull crawl: %s", e)
-
- # delete pages with last crawl success older than 30 days which have been recrawled since than
- # this avoids deletion of files that have a change_frequency longer than our timeout
- #q = Page.select().where((Page.last_crawl_at > Page.last_crawl_success_at) & (Page.last_crawl_success_at < (datetime.now() + timedelta(days=-30))))
- #try:
- # domains = q.execute()
- # for del_domain in domains:
- # logging.warn("Deleting pages for domain: %s", del_domain.domain)
- # # Page.delete().where(Page.domain = domain)
- #except Exception as e:
- # logging.error("Failed to delete domains with outdated successful crawl: %s", e)
-
- # delete pages with last crawl success older than 30 days which have been recrawled since than
- # this avoids deletion of files that have a change_frequency longer than our timeout
- q = Page.delete().where((Page.last_crawl_at > Page.last_crawl_success_at) & (Page.last_crawl_success_at < (datetime.now() + timedelta(days=-30))))
- try:
- count = q.execute()
- if count > 0:
- logging.warn("Deleted %d rows with outdated successful crawl", count)
- except Exception as e:
- logging.error("Failed to delete rows with outdated successful crawl: %s", e)
-
global failure_count
failure_count = {}
diff --git a/gus/lib/search.py b/gus/lib/search.py
@@ -82,7 +82,7 @@ class Index:
def add_document(self, document):
self._rolling_writer().update_document(**document)
- def delete_by_term(self, key, val): # TODO delete_document
+ def delete_by_term(self, key, val):
self._rolling_writer().delete_by_term(key, val, searcher=None)
def parse_query(self, query):
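
For context, the pattern this commit settles on: a page that drops out has to be removed from both the search index and the database, which is why the cleanup now lives in build_index(), where an open Index handle is available, rather than in run_crawl(). Below is a minimal sketch of that shared pattern; it assumes only the Page model and the Index wrapper shown in the diff above, and the purge() helper itself is hypothetical, not part of this commit.

    import logging

    def purge(index, query, reason):
        """Delete every page matched by `query` from both the search index and the DB."""
        count = 0
        for page in query.iterator():  # peewee: stream rows instead of loading them all
            try:
                index.delete_by_term("url_id", page.url)  # drop the document from the search index
                page.delete_instance()                    # drop the row from the database
                count += 1
            except Exception as e:
                logging.error("Failed to delete row %s (%s): %s", page.id, reason, e)
        logging.warning("Deleted %d pages (%s)", count, reason)

With a helper along these lines, the two deletion blocks added to build_index() above would reduce to two calls that differ only in their query and log text.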