commit 3cdc5efc5a5e8015b2f3066a06df3c590884db89
parent a61c96151612747bbcbe6e72380ad792621a5ce0
Author: René Wagner <rwa@clttr.info>
Date: Wed, 29 Dec 2021 10:59:46 +0100
don't delete excluded pages from the pages table
Otherwise we lose external backlinks to these pages as well,
which might be useful.
Diffstat:
1 file changed, 0 insertions(+), 10 deletions(-)
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -539,16 +539,6 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
global max_crawl_depth
max_crawl_depth = 500
- # delete pages from excludes
- for excluded_prefix in EXCLUDED_URL_PREFIXES:
- q = Page.delete().where(Page.url.startswith(excluded_prefix))
- try:
- count = q.execute()
- if count > 0:
- logging.warn("Deleted %d rows for excluded URI %s", count, excluded_prefix)
- except Exception as e:
- logging.error("Failed to delete rows for %s: %s", excluded_prefix, e)
-
# delete pages that never successfull crawled
q = Page.delete().where(Page.last_crawl_success_at.is_null(True) & Page.last_crawl_at.is_null(False))
try: