geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git

commit 3cdc5efc5a5e8015b2f3066a06df3c590884db89
parent a61c96151612747bbcbe6e72380ad792621a5ce0
Author: René Wagner <rwa@clttr.info>
Date:   Wed, 29 Dec 2021 10:59:46 +0100

don't delete excluded pages from the pages table

Otherwise we lose external backlinks to these pages
as well, which might be useful.

Diffstat:
M gus/crawl.py | 10 ----------
1 file changed, 0 insertions(+), 10 deletions(-)

diff --git a/gus/crawl.py b/gus/crawl.py
@@ -539,16 +539,6 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
     global max_crawl_depth
     max_crawl_depth = 500
 
-    # delete pages from excludes
-    for excluded_prefix in EXCLUDED_URL_PREFIXES:
-        q = Page.delete().where(Page.url.startswith(excluded_prefix))
-        try:
-            count = q.execute()
-            if count > 0:
-                logging.warn("Deleted %d rows for excluded URI %s", count, excluded_prefix)
-        except Exception as e:
-            logging.error("Failed to delete rows for %s: %s", excluded_prefix, e)
-
     # delete pages that never successfull crawled
     q = Page.delete().where(Page.last_crawl_success_at.is_null(True) & Page.last_crawl_at.is_null(False))
     try:
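
For context, the sketch below illustrates why deleting Page rows would also
discard backlink data. It uses peewee (the ORM behind the Page.delete() calls
in crawl.py), but the Link model and its from_page/to_page fields are
assumptions for illustration, not necessarily the project's actual schema:

# Minimal sketch, assuming a peewee schema in the spirit of crawl.py.
# The Link model and its field names (from_page, to_page) are hypothetical.
from peewee import SqliteDatabase, Model, TextField, ForeignKeyField

db = SqliteDatabase(":memory:")

class Page(Model):
    url = TextField(unique=True)
    class Meta:
        database = db

class Link(Model):
    # A backlink record: from_page links to to_page.
    from_page = ForeignKeyField(Page, backref="outbound")
    to_page = ForeignKeyField(Page, backref="backlinks")
    class Meta:
        database = db

db.create_tables([Page, Link])

src = Page.create(url="gemini://example.org/")
dst = Page.create(url="gemini://excluded.example/page")
Link.create(from_page=src, to_page=dst)

# The behaviour this commit removes: deleting the excluded page's row
# would leave the Link row dangling, so the backlink is effectively lost.
# Page.delete().where(Page.url.startswith("gemini://excluded.example")).execute()

# Keeping the row preserves the backlink query:
print([l.from_page.url for l in dst.backlinks])  # ['gemini://example.org/']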