geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit 3f7c0f84f9d039d4559225c7e2e97585c7fd8bc0
parent 8b004af54d160a78fdb6d261d4e2478483d2c082
Author: René Wagner <rwa@clttr.info>
Date:   Thu, 27 May 2021 15:24:13 +0200

fix wrong embedding of excludes

Diffstat:
M gus/build_index.py | 6 +++++-
M gus/crawl.py       | 6 +++---
2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/gus/build_index.py b/gus/build_index.py
@@ -120,11 +120,15 @@ GROUP BY p.normalized_url""",
         constants.MAXIMUM_TEXT_PAGE_SIZE
     for page in pages.iterator():
         index_page(index, page, indexed_urls)
+    try:
+        index.close()
+    except Exception as e:
+        logging.error('Closing of inde failed: %s', e);
+
     index_statistics = compute_index_statistics(db)
     log_index_statistics(index_statistics)
     persist_statistics(index_statistics, None, should_run_destructive, "statistics.csv")
-    index.close()
     logging.info("Finished!")
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -228,15 +228,15 @@ def index_content(resource, response):
 def should_skip(resource):
     should_skip = False
-    for excluded_prefix in excludes.EXCLUDED_URL_PREFIXES:
+    for excluded_prefix in EXCLUDED_URL_PREFIXES:
         if resource.normalized_url.startswith(excluded_prefix):
             should_skip = True
             break
-    for excluded_path in excludes.EXCLUDED_URL_PATHS:
+    for excluded_path in EXCLUDED_URL_PATHS:
         if resource.urlsplit.path.lower().endswith(excluded_path):
             should_skip = True
             break
-    m = excludes.EXCLUDED_URL_PATTERN.match(resource.normalized_url)
+    m = EXCLUDED_URL_PATTERN.match(resource.normalized_url)
     if m:
         should_skip = True
     return should_skip