commit 3f7c0f84f9d039d4559225c7e2e97585c7fd8bc0
parent 8b004af54d160a78fdb6d261d4e2478483d2c082
Author: René Wagner <rwa@clttr.info>
Date: Thu, 27 May 2021 15:24:13 +0200
fix wrong embedding of excludes
Diffstat:
2 files changed, 8 insertions(+), 4 deletions(-)
diff --git a/gus/build_index.py b/gus/build_index.py
@@ -120,11 +120,15 @@ GROUP BY p.normalized_url""", constants.MAXIMUM_TEXT_PAGE_SIZE
for page in pages.iterator():
index_page(index, page, indexed_urls)
+ try:
+ index.close()
+ except Exception as e:
+ logging.error('Closing of index failed: %s', e);
+
index_statistics = compute_index_statistics(db)
log_index_statistics(index_statistics)
persist_statistics(index_statistics, None, should_run_destructive, "statistics.csv")
- index.close()
logging.info("Finished!")
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -228,15 +228,15 @@ def index_content(resource, response):
def should_skip(resource):
should_skip = False
- for excluded_prefix in excludes.EXCLUDED_URL_PREFIXES:
+ for excluded_prefix in EXCLUDED_URL_PREFIXES:
if resource.normalized_url.startswith(excluded_prefix):
should_skip = True
break
- for excluded_path in excludes.EXCLUDED_URL_PATHS:
+ for excluded_path in EXCLUDED_URL_PATHS:
if resource.urlsplit.path.lower().endswith(excluded_path):
should_skip = True
break
- m = excludes.EXCLUDED_URL_PATTERN.match(resource.normalized_url)
+ m = EXCLUDED_URL_PATTERN.match(resource.normalized_url)
if m:
should_skip = True
return should_skip