commit d012217757ba8dfb3f2e6dff0dbdc79aa867ca4e
parent b19d7e8a4c125c7380935bec3384eb8fc53a8ec1
Author: Natalie Pendragon <natpen@natpen.net>
Date: Sun, 19 Jul 2020 08:18:31 -0400
[crawl] Ensure manual exclusions stay out of the database
Diffstat:
1 file changed, 7 insertions(+), 0 deletions(-)
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -234,6 +234,13 @@ def index_links(from_resource, contained_resources):
from_page, created = Page.get_or_create(url=from_resource.indexable_url)
data = []
for cr in contained_resources:
+ should_skip = False
+ for excluded_prefix in EXCLUDED_URL_PREFIXES:
+ if cr.normalized_url.startswith(excluded_prefix):
+ should_skip = True
+ break
+ if should_skip:
+ continue
to_page = Page.get_or_none(url=cr.indexable_url)
if not to_page:
to_page = Page.create(