geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit d012217757ba8dfb3f2e6dff0dbdc79aa867ca4e
parent b19d7e8a4c125c7380935bec3384eb8fc53a8ec1
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Sun, 19 Jul 2020 08:18:31 -0400

[crawl] Ensure manual exclusions stay out of the database

Diffstat:
M gus/crawl.py | 7 +++++++
1 file changed, 7 insertions(+), 0 deletions(-)

diff --git a/gus/crawl.py b/gus/crawl.py
@@ -234,6 +234,13 @@ def index_links(from_resource, contained_resources):
     from_page, created = Page.get_or_create(url=from_resource.indexable_url)
     data = []
     for cr in contained_resources:
+        should_skip = False
+        for excluded_prefix in EXCLUDED_URL_PREFIXES:
+            if cr.normalized_url.startswith(excluded_prefix):
+                should_skip = True
+                break
+        if should_skip:
+            continue
         to_page = Page.get_or_none(url=cr.indexable_url)
         if not to_page:
             to_page = Page.create(