geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit e4c042c330023335eeded1d140db47f0f9727cb8
parent 1fedfc3bc57d434f3cddefcbad0a464285c35250
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Thu, 21 May 2020 07:33:36 -0400

[crawl] Make path exclusions more robust

Diffstat:
M gus/crawl.py | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/gus/crawl.py b/gus/crawl.py
@@ -100,9 +100,9 @@ EXCLUDED_URL_PREFIXES = [
 ]

 EXCLUDED_URL_PATHS = [
-    "/atom.xml",
-    "/robots.txt",
-    "/rss.txt",
+    "atom.xml",
+    "robots.txt",
+    "rss.txt",
 ]
@@ -215,7 +215,7 @@ def crawl(gemini_resource):
         print("--------------------------")
         return
     for excluded_path in EXCLUDED_URL_PATHS:
-        if gr.urlsplit.path.lower() == excluded_path:
+        if gr.urlsplit.path.lower().endswith(excluded_path):
             print("MANUAL EXCLUSION SKIP : %s" % gr.fully_qualified_url)
             print("--------------------------")
             return