commit e4c042c330023335eeded1d140db47f0f9727cb8
parent 1fedfc3bc57d434f3cddefcbad0a464285c35250
Author: Natalie Pendragon <natpen@natpen.net>
Date: Thu, 21 May 2020 07:33:36 -0400
[crawl] Make path exclusions more robust
Diffstat:
  M gus/crawl.py | 8 ++++----

1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -100,9 +100,9 @@ EXCLUDED_URL_PREFIXES = [
]
EXCLUDED_URL_PATHS = [
- "/atom.xml",
- "/robots.txt",
- "/rss.txt",
+ "atom.xml",
+ "robots.txt",
+ "rss.txt",
]
@@ -215,7 +215,7 @@ def crawl(gemini_resource):
print("--------------------------")
return
for excluded_path in EXCLUDED_URL_PATHS:
- if gr.urlsplit.path.lower() == excluded_path:
+ if gr.urlsplit.path.lower().endswith(excluded_path):
print("MANUAL EXCLUSION SKIP : %s" % gr.fully_qualified_url)
print("--------------------------")
return
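
For context, a self-contained sketch of the patched exclusion check, assuming gr.urlsplit is the result of urllib.parse.urlsplit on the resource URL (an assumption; the surrounding GUS code is not shown in this diff):

    from urllib.parse import urlsplit

    EXCLUDED_URL_PATHS = ["atom.xml", "robots.txt", "rss.txt"]

    def is_excluded(url):
        # Case-insensitive suffix match on the path component,
        # mirroring the check in crawl() after this patch.
        path = urlsplit(url).path.lower()
        return any(path.endswith(p) for p in EXCLUDED_URL_PATHS)

    # Hypothetical capsule URLs:
    print(is_excluded("gemini://example.org/atom.xml"))       # True
    print(is_excluded("gemini://example.org/blog/atom.xml"))  # True (missed before this patch)
    print(is_excluded("gemini://example.org/index.gmi"))      # False
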