commit 51729e59c9466cc940a141fddff740df45116028
parent 1a9a58abf2bb4e3fb3fc35b5ba6507289411f446
Author: Natalie Pendragon <natpen@natpen.net>
Date: Wed, 13 May 2020 09:51:06 -0400

[crawl] Refactor manual exclusions and add fgaz's calculator

The calculator seems to generate links dynamically, so attempting to
crawl it yields an unending series of pages linking to ever more
deeply nested mathematical operations.
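
To illustrate the failure mode, here is a hypothetical sketch (not the
calculator's actual code) of why such a capsule defeats a naive crawler:
every page mints links derived from its own URL, so the frontier grows
without bound.

    # Hypothetical sketch: pages whose links are derived from the URL itself
    # form an unbounded tree, so a naive crawl never terminates.
    def links_for(path):
        return [f"{path}{op}/" for op in ("add", "sub", "mul", "div")]

    frontier = ["gemini://gemini.circumlunar.space/users/fgaz/calculator/"]
    for step in range(5):  # bounded here; a real crawl has no such cap
        frontier = [link for page in frontier for link in links_for(page)]
        print(step, len(frontier))  # prints 0 4, 1 16, 2 64 ... exponential growth
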
Diffstat:
1 file changed, 13 insertions(+), 4 deletions(-)
diff --git a/gus/crawl.py b/gus/crawl.py
--- a/gus/crawl.py
+++ b/gus/crawl.py
@@ -52,6 +52,13 @@ SEED_URLS = [
     "gemini://zaibatsu.circumlunar.space",
 ]
 
+EXCLUDED_URL_PREFIXES = [
+    "gemini://example.org",
+    "gemini://example.com",
+    "gemini://gemini.conman.org/test",
+    "gemini://gemini.circumlunar.space/users/fgaz/calculator/",
+]
+
 
 def backup_old_index(index_dir, backup_dir):
     last_index_modification_time = datetime.fromtimestamp(os.path.getmtime(index_dir))
@@ -178,13 +185,15 @@ def crawl_url(url):
     url = urlunparse(u)
     path = u.path.lower().rstrip().rstrip('/')
     normalized_url, normalized_host = normalize_gemini_url(url)
-    if normalized_url is None or \
-       normalized_url.startswith("gemini://example.org") or \
-       normalized_url.startswith("gemini://example.com") or \
-       normalized_url.startswith("gemini://gemini.conman.org/test"):
+    if normalized_url is None:
         print("MANUAL EXCLUSION SKIP : %s" % url)
         print("--------------------------")
         return
+    for url_prefix in EXCLUDED_URL_PREFIXES:
+        if normalized_url.startswith(url_prefix):
+            print("MANUAL EXCLUSION SKIP : %s" % url)
+            print("--------------------------")
+            return
 
     # ROBOTS
     robots_file = get_robots_file(normalized_host)
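
The refactored check can be exercised in isolation. A minimal sketch,
assuming the same EXCLUDED_URL_PREFIXES list and an already-normalized
URL string; is_excluded() is a hypothetical helper, and any() stands in
for the diff's explicit loop:

    # Sketch of the new exclusion logic, using the list from the diff.
    EXCLUDED_URL_PREFIXES = [
        "gemini://example.org",
        "gemini://example.com",
        "gemini://gemini.conman.org/test",
        "gemini://gemini.circumlunar.space/users/fgaz/calculator/",
    ]

    def is_excluded(normalized_url):
        # any() condenses the old chain of startswith() conditions; adding a
        # future exclusion is now a one-line edit to the list above
        return any(normalized_url.startswith(p) for p in EXCLUDED_URL_PREFIXES)

    assert is_excluded("gemini://gemini.circumlunar.space/users/fgaz/calculator/add/2/")
    assert not is_excluded("gemini://zaibatsu.circumlunar.space")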