geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git

commit 51729e59c9466cc940a141fddff740df45116028
parent 1a9a58abf2bb4e3fb3fc35b5ba6507289411f446
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Wed, 13 May 2020 09:51:06 -0400

[crawl] Refactor manual exclusions and add fgaz' calculator

The calculator seems to generate links dynamically, so attempting to
crawl it will yield unending pages with links to more deeply-nested
mathematical operations.
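
As a rough illustration of the approach this commit takes, a standalone prefix
check might look like the sketch below. The is_excluded helper is hypothetical
and only mirrors the logic added to gus/crawl.py in the diff that follows.

    EXCLUDED_URL_PREFIXES = [
        "gemini://example.org",
        "gemini://example.com",
        "gemini://gemini.conman.org/test",
        "gemini://gemini.circumlunar.space/users/fgaz/calculator/",
    ]

    def is_excluded(normalized_url):
        # Skip any URL that falls under a manually excluded prefix, e.g. the
        # calculator capsule, whose dynamically generated links never end.
        return any(normalized_url.startswith(prefix)
                   for prefix in EXCLUDED_URL_PREFIXES)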

Diffstat:
M gus/crawl.py | 17 +++++++++++++----
1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/gus/crawl.py b/gus/crawl.py
@@ -52,6 +52,13 @@ SEED_URLS = [
     "gemini://zaibatsu.circumlunar.space",
 ]
 
+EXCLUDED_URL_PREFIXES = [
+    "gemini://example.org",
+    "gemini://example.com",
+    "gemini://gemini.conman.org/test",
+    "gemini://gemini.circumlunar.space/users/fgaz/calculator/",
+]
+
 
 def backup_old_index(index_dir, backup_dir):
     last_index_modification_time = datetime.fromtimestamp(os.path.getmtime(index_dir))
@@ -178,13 +185,15 @@ def crawl_url(url):
         url = urlunparse(u)
     path = u.path.lower().rstrip().rstrip('/')
     normalized_url, normalized_host = normalize_gemini_url(url)
-    if normalized_url is None or \
-            normalized_url.startswith("gemini://example.org") or \
-            normalized_url.startswith("gemini://example.com") or \
-            normalized_url.startswith("gemini://gemini.conman.org/test"):
+    if normalized_url is None:
         print("MANUAL EXCLUSION SKIP : %s" % url)
         print("--------------------------")
         return
+    for url_prefix in EXCLUDED_URL_PREFIXES:
+        if normalized_url.startswith(url_prefix):
+            print("MANUAL EXCLUSION SKIP : %s" % url)
+            print("--------------------------")
+            return
 
     # ROBOTS
     robots_file = get_robots_file(normalized_host)
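
With the exclusions collected in one list, excluding another capsule later
should only require adding a single prefix. A hypothetical future addition
(the URL below is made up for illustration and is not part of this commit)
would look like:

    EXCLUDED_URL_PREFIXES.append("gemini://another.dynamic.capsule/")  # hypothetical example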