geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git

commit 97b15eaa87fcd5ba7604fd540e64767fdfe6b04c
parent 22d4dcaa8c27660b6845521525b1882af3cf1a20
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Tue, 19 May 2020 06:47:51 -0400

[crawl] Crawl the seed requests after the main crawl

Diffstat:
M gus/crawl.py | 15 +++++++++++++++
1 file changed, 15 insertions(+), 0 deletions(-)

diff --git a/gus/crawl.py b/gus/crawl.py
@@ -292,6 +292,14 @@ def persist_visited_urls(visited_urls):
     pickle.dump( visited_urls, open( "visited_urls.p", "wb" ) )
 
 
+def load_seed_request_urls():
+    with open("seed-requests.txt") as f:
+        content = f.readlines()
+    # remove whitespace characters like `\n` at the end of each line
+    content = [x.strip() for x in content]
+    return content
+
+
 def run_crawl(should_run_destructive=False, seed_urls=[]):
     # TODO: track failed domain/page attempts, and don't reattempt for 15mins
     if should_run_destructive:
@@ -320,6 +328,13 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
     for resource in seed_resources:
         crawl(resource)
 
+    # after full crawl, crawl the seed requests as well in case there is
+    # anything new
+    seed_request_urls = load_seed_request_urls()
+    seed_request_resources = [GeminiResource(url) for url in seed_request_urls]
+    for resource in seed_request_resources:
+        crawl(resource)
+
     persist_visited_urls(visited_urls)
 
     index_statistics = compute_index_statistics("index")
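
For illustration, here is a standalone sketch of the behaviour this commit adds: read one URL per line from seed-requests.txt, then crawl those URLs once the main crawl has finished. load_seed_request_urls follows the patch; GeminiResource and crawl are the repo's own names, but the stub definitions below are hypothetical stand-ins so the sketch runs on its own, and the file-existence guard is a convenience added here, not part of the patch.

import os

def load_seed_request_urls():
    # one URL per line; the guard is sketch-only, the patch assumes the file exists
    if not os.path.exists("seed-requests.txt"):
        return []
    with open("seed-requests.txt") as f:
        content = f.readlines()
    # remove whitespace characters like `\n` at the end of each line
    return [x.strip() for x in content]

class GeminiResource:
    # stand-in: the real class wraps and normalizes a gemini:// URL
    def __init__(self, url):
        self.url = url

def crawl(resource):
    # stand-in: the real function fetches the page and follows its links
    print("crawling", resource.url)

def run_crawl(seed_urls=[]):
    # main crawl over the regular seeds
    for resource in [GeminiResource(url) for url in seed_urls]:
        crawl(resource)
    # after the full crawl, also crawl user-submitted seed requests,
    # in case anything new came in since the last run
    for resource in [GeminiResource(url) for url in load_seed_request_urls()]:
        crawl(resource)

if __name__ == "__main__":
    run_crawl(["gemini://gemini.circumlunar.space/"])

The ordering is the point of the patch: by the time the seed requests are crawled, the main crawl has already recorded its visited URLs, so presumably only genuinely new submissions cost extra work.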