commit 97b15eaa87fcd5ba7604fd540e64767fdfe6b04c
parent 22d4dcaa8c27660b6845521525b1882af3cf1a20
Author: Natalie Pendragon <natpen@natpen.net>
Date: Tue, 19 May 2020 06:47:51 -0400
[crawl] Crawl the seed requests after the main crawl
Diffstat:
1 file changed, 15 insertions(+), 0 deletions(-)
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -292,6 +292,14 @@ def persist_visited_urls(visited_urls):
     pickle.dump( visited_urls, open( "visited_urls.p", "wb" ) )


+def load_seed_request_urls():
+    with open("seed-requests.txt") as f:
+        content = f.readlines()
+    # remove whitespace characters like `\n` at the end of each line
+    content = [x.strip() for x in content]
+    return content
+
+
 def run_crawl(should_run_destructive=False, seed_urls=[]):
     # TODO: track failed domain/page attempts, and don't reattempt for 15mins
     if should_run_destructive:
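(For reference, the new helper uses the common readlines-then-strip pattern. Below is a minimal, self-contained sketch of its behavior; the seed-requests.txt file name comes from the diff above, while the sample URLs are hypothetical.)

    # Standalone demo of load_seed_request_urls(); the sample URLs
    # written here are hypothetical.
    def load_seed_request_urls():
        with open("seed-requests.txt") as f:
            content = f.readlines()
        # remove whitespace characters like `\n` at the end of each line
        content = [x.strip() for x in content]
        return content

    with open("seed-requests.txt", "w") as f:
        f.write("gemini://example.org/\n")
        f.write("gemini://example.com/docs/\n")

    print(load_seed_request_urls())
    # -> ['gemini://example.org/', 'gemini://example.com/docs/']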
@@ -320,6 +328,13 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
     for resource in seed_resources:
         crawl(resource)

+    # after the full crawl, also crawl the seed requests in case there is
+    # anything new
+    seed_request_urls = load_seed_request_urls()
+    seed_request_resources = [GeminiResource(url) for url in seed_request_urls]
+    for resource in seed_request_resources:
+        crawl(resource)
+
     persist_visited_urls(visited_urls)

     index_statistics = compute_index_statistics("index")
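(Taken together, the two hunks make run_crawl visit the supplied seed URLs, run the full crawl from them, then crawl the seed requests from seed-requests.txt, and finally persist the visited set. The condensed sketch below shows that control flow only; GeminiResource and crawl are reduced to stand-in stubs, since the real crawl fetches pages and follows links, and calling run_crawl() assumes seed-requests.txt exists as in the demo above. This is an illustration, not the actual gus/crawl.py.)

    import pickle

    visited_urls = []

    class GeminiResource:
        # stand-in for the crawler's resource type
        def __init__(self, url):
            self.url = url

    def crawl(resource):
        # stub: the real crawl fetches the page and follows its links
        visited_urls.append(resource.url)

    def load_seed_request_urls():
        with open("seed-requests.txt") as f:
            return [line.strip() for line in f]

    def run_crawl(seed_urls=None):
        seed_resources = [GeminiResource(url) for url in (seed_urls or [])]
        for resource in seed_resources:
            crawl(resource)
        # after the full crawl, also crawl the seed requests
        seed_request_resources = [GeminiResource(url)
                                  for url in load_seed_request_urls()]
        for resource in seed_request_resources:
            crawl(resource)
        pickle.dump(visited_urls, open("visited_urls.p", "wb"))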