commit a9806b3f479e017f0555372d95d591ad33f42cf2
parent a8a1abd487ad5dd0587d680a136dda0a8b1fd7a7
Author: Natalie Pendragon <natpen@natpen.net>
Date: Wed, 15 Jul 2020 09:09:39 -0400
[crawl] Rebuild link table completely and idempotently
Diffstat:
 gus/build_index.py |  2 +-
 gus/crawl.py       | 12 +++++++++++-
2 files changed, 12 insertions(+), 2 deletions(-)
diff --git a/gus/build_index.py b/gus/build_index.py
@@ -203,7 +203,7 @@ def parse_args():
dest="should_run_destructive",
action="store_true",
default=False,
- help="create a fresh index and perform a full Geminispace crawl",
+ help="create a fresh index",
)
parser.add_argument(
"--invalidation_window",
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -264,6 +264,15 @@ def index_links(from_resource, contained_resources):
Link.insert_many(data).execute()
+def rebuild_link_table():
+ Link.delete().execute()
+ pages = Page.select().where(Page.content_type == "text/gemini", Page.indexed_at.is_null(False))
+ for page in pages:
+ resource = GeminiResource(page.fetchable_url)
+ contained_resources = resource.extract_contained_resources(page.content)
+ index_links(resource, contained_resources)
+
+
def get_robots_file(robot_host):
if robot_host not in robot_file_map:
print("Requesting robots.txt for {}".format(robot_host))
@@ -382,7 +391,6 @@ def crawl(gemini_resource, current_depth, redirect_chain=[]):
print("Extracting contained resources...")
print("--------------------------")
contained_resources = gr.extract_contained_resources(response.content)
- index_links(gr, contained_resources)
for resource in contained_resources:
crawl(resource, current_depth+1)
else:
@@ -499,6 +507,7 @@ def recrawl_feeds():
seed_resources = [GeminiResource(url) for url in content_urls]
for resource in seed_resources:
crawl(resource, 0)
+ rebuild_link_table()
pickle_robot_file_map(robot_file_map, index_dir)
print(content_urls)
@@ -534,6 +543,7 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
seed_request_resources = [GeminiResource(url) for url in seed_request_urls]
for resource in seed_request_resources:
crawl(resource, 0)
+ rebuild_link_table()
pickle_robot_file_map(robot_file_map, index_dir)
print("Finished!")