geminispace.info

Gemini search engine
git clone https://git.clttr.info/geminispace.info.git

commit a9806b3f479e017f0555372d95d591ad33f42cf2
parent a8a1abd487ad5dd0587d680a136dda0a8b1fd7a7
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Wed, 15 Jul 2020 09:09:39 -0400

[crawl] Rebuild link table completely and idempotently

Diffstat:
M gus/build_index.py |  2 +-
M gus/crawl.py       | 12 +++++++++++-
2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/gus/build_index.py b/gus/build_index.py
@@ -203,7 +203,7 @@ def parse_args():
         dest="should_run_destructive",
         action="store_true",
         default=False,
-        help="create a fresh index and perform a full Geminispace crawl",
+        help="create a fresh index",
     )
     parser.add_argument(
         "--invalidation_window",
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -264,6 +264,15 @@ def index_links(from_resource, contained_resources):
     Link.insert_many(data).execute()
 
 
+def rebuild_link_table():
+    Link.delete().execute()
+    pages = Page.select().where(Page.content_type == "text/gemini", Page.indexed_at.is_null(False))
+    for page in pages:
+        resource = GeminiResource(page.fetchable_url)
+        contained_resources = resource.extract_contained_resources(page.content)
+        index_links(resource, contained_resources)
+
+
 def get_robots_file(robot_host):
     if robot_host not in robot_file_map:
         print("Requesting robots.txt for {}".format(robot_host))
@@ -382,7 +391,6 @@ def crawl(gemini_resource, current_depth, redirect_chain=[]):
         print("Extracting contained resources...")
         print("--------------------------")
         contained_resources = gr.extract_contained_resources(response.content)
-        index_links(gr, contained_resources)
         for resource in contained_resources:
             crawl(resource, current_depth+1)
     else:
@@ -499,6 +507,7 @@ def recrawl_feeds():
     seed_resources = [GeminiResource(url) for url in content_urls]
     for resource in seed_resources:
         crawl(resource, 0)
+    rebuild_link_table()
     pickle_robot_file_map(robot_file_map, index_dir)
     print(content_urls)
 
@@ -534,6 +543,7 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
     seed_request_resources = [GeminiResource(url) for url in seed_request_urls]
     for resource in seed_request_resources:
         crawl(resource, 0)
+    rebuild_link_table()
     pickle_robot_file_map(robot_file_map, index_dir)
     print("Finished!")
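
Note: the new rebuild_link_table() leans on the project's peewee models (Page and Link, defined elsewhere in the repository, not shown in this patch). The sketch below is an assumption-based approximation of those models, not the actual gus code; only the fields the patch itself touches (fetchable_url, content_type, content, indexed_at) come from the diff, while the foreign keys and database path are hypothetical. It is only meant to show why the delete-then-reinsert rebuild is idempotent: every Link row is re-derived from stored Page content, so rerunning the rebuild after the same crawl leaves the table in the same state.

# Minimal sketch (assumptions, not the real gus model definitions): peewee
# models with only the fields the patch references; the Link foreign keys and
# the database path are guesses for illustration.
from peewee import (
    SqliteDatabase, Model, TextField, DateTimeField, ForeignKeyField,
)

db = SqliteDatabase("gus.sqlite")  # hypothetical database location


class Page(Model):
    fetchable_url = TextField(unique=True)   # used to rebuild a GeminiResource
    content_type = TextField(null=True)      # rebuild only considers text/gemini
    content = TextField(null=True)           # raw gemtext, re-parsed for links
    indexed_at = DateTimeField(null=True)    # null means never indexed, skipped

    class Meta:
        database = db


class Link(Model):
    from_page = ForeignKeyField(Page, backref="outbound_links")  # hypothetical
    to_page = ForeignKeyField(Page, backref="inbound_links")     # hypothetical

    class Meta:
        database = db

Because rebuild_link_table() clears the whole table and re-derives every row from the stored page content, recrawl_feeds() and run_crawl() can call it after any crawl, as often as needed, and end up with the same link graph; that is also why the per-page index_links() call inside crawl() could be dropped.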