geminispace.info

Gemini search engine
git clone https://git.clttr.info/geminispace.info.git

commit a9806b3f479e017f0555372d95d591ad33f42cf2
parent a8a1abd487ad5dd0587d680a136dda0a8b1fd7a7
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Wed, 15 Jul 2020 09:09:39 -0400

[crawl] Rebuild link table completely and idempotently

Diffstat:
M gus/build_index.py |  2 +-
M gus/crawl.py       | 12 +++++++++++-
2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/gus/build_index.py b/gus/build_index.py
@@ -203,7 +203,7 @@ def parse_args():
         dest="should_run_destructive",
         action="store_true",
         default=False,
-        help="create a fresh index and perform a full Geminispace crawl",
+        help="create a fresh index",
     )
     parser.add_argument(
         "--invalidation_window",
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -264,6 +264,15 @@ def index_links(from_resource, contained_resources):
     Link.insert_many(data).execute()
 
 
+def rebuild_link_table():
+    Link.delete().execute()
+    pages = Page.select().where(Page.content_type == "text/gemini", Page.indexed_at.is_null(False))
+    for page in pages:
+        resource = GeminiResource(page.fetchable_url)
+        contained_resources = resource.extract_contained_resources(page.content)
+        index_links(resource, contained_resources)
+
+
 def get_robots_file(robot_host):
     if robot_host not in robot_file_map:
         print("Requesting robots.txt for {}".format(robot_host))
@@ -382,7 +391,6 @@ def crawl(gemini_resource, current_depth, redirect_chain=[]):
         print("Extracting contained resources...")
         print("--------------------------")
         contained_resources = gr.extract_contained_resources(response.content)
-        index_links(gr, contained_resources)
         for resource in contained_resources:
             crawl(resource, current_depth+1)
     else:
@@ -499,6 +507,7 @@ def recrawl_feeds():
     seed_resources = [GeminiResource(url) for url in content_urls]
     for resource in seed_resources:
         crawl(resource, 0)
+    rebuild_link_table()
     pickle_robot_file_map(robot_file_map, index_dir)
     print(content_urls)
 
@@ -534,6 +543,7 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
     seed_request_resources = [GeminiResource(url) for url in seed_request_urls]
     for resource in seed_request_resources:
         crawl(resource, 0)
+    rebuild_link_table()
     pickle_robot_file_map(robot_file_map, index_dir)
     print("Finished!")
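
Note: the new rebuild_link_table() leans on the project's peewee models (Page and Link, defined elsewhere in the repository, not shown in this patch). The sketch below is an assumption-based approximation of those models, not the actual gus code; only the fields the patch itself touches (fetchable_url, content_type, content, indexed_at) come from the diff, while the foreign keys and database path are hypothetical. It is only meant to show why the delete-then-reinsert rebuild is idempotent: every Link row is re-derived from stored Page content, so rerunning the rebuild after the same crawl leaves the table in the same state.

# Minimal sketch (assumptions, not the real gus model definitions): peewee
# models with only the fields the patch references; the Link foreign keys and
# the database path are guesses for illustration.
from peewee import (
    SqliteDatabase, Model, TextField, DateTimeField, ForeignKeyField,
)

db = SqliteDatabase("gus.sqlite")  # hypothetical database location


class Page(Model):
    fetchable_url = TextField(unique=True)   # used to rebuild a GeminiResource
    content_type = TextField(null=True)      # rebuild only considers text/gemini
    content = TextField(null=True)           # raw gemtext, re-parsed for links
    indexed_at = DateTimeField(null=True)    # null means never indexed, skipped

    class Meta:
        database = db


class Link(Model):
    from_page = ForeignKeyField(Page, backref="outbound_links")  # hypothetical
    to_page = ForeignKeyField(Page, backref="inbound_links")     # hypothetical

    class Meta:
        database = db

Because rebuild_link_table() clears the whole table and re-derives every row from the stored page content, recrawl_feeds() and run_crawl() can call it after any crawl, as often as needed, and end up with the same link graph; that is also why the per-page index_links() call inside crawl() could be dropped.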