geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git

commit a7ea73424895a1e73d0bcc7ea2dc1e6d28257ec6
parent cde47da62cb626be45196e66f8f5b8125a4b4827
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Fri, 22 May 2020 07:31:18 -0400

[crawl] pickle and unpickle the robot_file_map

This way we don't have to re-request all the robots.txt files during
incremental crawls
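
As a rough sketch of the caching pattern this commit introduces (the names INDEX_DIR, ROBOT_MAP_FILE, fetch_robots_txt and robots_txt_for below are hypothetical stand-ins for illustration, not part of the GUS codebase), an incremental crawl can reload the previously pickled map and only fetch robots.txt for hosts it has not yet seen:

import os
import pickle

# Hypothetical names for illustration only; GUS has its own index layout and fetch logic.
INDEX_DIR = "index"
ROBOT_MAP_FILE = "robot_file_map.p"

def fetch_robots_txt(domain):
    # Placeholder for a real gemini://<domain>/robots.txt request.
    return ""

def load_robot_file_map(index_dir):
    # Reuse the map persisted by the previous crawl, if one exists.
    path = os.path.join(index_dir, ROBOT_MAP_FILE)
    if not os.path.isfile(path):
        return {}
    with open(path, "rb") as f:
        return pickle.load(f)

def save_robot_file_map(robot_file_map, index_dir):
    # Persist the map so the next incremental crawl can skip these requests.
    with open(os.path.join(index_dir, ROBOT_MAP_FILE), "wb") as f:
        pickle.dump(robot_file_map, f)

def robots_txt_for(domain, robot_file_map):
    # Only hit the network for domains not already cached from an earlier crawl.
    if domain not in robot_file_map:
        robot_file_map[domain] = fetch_robots_txt(domain)
    return robot_file_map[domain]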

Diffstat:
M .gitignore   |  2 --
M gus/crawl.py | 15 ++++++++++++++-

2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/.gitignore b/.gitignore
@@ -132,6 +132,4 @@ dmypy.json
 
 /index.bak/
 /seed-requests.txt
-/index-statistics.csv
 /statistics.csv
-/visited_urls.p
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -319,6 +319,17 @@ def load_visited_urls(index_dir):
     return visited_urls
 
 
+def pickle_robot_file_map(robot_file_map, index_dir):
+    pickle.dump(robot_file_map, open(index_dir + "/robot_file_map.p", "wb"))
+
+
+def unpickle_robot_file_map(index_dir):
+    if not os.path.isfile(index_dir + "/robot_file_map.p"):
+        print("Unable to find robot_file_map.p file to unpickle")
+        return {}
+    return pickle.load(open(index_dir + "/robot_file_map.p", "rb"))
+
+
 def load_seed_request_urls():
     with open("seed-requests.txt") as f:
         content = f.readlines()
@@ -341,7 +352,7 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
     global visited_urls
     visited_urls = [] if should_run_destructive else load_visited_urls(INDEX_DIR_CURRENT)
     global robot_file_map
-    robot_file_map = {}
+    robot_file_map = {} if should_run_destructive else unpickle_robot_file_map(INDEX_DIR_CURRENT)
     global domain_hit_timings
     domain_hit_timings = {}
     global crawl_statistics
@@ -364,6 +375,8 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
     for resource in seed_request_resources:
         crawl(resource)
 
+    pickle_robot_file_map(robot_file_map, index_dir)
+
     index_statistics = compute_index_statistics(index_dir)
     print_index_statistics(index_statistics, crawl_statistics)
     if should_run_destructive:
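
A small design note on the helpers in this diff: they hand bare open() file objects to pickle and rely on garbage collection to close them. A context-manager variant (a sketch, not part of this commit) closes the handles deterministically while keeping the same behaviour:

import os
import pickle

def pickle_robot_file_map(robot_file_map, index_dir):
    # Same behaviour as the committed helper, but the file is closed explicitly.
    with open(os.path.join(index_dir, "robot_file_map.p"), "wb") as f:
        pickle.dump(robot_file_map, f)

def unpickle_robot_file_map(index_dir):
    path = os.path.join(index_dir, "robot_file_map.p")
    if not os.path.isfile(path):
        print("Unable to find robot_file_map.p file to unpickle")
        return {}
    with open(path, "rb") as f:
        return pickle.load(f)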