commit a7ea73424895a1e73d0bcc7ea2dc1e6d28257ec6
parent cde47da62cb626be45196e66f8f5b8125a4b4827
Author: Natalie Pendragon <natpen@natpen.net>
Date: Fri, 22 May 2020 07:31:18 -0400
[crawl] pickle and unpickle the robot_file_map
This way we don't have to re-request all the robots.txt files during
incremental crawls.
Diffstat:
2 files changed, 14 insertions(+), 3 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -132,6 +132,4 @@ dmypy.json
/index.bak/
/seed-requests.txt
-/index-statistics.csv
/statistics.csv
-/visited_urls.p
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -319,6 +319,17 @@ def load_visited_urls(index_dir):
    return visited_urls
+def pickle_robot_file_map(robot_file_map, index_dir):
+    pickle.dump(robot_file_map, open(index_dir + "/robot_file_map.p", "wb"))
+
+
+def unpickle_robot_file_map(index_dir):
+    if not os.path.isfile(index_dir + "/robot_file_map.p"):
+        print("Unable to find robot_file_map.p file to unpickle")
+        return {}
+    return pickle.load(open(index_dir + "/robot_file_map.p", "rb"))
+
+
def load_seed_request_urls():
    with open("seed-requests.txt") as f:
        content = f.readlines()
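The two helpers added above assume pickle and os are already imported at the top of gus/crawl.py, and they open the pickle file without a context manager. A minimal standalone sketch of the same idea, written defensively with with-blocks and a guard against a corrupt pickle file (the robot_file_map.p name mirrors the diff; the error handling is an illustrative assumption, not part of this commit):

import os
import pickle


def pickle_robot_file_map(robot_file_map, index_dir):
    # Persist the in-memory robots.txt cache so the next incremental
    # crawl can reuse it instead of re-requesting every robots.txt.
    with open(os.path.join(index_dir, "robot_file_map.p"), "wb") as f:
        pickle.dump(robot_file_map, f)


def unpickle_robot_file_map(index_dir):
    # Restore the cache from the previous crawl, falling back to an
    # empty map if the file is missing or unreadable.
    path = os.path.join(index_dir, "robot_file_map.p")
    if not os.path.isfile(path):
        print("Unable to find robot_file_map.p file to unpickle")
        return {}
    try:
        with open(path, "rb") as f:
            return pickle.load(f)
    except (pickle.UnpicklingError, EOFError):
        print("Found robot_file_map.p but could not unpickle it")
        return {}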
@@ -341,7 +352,7 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
    global visited_urls
    visited_urls = [] if should_run_destructive else load_visited_urls(INDEX_DIR_CURRENT)
    global robot_file_map
-    robot_file_map = {}
+    robot_file_map = {} if should_run_destructive else unpickle_robot_file_map(INDEX_DIR_CURRENT)
    global domain_hit_timings
    domain_hit_timings = {}
    global crawl_statistics
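Neither hunk shows how crawl() consumes robot_file_map, but the point of restoring it here is that it acts as a per-domain cache of robots.txt responses that now survives across incremental runs. A hypothetical sketch of that lookup pattern (the function name and the fetch callback are illustrative assumptions, not code from this commit):

def get_robots_txt(domain, robot_file_map, fetch):
    # Return the robots.txt body for a domain, fetching it at most once
    # per crawl; entries unpickled from the previous run are reused as-is.
    if domain not in robot_file_map:
        robot_file_map[domain] = fetch(domain)
    return robot_file_map[domain]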
@@ -364,6 +375,8 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
    for resource in seed_request_resources:
        crawl(resource)
+    pickle_robot_file_map(robot_file_map, index_dir)
+
    index_statistics = compute_index_statistics(index_dir)
    print_index_statistics(index_statistics, crawl_statistics)
    if should_run_destructive:
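At the call site nothing changes: a destructive crawl still starts from an empty map (and writes a fresh robot_file_map.p at the end of the run), while the default incremental crawl now reuses the pickled one. A usage sketch, assuming run_crawl is invoked directly rather than through the project's usual entry point:

from gus.crawl import run_crawl

# Incremental crawl: reuses visited_urls and the unpickled robot_file_map.
run_crawl()

# Destructive crawl: starts from scratch and re-requests every robots.txt.
run_crawl(should_run_destructive=True)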