commit 7ce414234e945fb99728b6714db48f02fac0e778
parent c2dd86ae92ca635f111d4b4c9a55464cff893a10
Author: Natalie Pendragon <natpen@natpen.net>
Date: Sat, 16 May 2020 10:57:49 -0400
[crawl] Create non-destructive crawl option
Diffstat:
3 files changed, 33 insertions(+), 7 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -133,3 +133,4 @@ dmypy.json
/seed-requests.txt
/index-statistics.csv
/statistics.csv
+/visited_urls.p
diff --git a/README.md b/README.md
@@ -9,7 +9,7 @@ solution is that we need a way to create a mock index
sooner than later).

1. Get Python and [Poetry](https://python-poetry.org/docs/)
-2. Generate a local Geminispace index with `poetry run crawl --destructive`
+2. Generate a local Geminispace index with `poetry run crawl --destructive`
3. Serve GUS locally with `poetry run serve`

At this point you should be able to interact with a running
@@ -29,10 +29,6 @@ as this guide to [mailing list etiquette](https://man.sr.ht/lists.sr.ht/etiquett
- **log output of crawl**: I see some errors fly by, and it
  would be nice to be able to review later and investigate.
-- **create non-destructive crawl**: it would be nice to be able to run
-  the crawl in a non-destructive way that retains its memory of
-  which sites it has already seen, and only adds new content to
-  the index.
- **get crawl to run on a schedule with systemd**
- **add more statistics**: this could go in the index statistics
  page, and, in addition to using the index itself, could also
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -1,6 +1,8 @@
+import argparse
from datetime import datetime, timedelta
import os
import pathlib
+import pickle
import re
import shutil
import time
@@ -286,10 +288,21 @@ def is_nontrivial_redirect(url, redirect_url):
    return url.rstrip() != redirect_url.rstrip()


+def load_visited_urls():
+    return pickle.load(open("visited_urls.p", "rb"))
+
+
+def persist_visited_urls(visited_urls):
+    pickle.dump(visited_urls, open("visited_urls.p", "wb"))
+
+
def main():
-    create_index(INDEX_DIR)
+    args = parse_args()
+    if args.should_run_destructive:
+        create_index(INDEX_DIR)
+
    global visited_urls
-    visited_urls = []
+    visited_urls = [] if args.should_run_destructive else load_visited_urls()
    global robot_file_map
    robot_file_map = {}
    global domain_hit_timings
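
For context, the two new helpers are a plain pickle round-trip against the `visited_urls.p` file that this commit also adds to `.gitignore`. A minimal standalone sketch of the same idea, with an assumed fallback to an empty list when no previous crawl has persisted anything yet (the patch itself expects the file to already exist), might look like:

    import os
    import pickle

    VISITED_URLS_PATH = "visited_urls.p"  # same file name the patch uses


    def load_visited_urls(path=VISITED_URLS_PATH):
        # Assumption: return an empty list when the pickle file is missing, so
        # a non-destructive crawl can run before any state has been persisted.
        if not os.path.exists(path):
            return []
        with open(path, "rb") as f:
            return pickle.load(f)


    def persist_visited_urls(visited_urls, path=VISITED_URLS_PATH):
        # Write the full list back out so the next non-destructive crawl can
        # skip URLs it has already seen.
        with open(path, "wb") as f:
            pickle.dump(visited_urls, f)
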
@@ -305,6 +318,8 @@ def main():
    for url in SEED_URLS:
        crawl_url(url)

+    persist_visited_urls(visited_urls)
+
    index_statistics = compute_index_statistics("index")
    print("Page count: {}".format(index_statistics["page_count"]))
    print("Domain count: {}".format(index_statistics["domain_count"]))
@@ -320,5 +335,19 @@ def main():
    persist_statistics(index_statistics, crawl_statistics, "statistics.csv")


+def parse_args():
+    parser = argparse.ArgumentParser(description='Crawl Geminispace.')
+    parser.add_argument(
+        "--destructive",
+        "-d",
+        dest="should_run_destructive",
+        action="store_true",
+        help="create a fresh index and perform a full Geminispace crawl",
+    )
+    parser.set_defaults(should_run_destructive=False)
+    args = parser.parse_args()
+    return args
+
+
if __name__ == "__main__":
    main()
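
Taken together, the non-destructive crawl is now the default: `poetry run crawl` reuses the persisted `visited_urls.p`, while `poetry run crawl --destructive` (or `-d`) recreates the index and crawls all of Geminispace from scratch. A quick standalone check of the flag handling, mirroring the parser added above (the explicit `argv` parameter is only there to make the example self-contained; it is not part of the patch):

    import argparse


    def parse_args(argv=None):
        # Same options as the parser added in gus/crawl.py; argv is passed in
        # explicitly so the behaviour can be exercised without sys.argv.
        parser = argparse.ArgumentParser(description="Crawl Geminispace.")
        parser.add_argument(
            "--destructive",
            "-d",
            dest="should_run_destructive",
            action="store_true",
            help="create a fresh index and perform a full Geminispace crawl",
        )
        parser.set_defaults(should_run_destructive=False)
        return parser.parse_args(argv)


    print(parse_args([]).should_run_destructive)                 # False -> incremental crawl
    print(parse_args(["--destructive"]).should_run_destructive)  # True  -> fresh index, full crawl
    print(parse_args(["-d"]).should_run_destructive)             # True  -> short form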