commit 7ce414234e945fb99728b6714db48f02fac0e778
parent c2dd86ae92ca635f111d4b4c9a55464cff893a10
Author: Natalie Pendragon <natpen@natpen.net>
Date: Sat, 16 May 2020 10:57:49 -0400
[crawl] Create non-destructive crawl option
Diffstat:
3 files changed, 33 insertions(+), 7 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -133,3 +133,4 @@ dmypy.json
/seed-requests.txt
/index-statistics.csv
/statistics.csv
+/visited_urls.p
diff --git a/README.md b/README.md
@@ -9,7 +9,7 @@ solution is that we need a way to create a mock index
sooner than later).

1. Get Python and [Poetry](https://python-poetry.org/docs/)
-2. Generate a local Geminispace index with `poetry run crawl --destructive`
+2. Generate a local Geminispace index with `poetry run crawl --destructive`
3. Serve GUS locally with `poetry run serve`

At this point you should be able to interact with a running
@@ -29,10 +29,6 @@ as this guide to [mailing list etiquette](https://man.sr.ht/lists.sr.ht/etiquett
- **log output of crawl**: I see some errors fly by, and it
  would be nice to be able to review later and investigate.
-- **create non-destructive crawl**: it would be nice to be able to run
-  the crawl in a non-destructive way that retains its memory of
-  which sites it has already seen, and only adds new content to
-  the index.
- **get crawl to run on a schedule with systemd**
- **add more statistics**: this could go in the index statistics
  page, and, in addition to using the index itself, could also
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -1,6 +1,8 @@
+import argparse
from datetime import datetime, timedelta
import os
import pathlib
+import pickle
import re
import shutil
import time
@@ -286,10 +288,21 @@ def is_nontrivial_redirect(url, redirect_url):
    return url.rstrip() != redirect_url.rstrip()


+def load_visited_urls():
+    return pickle.load(open("visited_urls.p", "rb"))
+
+
+def persist_visited_urls(visited_urls):
+    pickle.dump(visited_urls, open("visited_urls.p", "wb"))
+
+
def main():
-    create_index(INDEX_DIR)
+    args = parse_args()
+    if args.should_run_destructive:
+        create_index(INDEX_DIR)
+
    global visited_urls
-    visited_urls = []
+    visited_urls = [] if args.should_run_destructive else load_visited_urls()
    global robot_file_map
    robot_file_map = {}
    global domain_hit_timings
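
For context, the two new helpers are a plain pickle round-trip against the `visited_urls.p` file that this commit also adds to `.gitignore`. A minimal standalone sketch of the same idea, with an assumed fallback to an empty list when no previous crawl has persisted anything yet (the patch itself expects the file to already exist), might look like:

    import os
    import pickle

    VISITED_URLS_PATH = "visited_urls.p"  # same file name the patch uses


    def load_visited_urls(path=VISITED_URLS_PATH):
        # Assumption: return an empty list when the pickle file is missing, so
        # a non-destructive crawl can run before any state has been persisted.
        if not os.path.exists(path):
            return []
        with open(path, "rb") as f:
            return pickle.load(f)


    def persist_visited_urls(visited_urls, path=VISITED_URLS_PATH):
        # Write the full list back out so the next non-destructive crawl can
        # skip URLs it has already seen.
        with open(path, "wb") as f:
            pickle.dump(visited_urls, f)
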
@@ -305,6 +318,8 @@ def main():
    for url in SEED_URLS:
        crawl_url(url)

+    persist_visited_urls(visited_urls)
+
    index_statistics = compute_index_statistics("index")
    print("Page count: {}".format(index_statistics["page_count"]))
    print("Domain count: {}".format(index_statistics["domain_count"]))
@@ -320,5 +335,19 @@ def main():
    persist_statistics(index_statistics, crawl_statistics, "statistics.csv")


+def parse_args():
+    parser = argparse.ArgumentParser(description='Crawl Geminispace.')
+    parser.add_argument(
+        "--destructive",
+        "-d",
+        dest="should_run_destructive",
+        action="store_true",
+        help="create a fresh index and perform a full Geminispace crawl",
+    )
+    parser.set_defaults(should_run_destructive=False)
+    args = parser.parse_args()
+    return args
+
+
if __name__ == "__main__":
    main()
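
Taken together, the non-destructive crawl is now the default: `poetry run crawl` reuses the persisted `visited_urls.p`, while `poetry run crawl --destructive` (or `-d`) recreates the index and crawls all of Geminispace from scratch. A quick standalone check of the flag handling, mirroring the parser added above (the explicit `argv` parameter is only there to make the example self-contained; it is not part of the patch):

    import argparse


    def parse_args(argv=None):
        # Same options as the parser added in gus/crawl.py; argv is passed in
        # explicitly so the behaviour can be exercised without sys.argv.
        parser = argparse.ArgumentParser(description="Crawl Geminispace.")
        parser.add_argument(
            "--destructive",
            "-d",
            dest="should_run_destructive",
            action="store_true",
            help="create a fresh index and perform a full Geminispace crawl",
        )
        parser.set_defaults(should_run_destructive=False)
        return parser.parse_args(argv)


    print(parse_args([]).should_run_destructive)                 # False -> incremental crawl
    print(parse_args(["--destructive"]).should_run_destructive)  # True  -> fresh index, full crawl
    print(parse_args(["-d"]).should_run_destructive)             # True  -> short form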