commit 8d3cdb64054333f937395b0056617754477dbf8f
parent d71097373e04c47f1364e02153f263476a946104
Author: René Wagner <rwa@clttr.info>
Date: Sun, 8 May 2022 19:29:48 +0200
switch SQLite to WAL mode
Diffstat:
5 files changed, 21 insertions(+), 9 deletions(-)
diff --git a/gus/build_index.py b/gus/build_index.py
@@ -83,9 +83,7 @@ AND l.is_cross_host_like == 1""",
def build_index(should_run_destructive=False):
index_dir = constants.INDEX_DIR_NEW if should_run_destructive else constants.INDEX_DIR
- db = init_db(f"{index_dir}/{constants.DB_FILENAME}", pragmas={
- 'journal_mode': 'wal',
- 'cache_size': -128 * 1000})
+ db = init_db(f"{index_dir}/{constants.DB_FILENAME}")
index = search.Index(index_dir, should_run_destructive)
if (should_run_destructive):
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -531,7 +531,6 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
pathlib.Path(index_dir).mkdir(parents=True, exist_ok=True)
global db
db = init_db(f"{index_dir}/{constants.DB_FILENAME}")
-
global robot_file_map
robot_file_map = {}
global domain_hit_timings
@@ -550,6 +549,17 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
# delete pages with last crawl success older than 30 days which have been recrawled since then
# this avoids deletion of files that have a change_frequency longer than our timeout
+ #q = Page.select().where((Page.last_crawl_at > Page.last_crawl_success_at) & (Page.last_crawl_success_at < (datetime.now() + timedelta(days=-30))))
+ #try:
+ # domains = q.execute()
+ # for del_domain in domains:
+ # logging.warning("Deleting pages for domain: %s", del_domain.domain)
+ # # Page.delete().where(Page.domain == del_domain.domain)
+ #except Exception as e:
+ # logging.error("Failed to delete domains with outdated successful crawl: %s", e)
+
+ # delete pages with last crawl success older than 30 days which have been recrawled since then
+ # this avoids deletion of files that have a change_frequency longer than our timeout
q = Page.delete().where((Page.last_crawl_at > Page.last_crawl_success_at) & (Page.last_crawl_success_at < (datetime.now() + timedelta(days=-30))))
try:
count = q.execute()
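
Note: the delete above removes pages whose last successful crawl is more than 30 days old but which have been crawled again since, so pages with a change_frequency longer than the timeout are not dropped prematurely. A minimal standalone sketch of the same query (the index path is illustrative; Page and init_db come from gus.lib.db_model):

from datetime import datetime, timedelta
from gus.lib.db_model import Page, init_db

# Sketch: identical predicate to the query above, written with an
# explicit cutoff instead of a negative timedelta.
init_db("index/gus.sqlite")  # illustrative path, not the real constant
cutoff = datetime.now() - timedelta(days=30)
deleted = (Page.delete()
           .where((Page.last_crawl_at > Page.last_crawl_success_at)
                  & (Page.last_crawl_success_at < cutoff))
           .execute())  # peewee returns the number of rows deleted
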
diff --git a/gus/lib/db_model.py b/gus/lib/db_model.py
@@ -18,7 +18,11 @@ def init_db(filename=":memory:"):
Bind an SQLite database to the Peewee ORM models.
"""
models = [Link, Page]
- db = SqliteDatabase(filename)
+ db = SqliteDatabase(filename, pragmas={
+ 'journal_mode': 'wal',
+ 'cache_size': -128 * 1000,
+ 'foreign_keys': 1,
+ 'ignore_check_constraints': 0})
db.bind(models)
db.create_tables(models)
return db
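
With the pragmas moved into init_db, every caller (crawler, indexer, web frontend) now opens the database in WAL mode with the same page cache; peewee re-applies these pragmas on each new connection. A quick sanity check, assuming a throwaway database path:

from gus.lib.db_model import init_db

# Sketch: verify the pragmas configured in init_db. SQLite reads a
# negative cache_size as KiB, so -128000 is roughly a 125 MiB cache.
db = init_db("/tmp/gus-check.sqlite")  # throwaway path
print(db.execute_sql("PRAGMA journal_mode;").fetchone()[0])  # 'wal'
print(db.execute_sql("PRAGMA cache_size;").fetchone()[0])    # -128000
print(db.execute_sql("PRAGMA foreign_keys;").fetchone()[0])  # 1
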
diff --git a/infra/rebuild_index.sh b/infra/rebuild_index.sh
@@ -1,4 +1,6 @@
+sudo systemctl stop gus
cp -r /home/gus/index /home/gus/index.new
+sudo systemctl start gus
/home/gus/.poetry/bin/poetry run build_index -d
rm -rf /home/gus/index.old
mv /home/gus/index /home/gus/index.old
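
Stopping gus around the cp matters in WAL mode: the most recent writes live in the -wal side file next to the database, so copying the index directory under a live writer could capture an inconsistent snapshot. An alternative (not what this script does) would be to checkpoint the WAL back into the main file before copying:

import sqlite3

# Sketch only: flush the WAL into the main database file so a plain
# file copy is self-contained. The DB filename here is hypothetical.
def checkpoint_wal(db_path: str) -> None:
    con = sqlite3.connect(db_path)
    try:
        con.execute("PRAGMA wal_checkpoint(TRUNCATE);")
    finally:
        con.close()

checkpoint_wal("/home/gus/index/gus.sqlite")  # hypothetical filename
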
diff --git a/serve/models.py b/serve/models.py
@@ -17,9 +17,7 @@ TEXT_CONTENT_TYPE = ["text/plain", "text/gemini", "text/markdown"]
class GUS:
def __init__(self):
self.index = search.Index(constants.INDEX_DIR)
- self.db = init_db(f"{constants.INDEX_DIR}/{constants.DB_FILENAME}", pragmas={
- 'journal_mode': 'wal',
- 'cache_size': -128 * 1000})
+ self.db = init_db(f"{constants.INDEX_DIR}/{constants.DB_FILENAME}")
self.statistics = compute_index_statistics(self.db)
self.statistics_historical_overall = load_all_statistics_from_file(
constants.STATISTICS_FILE)
@@ -159,6 +157,6 @@ def process_seed_request(seed_request):
with open(constants.SEED_REQUEST_FILE, "a") as seed_file:
if seed_request.startswith("Gemini://"):
seed_request = seed_request.replace('G', 'g', 1)
- if not seed_request.startswith("gemini://":
+ if not seed_request.startswith("gemini://"):
seed_request = "gemini://{}".format(seed_request)
seed_file.write("{}\n".format(seed_request))
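
For reference, the normalization logic fixed above, pulled out into a standalone function (illustrative; the real code also appends the result to SEED_REQUEST_FILE):

def normalize_seed(seed_request: str) -> str:
    # Lowercase a leading "Gemini://" scheme, then prepend the scheme
    # when it is missing entirely, as in process_seed_request above.
    if seed_request.startswith("Gemini://"):
        seed_request = seed_request.replace("G", "g", 1)
    if not seed_request.startswith("gemini://"):
        seed_request = "gemini://{}".format(seed_request)
    return seed_request

assert normalize_seed("Gemini://example.org") == "gemini://example.org"
assert normalize_seed("example.org") == "gemini://example.org"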