commit 8d3cdb64054333f937395b0056617754477dbf8f
parent d71097373e04c47f1364e02153f263476a946104
Author: René Wagner <rwa@clttr.info>
Date: Sun, 8 May 2022 19:29:48 +0200
switch SQLite to WAL mode
Diffstat:
5 files changed, 21 insertions(+), 9 deletions(-)
diff --git a/gus/build_index.py b/gus/build_index.py
@@ -83,9 +83,7 @@ AND l.is_cross_host_like == 1""",
def build_index(should_run_destructive=False):
index_dir = constants.INDEX_DIR_NEW if should_run_destructive else constants.INDEX_DIR
- db = init_db(f"{index_dir}/{constants.DB_FILENAME}", pragmas={
- 'journal_mode': 'wal',
- 'cache_size': -128 * 1000})
+ db = init_db(f"{index_dir}/{constants.DB_FILENAME}")
index = search.Index(index_dir, should_run_destructive)
if (should_run_destructive):
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -531,7 +531,6 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
pathlib.Path(index_dir).mkdir(parents=True, exist_ok=True)
global db
db = init_db(f"{index_dir}/{constants.DB_FILENAME}")
-
global robot_file_map
robot_file_map = {}
global domain_hit_timings
@@ -550,6 +549,17 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
# delete pages with last crawl success older than 30 days which have been recrawled since then
# this avoids deletion of files that have a change_frequency longer than our timeout
+ #q = Page.select().where((Page.last_crawl_at > Page.last_crawl_success_at) & (Page.last_crawl_success_at < (datetime.now() + timedelta(days=-30))))
+ #try:
+ # domains = q.execute()
+ # for del_domain in domains:
+ # logging.warning("Deleting pages for domain: %s", del_domain.domain)
+ # # Page.delete().where(Page.domain == del_domain.domain)
+ #except Exception as e:
+ # logging.error("Failed to delete domains with outdated successful crawl: %s", e)
+
+ # delete pages with last crawl success older than 30 days which have been recrawled since then
+ # this avoids deletion of files that have a change_frequency longer than our timeout
q = Page.delete().where((Page.last_crawl_at > Page.last_crawl_success_at) & (Page.last_crawl_success_at < (datetime.now() + timedelta(days=-30))))
try:
count = q.execute()
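
Note: the delete above removes pages whose last successful crawl is more than 30 days old but which have been crawled again since, so pages with a change_frequency longer than the timeout are not dropped prematurely. A minimal standalone sketch of the same query (the index path is illustrative; Page and init_db come from gus.lib.db_model):

from datetime import datetime, timedelta
from gus.lib.db_model import Page, init_db

# Sketch: identical predicate to the query above, written with an
# explicit cutoff instead of a negative timedelta.
init_db("index/gus.sqlite")  # illustrative path, not the real constant
cutoff = datetime.now() - timedelta(days=30)
deleted = (Page.delete()
           .where((Page.last_crawl_at > Page.last_crawl_success_at)
                  & (Page.last_crawl_success_at < cutoff))
           .execute())  # peewee returns the number of rows deleted
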
diff --git a/gus/lib/db_model.py b/gus/lib/db_model.py
@@ -18,7 +18,11 @@ def init_db(filename=":memory:"):
Bind an SQLite database to the Peewee ORM models.
"""
models = [Link, Page]
- db = SqliteDatabase(filename)
+ db = SqliteDatabase(filename, pragmas={
+ 'journal_mode': 'wal',
+ 'cache_size': -128 * 1000,
+ 'foreign_keys': 1,
+ 'ignore_check_constraints': 0})
db.bind(models)
db.create_tables(models)
return db
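
With the pragmas moved into init_db, every caller (crawler, indexer, web frontend) now opens the database in WAL mode with the same page cache; peewee re-applies these pragmas on each new connection. A quick sanity check, assuming a throwaway database path:

from gus.lib.db_model import init_db

# Sketch: verify the pragmas configured in init_db. SQLite reads a
# negative cache_size as KiB, so -128000 is roughly a 125 MiB cache.
db = init_db("/tmp/gus-check.sqlite")  # throwaway path
print(db.execute_sql("PRAGMA journal_mode;").fetchone()[0])  # 'wal'
print(db.execute_sql("PRAGMA cache_size;").fetchone()[0])    # -128000
print(db.execute_sql("PRAGMA foreign_keys;").fetchone()[0])  # 1
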
diff --git a/infra/rebuild_index.sh b/infra/rebuild_index.sh
@@ -1,4 +1,6 @@
+sudo systemctl stop gus
cp -r /home/gus/index /home/gus/index.new
+sudo systemctl start gus
/home/gus/.poetry/bin/poetry run build_index -d
rm -rf /home/gus/index.old
mv /home/gus/index /home/gus/index.old
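
Stopping gus around the cp matters in WAL mode: the most recent writes live in the -wal side file next to the database, so copying the index directory under a live writer could capture an inconsistent snapshot. An alternative (not what this script does) would be to checkpoint the WAL back into the main file before copying:

import sqlite3

# Sketch only: flush the WAL into the main database file so a plain
# file copy is self-contained. The DB filename here is hypothetical.
def checkpoint_wal(db_path: str) -> None:
    con = sqlite3.connect(db_path)
    try:
        con.execute("PRAGMA wal_checkpoint(TRUNCATE);")
    finally:
        con.close()

checkpoint_wal("/home/gus/index/gus.sqlite")  # hypothetical filename
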
diff --git a/serve/models.py b/serve/models.py
@@ -17,9 +17,7 @@ TEXT_CONTENT_TYPE = ["text/plain", "text/gemini", "text/markdown"]
class GUS:
def __init__(self):
self.index = search.Index(constants.INDEX_DIR)
- self.db = init_db(f"{constants.INDEX_DIR}/{constants.DB_FILENAME}", pragmas={
- 'journal_mode': 'wal',
- 'cache_size': -128 * 1000})
+ self.db = init_db(f"{constants.INDEX_DIR}/{constants.DB_FILENAME}")
self.statistics = compute_index_statistics(self.db)
self.statistics_historical_overall = load_all_statistics_from_file(
constants.STATISTICS_FILE)
@@ -159,6 +157,6 @@ def process_seed_request(seed_request):
with open(constants.SEED_REQUEST_FILE, "a") as seed_file:
if seed_request.startswith("Gemini://"):
seed_request = seed_request.replace('G', 'g', 1)
- if not seed_request.startswith("gemini://":
+ if not seed_request.startswith("gemini://"):
seed_request = "gemini://{}".format(seed_request)
seed_file.write("{}\n".format(seed_request))
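
For reference, the normalization logic fixed above, pulled out into a standalone function (illustrative; the real code also appends the result to SEED_REQUEST_FILE):

def normalize_seed(seed_request: str) -> str:
    # Lowercase a leading "Gemini://" scheme, then prepend the scheme
    # when it is missing entirely, as in process_seed_request above.
    if seed_request.startswith("Gemini://"):
        seed_request = seed_request.replace("G", "g", 1)
    if not seed_request.startswith("gemini://"):
        seed_request = "gemini://{}".format(seed_request)
    return seed_request

assert normalize_seed("Gemini://example.org") == "gemini://example.org"
assert normalize_seed("example.org") == "gemini://example.org"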