geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git

commit 8d3cdb64054333f937395b0056617754477dbf8f
parent d71097373e04c47f1364e02153f263476a946104
Author: René Wagner <rwa@clttr.info>
Date:   Sun,  8 May 2022 19:29:48 +0200

switch SQLite to WAL mode

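As context for the diff below: the WAL-related pragmas are no longer passed by each caller of init_db() but become defaults inside gus/lib/db_model.py. A minimal standalone sketch (not part of this commit) of what those Peewee pragmas do; "index.db" is a placeholder for the real {INDEX_DIR}/{DB_FILENAME} path used by GUS:

```
# Illustrative sketch only, assuming peewee is installed; "index.db" is a
# placeholder for the real index database path.
from peewee import SqliteDatabase

db = SqliteDatabase("index.db", pragmas={
    'journal_mode': 'wal',           # readers no longer block the writer
    'cache_size': -128 * 1000,       # negative = KiB, roughly 128 MB of page cache
    'foreign_keys': 1,               # enforce foreign key constraints
    'ignore_check_constraints': 0})  # keep CHECK constraints active

db.connect()
# journal_mode persists in the database file once it has been switched to WAL
print(db.execute_sql("PRAGMA journal_mode;").fetchone())  # ('wal',)
```

With the pragmas applied centrally, build_index.py and serve/models.py can call init_db() without repeating them, as the hunks below show.
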
Diffstat:
M gus/build_index.py     |  4 +---
M gus/crawl.py           | 12 +++++++++++-
M gus/lib/db_model.py    |  6 +++++-
M infra/rebuild_index.sh |  2 ++
M serve/models.py        |  6 ++----
5 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/gus/build_index.py b/gus/build_index.py
@@ -83,9 +83,7 @@ AND l.is_cross_host_like == 1""",
 
 def build_index(should_run_destructive=False):
     index_dir = constants.INDEX_DIR_NEW if should_run_destructive else constants.INDEX_DIR
-    db = init_db(f"{index_dir}/{constants.DB_FILENAME}", pragmas={
-        'journal_mode': 'wal',
-        'cache_size': -128 * 1000})
+    db = init_db(f"{index_dir}/{constants.DB_FILENAME}")
 
     index = search.Index(index_dir, should_run_destructive)
     if (should_run_destructive):
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -531,7 +531,6 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
     pathlib.Path(index_dir).mkdir(parents=True, exist_ok=True)
     global db
     db = init_db(f"{index_dir}/{constants.DB_FILENAME}")
-
     global robot_file_map
     robot_file_map = {}
     global domain_hit_timings
@@ -550,6 +549,17 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
 
     # delete pages with last crawl success older than 30 days which have been recrawled since than
     # this avoids deletion of files that have a change_frequency longer than our timeout
+    #q = Page.select().where((Page.last_crawl_at > Page.last_crawl_success_at) & (Page.last_crawl_success_at < (datetime.now() + timedelta(days=-30))))
+    #try:
+    #    domains = q.execute()
+    #    for del_domain in domains:
+    #        logging.warn("Deleting pages for domain: %s", del_domain.domain)
+    #        # Page.delete().where(Page.domain = domain)
+    #except Exception as e:
+    #    logging.error("Failed to delete domains with outdated successful crawl: %s", e)
+
+    # delete pages with last crawl success older than 30 days which have been recrawled since than
+    # this avoids deletion of files that have a change_frequency longer than our timeout
     q = Page.delete().where((Page.last_crawl_at > Page.last_crawl_success_at) & (Page.last_crawl_success_at < (datetime.now() + timedelta(days=-30))))
     try:
         count = q.execute()
diff --git a/gus/lib/db_model.py b/gus/lib/db_model.py
@@ -18,7 +18,11 @@ def init_db(filename=":memory:"):
     Bind an SQLite database to the Peewee ORM models.
     """
     models = [Link, Page]
-    db = SqliteDatabase(filename)
+    db = SqliteDatabase(filename, pragmas={
+        'journal_mode': 'wal',
+        'cache_size': -128 * 1000,
+        'foreign_keys': 1,
+        'ignore_check_constraints': 0})
     db.bind(models)
     db.create_tables(models)
     return db
diff --git a/infra/rebuild_index.sh b/infra/rebuild_index.sh
@@ -1,4 +1,6 @@
+sudo systemctl stop gus
 cp -r /home/gus/index /home/gus/index.new
+sudo systemctl start gus
 /home/gus/.poetry/bin/poetry run build_index -d
 rm -rf /home/gus/index.old
 mv /home/gus/index /home/gus/index.old
diff --git a/serve/models.py b/serve/models.py
@@ -17,9 +17,7 @@ TEXT_CONTENT_TYPE = ["text/plain", "text/gemini", "text/markdown"]
 class GUS:
     def __init__(self):
         self.index = search.Index(constants.INDEX_DIR)
-        self.db = init_db(f"{constants.INDEX_DIR}/{constants.DB_FILENAME}", pragmas={
-            'journal_mode': 'wal',
-            'cache_size': -128 * 1000})
+        self.db = init_db(f"{constants.INDEX_DIR}/{constants.DB_FILENAME}")
         self.statistics = compute_index_statistics(self.db)
         self.statistics_historical_overall = load_all_statistics_from_file(
             constants.STATISTICS_FILE)
@@ -159,6 +157,6 @@ def process_seed_request(seed_request):
     with open(constants.SEED_REQUEST_FILE, "a") as seed_file:
         if seed_request.startswith("Gemini://"):
             seed_request = seed_request.replace('G', 'g', 1)
-        if not seed_request.startswith("gemini://":
+        if not seed_request.startswith("gemini://"):
             seed_request = "gemini://{}".format(seed_request)
         seed_file.write("{}\n".format(seed_request))
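
For illustration, the seed-request normalisation that the last hunk repairs (the removed line was missing the closing parenthesis) behaves as sketched below; normalise_seed_request is a hypothetical standalone name, the real process_seed_request appends the result to constants.SEED_REQUEST_FILE instead of returning it:

```
# Hypothetical standalone rewrite of the normalisation logic from serve/models.py.
def normalise_seed_request(seed_request: str) -> str:
    # lowercase a leading "Gemini://" scheme
    if seed_request.startswith("Gemini://"):
        seed_request = seed_request.replace('G', 'g', 1)
    # prepend the scheme when it is missing entirely
    if not seed_request.startswith("gemini://"):
        seed_request = "gemini://{}".format(seed_request)
    return seed_request

assert normalise_seed_request("Gemini://example.org") == "gemini://example.org"
assert normalise_seed_request("example.org/foo") == "gemini://example.org/foo"
```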