geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit aa3fdeaefb1f80aa0838c2ea62b8e73f6e832d40
parent 2774ab2b0ec0d9c6b0f444919cd053507d6f4a86
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Sat, 31 Oct 2020 10:06:14 -0400

[build_index] Flush index segments to disk periodically

Diffstat:
M gus/build_index.py | 18 +++++++++++++++---
1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/gus/build_index.py b/gus/build_index.py
@@ -184,11 +184,23 @@
 JOIN page AS p ON p.id == c.page_id
 GROUP BY p.normalized_url""")
 
+    i = 0
     for page in pages.execute():
         index_page(page, indexed_urls)
-
-    logging.debug('Committing index.')
-    index_writer.commit()
+        i += 1
+        # NOTE(np): Whoosh's index writing doesn't seem to do any
+        # intermediate flushing of index segments to disk, which
+        # resulted in OOM errors as Geminispace has grown. This bit of
+        # code should force it to flush segments to disk every 1000
+        # documents, which should scale well with Geminispace going
+        # forward.
+        if i % 1000 == 0:
+            logging.debug('Committing index.')
+            index_writer.commit()
+            index_writer = ix.writer()
+    if i % 1000 != 0:
+        logging.debug('Committing index.')
+        index_writer.commit()
 
     index_statistics = compute_index_statistics(db)
     log_index_statistics(index_statistics)