commit aa3fdeaefb1f80aa0838c2ea62b8e73f6e832d40
parent 2774ab2b0ec0d9c6b0f444919cd053507d6f4a86
Author: Natalie Pendragon <natpen@natpen.net>
Date: Sat, 31 Oct 2020 10:06:14 -0400

[build_index] Flush index segments to disk periodically

Diffstat:
M gus/build_index.py | 18 +++++++++++++++---
1 file changed, 15 insertions(+), 3 deletions(-)
diff --git a/gus/build_index.py b/gus/build_index.py
@@ -184,11 +184,23 @@
 JOIN page AS p ON p.id == c.page_id
 GROUP BY p.normalized_url""")
 
+    i = 0
     for page in pages.execute():
         index_page(page, indexed_urls)
-
-    logging.debug('Committing index.')
-    index_writer.commit()
+        i += 1
+        # NOTE(np): Whoosh's index writing doesn't seem to do any
+        # intermediate flushing of index segments to disk, which
+        # resulted in OOM errors as Geminispace has grown. This bit of
+        # code should force it to flush segments to disk every 1000
+        # documents, which should scale well with Geminispace going
+        # forward.
+        if i % 1000 == 0:
+            logging.debug('Committing index.')
+            index_writer.commit()
+            index_writer = ix.writer()
+    if i % 1000 != 0:
+        logging.debug('Committing index.')
+        index_writer.commit()
 
     index_statistics = compute_index_statistics(db)
     log_index_statistics(index_statistics)