geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git

commit c2dd594c459e62612c9cd57824ee031a8a21f3a1
parent 1e63d8b307a42230db0a7e3fe2b2db9abcf2b608
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Mon,  2 Nov 2020 08:38:46 -0500

Fix the index build

Diffstat:
M gus/build_index.py | 42 ++++++++++++++++++++++++------------------
M gus/crawl.py       |  6 ++++++
M gus/lib/logging.py |  4 ++++
3 files changed, 34 insertions(+), 18 deletions(-)

diff --git a/gus/build_index.py b/gus/build_index.py
@@ -102,11 +102,13 @@ def index_page(page, indexed_urls):
             should_skip = True
             break
     if should_skip:
-        return
+        logging.debug('URL prefix matches exclusion list, skipping: %s', gus.lib.logging.strip_control_chars(page.url))
+        return False
     if page.fetchable_url in indexed_urls:
-        return
+        logging.debug('Page already indexed, skipping: %s', gus.lib.logging.strip_control_chars(page.url))
+        return False
 
-    logging.info("Indexing page: %s", page.url)
+    logging.info("Indexing page: %s", gus.lib.logging.strip_control_chars(page.url))
 
     u = page.url.rstrip("/")
     external_backlinks = Page.raw("""SELECT p_from.url
@@ -123,6 +125,7 @@ GROUP BY p_from.normalized_url""", u, f"{u}/")
     backlink_urls = [b.url for b in external_backlinks.execute()]
     backlink_count = len(backlink_urls)
+    logging.info("Indexing page: %s", logging.strip_control_chars(page.url))
 
     document = {
         "url_id": page.url,
@@ -139,8 +142,12 @@ GROUP BY p_from.normalized_url""", u, f"{u}/")
         "prompt": page.prompt,
        "content": page.content,
     }
-    index_writer.add_document(**document)
-
+    try:
+        index_writer.add_document(**document)
+        return True
+    except:
+        logging.warn("Failed to index page: %s", gus.lib.logging.strip_control_chars(page.url))
+        return False
 
 def load_indexed_urls(index_dir):
     indexed_urls = []
@@ -185,22 +192,21 @@ ON p.id == c.page_id
 GROUP BY p.normalized_url""")
     i = 0
-    for page in pages.execute():
-        index_page(page, indexed_urls)
-        i += 1
-        # NOTE(np): Whoosh's index writing doesn't seem to do any
-        # intermediate flushing of index segments to disk, which
-        # resulted in OOM errors as Geminispace has grown. This bit of
-        # code should force it to flush segments to disk every 1000
-        # documents, which should scale well with Geminispace going
-        # forward.
-        if i % 1000 == 0:
+    for page in pages.iterator():
+        was_indexed = index_page(page, indexed_urls)
+        if was_indexed:
+            i += 1
+        # NOTE(np): Whoosh's index writing doesn't do any intermediate
+        # flushing of index segments to disk, which can cause OOM
+        # errors as Geminispace has grown. This bit of code will force
+        # it to flush segments to disk every 5000 documents, which
+        # should scale well with Geminispace going forward.
+        if i % 5000 == 0:
             logging.debug('Committing index.')
             index_writer.commit()
             index_writer = ix.writer()
-    if i % 1000 != 0:
-        logging.debug('Committing index.')
-        index_writer.commit()
+    logging.debug('Committing index for the last time.')
+    index_writer.commit()
 
     index_statistics = compute_index_statistics(db)
     log_index_statistics(index_statistics)
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -132,6 +132,12 @@ EXCLUDED_URL_PREFIXES = [
     # this is a stream that never ends...
     "gemini://gemini.thebackupbox.net/radio",
 
+    # this page inexplicably breaks both build_index, as well as elpher
+    # when I browse to it... I think it might have some weird encoding
+    # issues in its content or something, but that's a problem for a
+    # different day
+    "gemini://gemini.spam.works/users/dvn/archive/",
+
 ]
 
 EXCLUDED_URL_PATHS = [
diff --git a/gus/lib/logging.py b/gus/lib/logging.py
@@ -27,3 +27,7 @@ def handle_arguments(args):
     elif os.path.isfile('logging.ini'):
         logging.config.fileConfig('logging.ini')
+
+
+def strip_control_chars(s):
+    return "".join(i for i in s if 31 < ord(i) < 127)
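
The batched-commit loop in build_index.py is the heart of the fix: Whoosh buffers pending documents in memory until commit(), so a single writer over the whole crawl grows without bound. Below is a minimal self-contained sketch of that pattern, assuming an existing Whoosh index in index_dir; index_page() here is a stand-in with made-up document fields, not GUS's real schema or helpers.

    import logging

    from whoosh import index

    def index_page(index_writer, page):
        # Stand-in for the patched function above: report whether the
        # document actually made it into the writer.
        try:
            index_writer.add_document(url_id=page["url"], content=page["content"])
            return True
        except Exception:
            logging.warning("Failed to index page: %s", page["url"])
            return False

    def build_index(index_dir, pages):
        ix = index.open_dir(index_dir)   # open the existing on-disk index
        index_writer = ix.writer()
        i = 0
        for page in pages:
            if index_page(index_writer, page):
                i += 1
                # Flush a segment to disk every 5000 indexed documents
                # and start a fresh writer for the next batch, keeping
                # memory use bounded as the crawl grows.
                if i % 5000 == 0:
                    index_writer.commit()
                    index_writer = ix.writer()
        # one last commit for whatever the final batch left pending
        index_writer.commit()

Counting only successfully indexed pages is the detail the was_indexed change adds: skipped and failed pages no longer advance the batch counter, so each commit really does cover roughly 5000 new documents.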
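
The new strip_control_chars() helper keeps only printable ASCII (code points 32 through 126), so besides control characters like ESC and NUL it also drops anything non-ASCII. A quick illustration with hypothetical URLs:

    >>> strip_control_chars("gemini://example.org/\x1b[31mpage\x00")
    'gemini://example.org/[31mpage'
    >>> strip_control_chars("gemini://café.example/")
    'gemini://caf.example/'

Since the patched logging calls pass page URLs through this helper first, a crawled URL that embeds terminal escape sequences can no longer smuggle them into the log output.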