commit c2dd594c459e62612c9cd57824ee031a8a21f3a1
parent 1e63d8b307a42230db0a7e3fe2b2db9abcf2b608
Author: Natalie Pendragon <natpen@natpen.net>
Date: Mon, 2 Nov 2020 08:38:46 -0500
Fix the index build
Diffstat:
3 files changed, 34 insertions(+), 18 deletions(-)
diff --git a/gus/build_index.py b/gus/build_index.py
@@ -102,11 +102,13 @@ def index_page(page, indexed_urls):
should_skip = True
break
if should_skip:
- return
+ logging.debug('URL prefix matches exclusion list, skipping: %s', gus.lib.logging.strip_control_chars(page.url))
+ return False
if page.fetchable_url in indexed_urls:
- return
+ logging.debug('Page already indexed, skipping: %s', gus.lib.logging.strip_control_chars(page.url))
+ return False
- logging.info("Indexing page: %s", page.url)
+ logging.info("Indexing page: %s", gus.lib.logging.strip_control_chars(page.url))
u = page.url.rstrip("/")
external_backlinks = Page.raw("""SELECT p_from.url
@@ -123,6 +125,7 @@ GROUP BY p_from.normalized_url""", u, f"{u}/")
backlink_urls = [b.url for b in external_backlinks.execute()]
backlink_count = len(backlink_urls)
+ logging.info("Indexing page: %s", gus.lib.logging.strip_control_chars(page.url))
document = {
"url_id": page.url,
@@ -139,8 +142,12 @@ GROUP BY p_from.normalized_url""", u, f"{u}/")
"prompt": page.prompt,
"content": page.content,
}
- index_writer.add_document(**document)
-
+ try:
+ index_writer.add_document(**document)
+ return True
+ except Exception:
+ logging.warning("Failed to index page: %s", gus.lib.logging.strip_control_chars(page.url))
+ return False
def load_indexed_urls(index_dir):
indexed_urls = []
@@ -185,22 +192,21 @@ ON p.id == c.page_id
GROUP BY p.normalized_url""")
i = 0
- for page in pages.execute():
- index_page(page, indexed_urls)
- i += 1
- # NOTE(np): Whoosh's index writing doesn't seem to do any
- # intermediate flushing of index segments to disk, which
- # resulted in OOM errors as Geminispace has grown. This bit of
- # code should force it to flush segments to disk every 1000
- # documents, which should scale well with Geminispace going
- # forward.
- if i % 1000 == 0:
+ for page in pages.iterator():
+ was_indexed = index_page(page, indexed_urls)
+ if was_indexed:
+ i += 1
+ # NOTE(np): Whoosh's index writing doesn't do any intermediate
+ # flushing of index segments to disk, which can cause OOM
+ # errors as Geminispace has grown. This bit of code will force
+ # it to flush segments to disk every 5000 documents, which
+ # should scale well with Geminispace going forward.
+ if i % 5000 == 0:
logging.debug('Committing index.')
index_writer.commit()
index_writer = ix.writer()
- if i % 1000 != 0:
- logging.debug('Committing index.')
- index_writer.commit()
+ logging.debug('Committing index for the last time.')
+ index_writer.commit()
index_statistics = compute_index_statistics(db)
log_index_statistics(index_statistics)
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -132,6 +132,12 @@ EXCLUDED_URL_PREFIXES = [
# this is a stream that never ends...
"gemini://gemini.thebackupbox.net/radio",
+ # this page inexplicably breaks both build_index and elpher
+ # when I browse to it... I think it might have some weird encoding
+ # issues in its content or something, but that's a problem for a
+ # different day
+ "gemini://gemini.spam.works/users/dvn/archive/",
+
]
EXCLUDED_URL_PATHS = [
diff --git a/gus/lib/logging.py b/gus/lib/logging.py
@@ -27,3 +27,7 @@ def handle_arguments(args):
elif os.path.isfile('logging.ini'):
logging.config.fileConfig('logging.ini')
+
+
+def strip_control_chars(s):
+ return "".join(i for i in s if 31 < ord(i) < 127)