commit c2dd594c459e62612c9cd57824ee031a8a21f3a1
parent 1e63d8b307a42230db0a7e3fe2b2db9abcf2b608
Author: Natalie Pendragon <natpen@natpen.net>
Date: Mon, 2 Nov 2020 08:38:46 -0500
Fix the index build
Diffstat:
3 files changed, 34 insertions(+), 18 deletions(-)
diff --git a/gus/build_index.py b/gus/build_index.py
@@ -102,11 +102,13 @@ def index_page(page, indexed_urls):
should_skip = True
break
if should_skip:
- return
+ logging.debug('URL prefix matches exclusion list, skipping: %s', gus.lib.logging.strip_control_chars(page.url))
+ return False
if page.fetchable_url in indexed_urls:
- return
+ logging.debug('Page already indexed, skipping: %s', gus.lib.logging.strip_control_chars(page.url))
+ return False
- logging.info("Indexing page: %s", page.url)
+ logging.info("Indexing page: %s", gus.lib.logging.strip_control_chars(page.url))
u = page.url.rstrip("/")
external_backlinks = Page.raw("""SELECT p_from.url
@@ -123,6 +125,7 @@ GROUP BY p_from.normalized_url""", u, f"{u}/")
backlink_urls = [b.url for b in external_backlinks.execute()]
backlink_count = len(backlink_urls)
+ logging.info("Indexing page: %s", gus.lib.logging.strip_control_chars(page.url))
document = {
"url_id": page.url,
@@ -139,8 +142,12 @@ GROUP BY p_from.normalized_url""", u, f"{u}/")
"prompt": page.prompt,
"content": page.content,
}
- index_writer.add_document(**document)
-
+ try:
+ index_writer.add_document(**document)
+ return True
+ except Exception:
+ logging.warning("Failed to index page: %s", gus.lib.logging.strip_control_chars(page.url))
+ return False
def load_indexed_urls(index_dir):
indexed_urls = []
@@ -185,22 +192,21 @@ ON p.id == c.page_id
GROUP BY p.normalized_url""")
i = 0
- for page in pages.execute():
- index_page(page, indexed_urls)
- i += 1
- # NOTE(np): Whoosh's index writing doesn't seem to do any
- # intermediate flushing of index segments to disk, which
- # resulted in OOM errors as Geminispace has grown. This bit of
- # code should force it to flush segments to disk every 1000
- # documents, which should scale well with Geminispace going
- # forward.
- if i % 1000 == 0:
+ for page in pages.iterator():
+ was_indexed = index_page(page, indexed_urls)
+ if was_indexed:
+ i += 1
+ # NOTE(np): Whoosh's index writing doesn't do any intermediate
+ # flushing of index segments to disk, which can cause OOM
+ # errors as Geminispace has grown. This bit of code will force
+ # it to flush segments to disk every 5000 documents, which
+ # should scale well with Geminispace going forward.
+ if i % 5000 == 0:
logging.debug('Committing index.')
index_writer.commit()
index_writer = ix.writer()
- if i % 1000 != 0:
- logging.debug('Committing index.')
- index_writer.commit()
+ logging.debug('Committing index for the last time.')
+ index_writer.commit()
index_statistics = compute_index_statistics(db)
log_index_statistics(index_statistics)
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -132,6 +132,12 @@ EXCLUDED_URL_PREFIXES = [
# this is a stream that never ends...
"gemini://gemini.thebackupbox.net/radio",
+ # this page inexplicably breaks both build_index and elpher
+ # when I browse to it... I think it might have some weird encoding
+ # issues in its content or something, but that's a problem for a
+ # different day
+ "gemini://gemini.spam.works/users/dvn/archive/",
+
]
EXCLUDED_URL_PATHS = [
diff --git a/gus/lib/logging.py b/gus/lib/logging.py
@@ -27,3 +27,7 @@ def handle_arguments(args):
elif os.path.isfile('logging.ini'):
logging.config.fileConfig('logging.ini')
+
+
+def strip_control_chars(s):
+ return "".join(i for i in s if 31 < ord(i) < 127)