commit 72c6ccbf81941ce81541bfbbf5eb8f03ebf77b61
parent c0210d90cf3d599060d58e6972fb6272ea322237
Author: Natalie Pendragon <natpen@natpen.net>
Date: Wed, 28 Oct 2020 06:50:06 -0400
[build_index] Perform prefix-based URL exclusion during index build
Previously this exclusion only happened while performing the crawl,
but for a number of reasons, pages have ended up in the database that
should be excluded from the index. Some are due to user error, others
to the exclusion list growing over time.
The fact that they're still in the database means they are probably
impacting db-based calculations, so longer-term there should probably
be a pruning process to keep the db entries pared down to only what we
care about.
Even after adding such pruning functionality though, I think this
changeset would still be valuable to ensure the index only gets valid
entries.
Diffstat:
1 file changed, 8 insertions(+), 0 deletions(-)
diff --git a/gus/build_index.py b/gus/build_index.py
@@ -29,6 +29,7 @@ from whoosh.query import Every
from whoosh.qparser import QueryParser
from whoosh.writing import BufferedWriter
+from gus.crawl import EXCLUDED_URL_PREFIXES
from gus.lib.db_model import init_db, Page, Link, Crawl
from gus.lib.gemini import GeminiResource, GeminiRobotFileParser
from gus.lib.index_statistics import compute_index_statistics, persist_statistics, print_index_statistics
@@ -113,6 +114,13 @@ def create_index(index_dir):
def index_page(page, indexed_urls):
+ should_skip = False
+ for excluded_prefix in EXCLUDED_URL_PREFIXES:
+ if page.normalized_url.startswith(excluded_prefix):
+ should_skip = True
+ break
+ if should_skip:
+ return
if page.fetchable_url in indexed_urls:
return
print(page.url)