commit 7e5882ef498044f0949403427e3c3de41ebaba65
parent 1a1650dd7e619ef2102e5f445e1d079b06b689de
Author: Natalie Pendragon <natpen@natpen.net>
Date: Mon, 16 Nov 2020 07:49:37 -0500
[build_index] Only index text pages <= 100KB in size
Diffstat:
3 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/gus/build_index.py b/gus/build_index.py
@@ -114,7 +114,9 @@ def build_index(should_run_destructive=False, invalidation_window=0):
FROM indexable_crawl AS c
JOIN page AS p
ON p.id == c.page_id
-GROUP BY p.normalized_url"""
+WHERE p.content_type NOT LIKE 'text/%'
+OR (p.content_type LIKE 'text/%' AND p.size <= ?)
+GROUP BY p.normalized_url""", constants.MAXIMUM_TEXT_PAGE_SIZE
)
for page in pages.iterator():
diff --git a/gus/constants.py b/gus/constants.py
@@ -5,6 +5,7 @@ STATISTICS_FILE = "statistics.csv"
DB_FILENAME = "gus.sqlite"
FEED_FILE = "feeds.txt"
MAXIMUM_REDIRECT_CHAIN_LENGTH = 5
+MAXIMUM_TEXT_PAGE_SIZE = 100000 # 100KB, in bytes
# default change frequencies (in hours)
DEFAULT_ROOT_CHANGE_FREQUENCY = 3
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -302,7 +302,7 @@ def index_content(resource, response):
"port": resource.urlsplit.port or 1965,
"content_type": response.content_type,
"charset": response.charset,
- "content": response.content,
+ "content": response.content if response.num_bytes <= constants.MAXIMUM_TEXT_PAGE_SIZE else None,
"size": response.num_bytes,
"change_frequency": resource.get_default_change_frequency("content"),
}