geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit 7e5882ef498044f0949403427e3c3de41ebaba65
parent 1a1650dd7e619ef2102e5f445e1d079b06b689de
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Mon, 16 Nov 2020 07:49:37 -0500

[build_index] Only index text pages <= 100KB in size

Diffstat:
M gus/build_index.py | 4 +++-
M gus/constants.py   | 1 +
M gus/crawl.py       | 2 +-
3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/gus/build_index.py b/gus/build_index.py
@@ -114,7 +114,9 @@ def build_index(should_run_destructive=False, invalidation_window=0):
 FROM indexable_crawl AS c
 JOIN page AS p
 ON p.id == c.page_id
-GROUP BY p.normalized_url"""
+WHERE p.content_type NOT LIKE 'text/%'
+OR (p.content_type LIKE 'text/%' AND p.size <= ?)
+GROUP BY p.normalized_url""", constants.MAXIMUM_TEXT_PAGE_SIZE
 )
 
 for page in pages.iterator():
diff --git a/gus/constants.py b/gus/constants.py
@@ -5,6 +5,7 @@ STATISTICS_FILE = "statistics.csv"
 DB_FILENAME = "gus.sqlite"
 FEED_FILE = "feeds.txt"
 MAXIMUM_REDIRECT_CHAIN_LENGTH = 5
+MAXIMUM_TEXT_PAGE_SIZE = 100000 # 100KB, in bytes
 
 # default change frequencies (in hours)
 DEFAULT_ROOT_CHANGE_FREQUENCY = 3
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -302,7 +302,7 @@ def index_content(resource, response):
 "port": resource.urlsplit.port or 1965,
 "content_type": response.content_type,
 "charset": response.charset,
- "content": response.content,
+ "content": response.content if response.num_bytes <= constants.MAXIMUM_TEXT_PAGE_SIZE else None,
 "size": response.num_bytes,
 "change_frequency": resource.get_default_change_frequency("content"),
 }