geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git

commit 0b0b33610a729350a379c0ec09ee29fe04e589b4
parent 1dac97f01e9c7260cf2ef84ab4080a4835c86c63
Author: René Wagner <rwagner@rw-net.de>
Date:   Mon,  8 Mar 2021 19:21:29 +0100

skip a capsule after 5 consecutive failed requests

This state is reset once the current crawl has finished.

closes #16
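
For readers skimming the diff below: the mechanism is a per-host failure counter that is checked before each request, incremented on a failed fetch, cleared by the first successful response, and thrown away between crawl runs. The sketch below is a minimal standalone illustration of that idea; failure_count and MAXIMUM_FAILED_REQUEST_COUNT mirror the names used in the diff, while crawl() and the fetch callable are hypothetical stand-ins for the real crawler code.

```python
# Minimal sketch of the per-host failure tracking this commit introduces.
# failure_count and MAXIMUM_FAILED_REQUEST_COUNT mirror the names in the
# diff below; crawl() and the fetch callable are hypothetical stand-ins
# for the real crawler code.

MAXIMUM_FAILED_REQUEST_COUNT = 5

failure_count = {}  # keyed by normalized host, emptied at the start of each crawl


def crawl(host, url, fetch):
    # Skip hosts that have already failed too often during this crawl.
    if failure_count.get(host, 0) > MAXIMUM_FAILED_REQUEST_COUNT:
        print("Too many failed requests for host, skipping:", url)
        return None

    response = fetch(url)
    if response is None:
        # Remember the failure; after enough of them the host is skipped.
        failure_count[host] = failure_count.get(host, 0) + 1
        return None

    # Any successful response clears the counter for this host.
    failure_count[host] = 0
    return response
```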

Diffstat:
M gus/constants.py | 1+
M gus/crawl.py | 38+++++++++++++++++++++++++++-----------
2 files changed, 28 insertions(+), 11 deletions(-)

diff --git a/gus/constants.py b/gus/constants.py
@@ -5,6 +5,7 @@ STATISTICS_FILE = "statistics.csv"
 DB_FILENAME = "gus.sqlite"
 FEED_FILE = "feeds.txt"
 MAXIMUM_REDIRECT_CHAIN_LENGTH = 5
+MAXIMUM_FAILED_REQUEST_COUNT = 5
 MAXIMUM_TEXT_PAGE_SIZE = 100000 # 100KB, in bytes
 
 # default change frequencies (in hours)
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -49,7 +49,10 @@ EXCLUDED_URL_PREFIXES = [
 
     # serving big files and slooow capsule -> takes to long to crawl
     "gemini://kamalatta.ddnss.de/",
-
+
+    # Mastodon proxy
+    "gemini://mastogem.picasoft.net",
+
     # ASCII art with emulated modem speed
     "gemini://ansi.hrtk.in/",
     "gemini://matrix.kiwifarms.net",
@@ -58,6 +61,9 @@ EXCLUDED_URL_PREFIXES = [
     "gemini://songs.zachdecook.com/song.gmi.php/",
     "gemini://songs.zachdecook.com/chord.svg/",
 
+    # kwiecien gemcast
+    "gemini://kwiecien.us/gemcast/",
+
     # OmarPolos BSD ports
     'gemini://gemini.omarpolo.com/cgi/gempkg',
 
@@ -82,13 +88,6 @@ EXCLUDED_URL_PREFIXES = [
     "gemini://gus.guru/backlinks?",
     "gemini://gus.guru/threads",
-    "gemini://geminispace.info/search/",
-    "gemini://geminispace.info/v/search/",
-    "gemini://geminispace.info/search?",
-    "gemini://geminispace.info/v/search?",
-    "gemini://geminispace.info/add-seed?",
-    "gemini://geminispace.info/backlinks?",
-    "gemini://geminispace.info/threads",
 
     # Houston
     "gemini://houston.coder.town/search?",
     "gemini://houston.coder.town/search/",
@@ -131,6 +130,7 @@ EXCLUDED_URL_PREFIXES = [
     "gemini://alexschroeder.ch/do/tags",
     "gemini://alexschroeder.ch/do/match",
     "gemini://alexschroeder.ch/do/search",
+    "gemini://alexschroeder.ch:1965/do/gallery/",
 
     # communitywiki's problematic stuff
     "gemini://communitywiki.org:1966/image_external",
@@ -151,6 +151,12 @@ EXCLUDED_URL_PREFIXES = [
     "gemini://communitywiki.org:1966/do/match",
     "gemini://communitywiki.org:1966/do/search",
 
+    # mozz mailing list linkscraper
+    "gemini://mozz.us/files/gemini-links.gmi",
+
+    # gemini.techrights.org
+    "gemini://gemini.techrights.org/",
+
     # youtube mirror
     "gemini://pon.ix.tc/cgi-bin/youtube.cgi?",
 
@@ -452,6 +458,11 @@ def crawl_page(
 ):
     gr = gemini_resource
     url = gr.fetchable_url
+    if gr.normalized_host in failure_count and failure_count[gr.normalized_host] > constants.MAXIMUM_FAILED_REQUEST_COUNT:
+        logging.warn(
+            "Too much failed requests for host, skipping: %s", gus.lib.logging.strip_control_chars(url)
+        )
+        return
     if max_crawl_depth >= 0 and current_depth > max_crawl_depth:
         logging.warn(
             "Going too deep, skipping: %s", gus.lib.logging.strip_control_chars(url)
@@ -541,8 +552,12 @@ def crawl_page(
             page=page, status=0, is_different=False, timestamp=datetime.utcnow()
         )
         page_crawl.save()
+        failure_count[gr.normalized_host] = failure_count[gr.normalized_host] + 1 if gr.normalized_host in failure_count else 1
+        logging.debug("Failed request count for host %s is %d", gr.normalized_host, failure_count[gr.normalized_host])
+        return
 
-    elif response.status.startswith("4"):
+    failure_count[gr.normalized_host] = 0
+    if response.status.startswith("4"):
         # temporary error status
         logging.debug(
             "Got temporary error: %s: %s %s",
@@ -828,15 +843,16 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
     global domain_hit_timings
     domain_hit_timings = {}
     global max_crawl_depth
-    max_crawl_depth = 100
+    max_crawl_depth = 500
+    global failure_count
+    failure_count = {}
 
     expired_resources = [GeminiResource(url) for url in load_expired_urls()]
     for resource in expired_resources:
         crawl_page(resource, 0, should_check_if_expired=False)
 
     submitted_resources = [GeminiResource(url) for url in load_seed_request_urls()]
     for resource in submitted_resources:
         crawl_page(resource, 0, should_check_if_expired=True)
 
-    pickle_robot_file_map(robot_file_map, index_dir)
     logging.info("Finished!")