commit 0b0b33610a729350a379c0ec09ee29fe04e589b4
parent 1dac97f01e9c7260cf2ef84ab4080a4835c86c63
Author: René Wagner <rwagner@rw-net.de>
Date: Mon, 8 Mar 2021 19:21:29 +0100
skip a capsule after 5 consecutive failed requests
This state is reset after the current crawl
closes #16
Diffstat:
2 files changed, 28 insertions(+), 11 deletions(-)
diff --git a/gus/constants.py b/gus/constants.py
@@ -5,6 +5,7 @@ STATISTICS_FILE = "statistics.csv"
DB_FILENAME = "gus.sqlite"
FEED_FILE = "feeds.txt"
MAXIMUM_REDIRECT_CHAIN_LENGTH = 5
+MAXIMUM_FAILED_REQUEST_COUNT = 5
MAXIMUM_TEXT_PAGE_SIZE = 100000 # 100KB, in bytes
# default change frequencies (in hours)
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -49,7 +49,10 @@ EXCLUDED_URL_PREFIXES = [
# serving big files and slooow capsule -> takes too long to crawl
"gemini://kamalatta.ddnss.de/",
-
+
+ # Mastodon proxy
+ "gemini://mastogem.picasoft.net",
+
# ASCII art with emulated modem speed
"gemini://ansi.hrtk.in/",
"gemini://matrix.kiwifarms.net",
@@ -58,6 +61,9 @@ EXCLUDED_URL_PREFIXES = [
"gemini://songs.zachdecook.com/song.gmi.php/",
"gemini://songs.zachdecook.com/chord.svg/",
+ # kwiecien gemcast
+ "gemini://kwiecien.us/gemcast/",
+
# OmarPolo's BSD ports
'gemini://gemini.omarpolo.com/cgi/gempkg',
@@ -82,13 +88,6 @@ EXCLUDED_URL_PREFIXES = [
"gemini://gus.guru/backlinks?",
"gemini://gus.guru/threads",
- "gemini://geminispace.info/search/",
- "gemini://geminispace.info/v/search/",
- "gemini://geminispace.info/search?",
- "gemini://geminispace.info/v/search?",
- "gemini://geminispace.info/add-seed?",
- "gemini://geminispace.info/backlinks?",
- "gemini://geminispace.info/threads",
# Houston
"gemini://houston.coder.town/search?",
"gemini://houston.coder.town/search/",
@@ -131,6 +130,7 @@ EXCLUDED_URL_PREFIXES = [
"gemini://alexschroeder.ch/do/tags",
"gemini://alexschroeder.ch/do/match",
"gemini://alexschroeder.ch/do/search",
+ "gemini://alexschroeder.ch:1965/do/gallery/",
# communitywiki's problematic stuff
"gemini://communitywiki.org:1966/image_external",
@@ -151,6 +151,12 @@ EXCLUDED_URL_PREFIXES = [
"gemini://communitywiki.org:1966/do/match",
"gemini://communitywiki.org:1966/do/search",
+ # mozz mailing list linkscraper
+ "gemini://mozz.us/files/gemini-links.gmi",
+
+ # gemini.techrights.org
+ "gemini://gemini.techrights.org/",
+
# youtube mirror
"gemini://pon.ix.tc/cgi-bin/youtube.cgi?",
@@ -452,6 +458,11 @@ def crawl_page(
):
gr = gemini_resource
url = gr.fetchable_url
+ if gr.normalized_host in failure_count and failure_count[gr.normalized_host] > constants.MAXIMUM_FAILED_REQUEST_COUNT:
+ logging.warn(
+ "Too much failed requests for host, skipping: %s", gus.lib.logging.strip_control_chars(url)
+ )
+ return
if max_crawl_depth >= 0 and current_depth > max_crawl_depth:
logging.warn(
"Going too deep, skipping: %s", gus.lib.logging.strip_control_chars(url)
@@ -541,8 +552,12 @@ def crawl_page(
page=page, status=0, is_different=False, timestamp=datetime.utcnow()
)
page_crawl.save()
+ failure_count[gr.normalized_host] = failure_count[gr.normalized_host] + 1 if gr.normalized_host in failure_count else 1
+ logging.debug("Failed request count for host %s is %d", gr.normalized_host, failure_count[gr.normalized_host])
+ return
- elif response.status.startswith("4"):
+ failure_count[gr.normalized_host] = 0
+ if response.status.startswith("4"):
# temporary error status
logging.debug(
"Got temporary error: %s: %s %s",
@@ -828,15 +843,16 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
global domain_hit_timings
domain_hit_timings = {}
global max_crawl_depth
- max_crawl_depth = 100
+ max_crawl_depth = 500
+ global failure_count
+ failure_count = {}
expired_resources = [GeminiResource(url) for url in load_expired_urls()]
for resource in expired_resources:
crawl_page(resource, 0, should_check_if_expired=False)
submitted_resources = [GeminiResource(url) for url in load_seed_request_urls()]
for resource in submitted_resources:
crawl_page(resource, 0, should_check_if_expired=True)
-
pickle_robot_file_map(robot_file_map, index_dir)
logging.info("Finished!")