@@ -83,8 +83,12 @@ def __init__(self, name, init_config, instances):
8383 raise ConfigurationError ('The cluster_name must be specified in the instance configuration' )
8484
8585 self .master_address = self ._get_master_address ()
86+ self ._connection_error_seen = False
87+ self ._debounced_this_run = False
8688
8789 def check (self , _ ):
90+ self ._debounced_this_run = False
91+
8892 tags = list (self .tags )
8993
9094 tags .append ('spark_cluster:%s' % self .cluster_name )
@@ -192,6 +196,8 @@ def _get_running_apps(self):
192196 def _collect_version (self , base_url , tags ):
193197 try :
194198 version_json = self ._rest_request_to_json (base_url , SPARK_VERSION_PATH , SPARK_SERVICE_CHECK , tags )
199+ if version_json is None :
200+ return False
195201 version = version_json ['spark' ]
196202 except Exception as e :
197203 self .log .debug ("Failed to collect version information: %s" , e )
@@ -206,10 +212,18 @@ def _driver_init(self, tags):
206212 """
207213 self ._collect_version (self .master_address , tags )
208214 running_apps = {}
215+
216+ # A request earlier in this check run already hit a debounced connection failure.
217+ # Skip the remaining driver queries so we only retry on the next scheduled run.
218+ if self ._debounced_this_run :
219+ return running_apps
209220 metrics_json = self ._rest_request_to_json (
210221 self .master_address , SPARK_APPS_PATH , SPARK_DRIVER_SERVICE_CHECK , tags
211222 )
212223
224+ if metrics_json is None :
225+ return running_apps
226+
213227 for app_json in metrics_json :
214228 app_id = app_json .get ('id' )
215229 app_name = app_json .get ('name' )
@@ -231,6 +245,9 @@ def _standalone_init(self, pre_20_mode, tags):
231245 self .master_address , SPARK_MASTER_STATE_PATH , SPARK_STANDALONE_SERVICE_CHECK , tags
232246 )
233247
248+ if metrics_json is None :
249+ return {}
250+
234251 running_apps = {}
235252 version_set = False
236253
@@ -251,10 +268,11 @@ def _standalone_init(self, pre_20_mode, tags):
251268 applist = self ._rest_request_to_json (
252269 app_url , SPARK_APPS_PATH , SPARK_STANDALONE_SERVICE_CHECK , tags
253270 )
254- for appl in applist :
255- aid = appl .get ('id' )
256- aname = appl .get ('name' )
257- running_apps [aid ] = (aname , app_url )
271+ if applist :
272+ for appl in applist :
273+ aid = appl .get ('id' )
274+ aname = appl .get ('name' )
275+ running_apps [aid ] = (aname , app_url )
258276 else :
259277 running_apps [app_id ] = (app_name , app_url )
260278 except Exception :
@@ -279,6 +297,9 @@ def _mesos_init(self, tags):
279297
280298 metrics_json = self ._rest_request_to_json (self .master_address , MESOS_MASTER_APP_PATH , MESOS_SERVICE_CHECK , tags )
281299
300+ if metrics_json is None :
301+ return running_apps
302+
282303 if metrics_json .get ('frameworks' ):
283304 for app_json in metrics_json .get ('frameworks' ):
284305 app_id = app_json .get ('id' )
@@ -330,6 +351,9 @@ def _get_standalone_app_url(self, app_id, tags):
330351 self .master_address , SPARK_MASTER_APP_PATH , SPARK_STANDALONE_SERVICE_CHECK , tags , appId = app_id
331352 )
332353
354+ if app_page is None :
355+ return None
356+
333357 dom = BeautifulSoup (app_page .text , 'html.parser' )
334358 app_detail_ui_links = dom .find_all ('a' , string = 'Application Detail UI' )
335359
@@ -352,6 +376,9 @@ def _yarn_get_running_spark_apps(self, tags):
352376 applicationTypes = YARN_APPLICATION_TYPES ,
353377 )
354378
379+ if metrics_json is None :
380+ return {}
381+
355382 running_apps = {}
356383
357384 if metrics_json .get ('apps' ):
@@ -379,6 +406,8 @@ def _get_spark_app_ids(self, running_apps, tags):
379406 if not version_set :
380407 version_set = self ._collect_version (tracking_url , tags )
381408 response = self ._rest_request_to_json (tracking_url , SPARK_APPS_PATH , SPARK_SERVICE_CHECK , tags )
409+ if response is None :
410+ continue
382411 except Exception as e :
383412 self .log .warning ("Exception happened when fetching app ids for %s: %s" , tracking_url , e )
384413 continue
@@ -405,6 +434,8 @@ def _describe_app(self, property, running_apps, addl_tags):
405434 response = self ._rest_request (
406435 base_url , SPARK_APPS_PATH , SPARK_SERVICE_CHECK , addl_tags , app_id , property
407436 )
437+ if response is None :
438+ continue
408439 except HTTPError :
409440 self .log .debug ("Got an error collecting %s" , property , exc_info = True )
410441 continue
@@ -512,6 +543,8 @@ def _spark_structured_streams_metrics(self, running_apps, addl_tags):
512543 response = self ._rest_request_to_json (
513544 base_url , self .metricsservlet_path , SPARK_SERVICE_CHECK , addl_tags
514545 )
546+ if response is None :
547+ continue
515548 self .log .debug ('Structured streaming metrics: %s' , response )
516549 response = {
517550 metric_name : v ['value' ]
@@ -611,6 +644,10 @@ def _rest_request(self, url, object_path, service_name, tags, *args, **kwargs):
611644 self .log .debug ('Spark check URL: %s' , url )
612645 response = self .http .get (url , cookies = self .proxy_redirect_cookies )
613646 response .raise_for_status ()
647+
648+ # Reset connection errors on success
649+ self ._connection_error_seen = False
650+
614651 content = response .text
615652 proxy_redirect_url = self ._parse_proxy_redirect_url (content )
616653 if proxy_redirect_url :
@@ -633,6 +670,9 @@ def _rest_request(self, url, object_path, service_name, tags, *args, **kwargs):
633670 raise
634671
635672 except (HTTPError , InvalidURL , ConnectionError ) as e :
673+ if isinstance (e , ConnectionError ) and self ._should_suppress_connection_error (e , tags ):
674+ return None
675+
636676 self .service_check (
637677 service_name ,
638678 AgentCheck .CRITICAL ,
@@ -654,6 +694,9 @@ def _rest_request_to_json(self, address, object_path, service_name, tags, *args,
654694 """
655695 response = self ._rest_request (address , object_path , service_name , tags , * args , ** kwargs )
656696
697+ if response is None :
698+ return None
699+
657700 try :
658701 response_json = response .json ()
659702
@@ -668,6 +711,42 @@ def _rest_request_to_json(self, address, object_path, service_name, tags, *args,
668711
669712 return response_json
670713
714+ def _should_suppress_connection_error (self , exception , tags ):
715+ """Suppress kubernetes-only connection false positives during pod shutdown."""
716+ pod_phase = self ._get_pod_phase (tags )
717+ if pod_phase is None :
718+ return False
719+
720+ if pod_phase in ('failed' , 'succeeded' , 'unknown' ):
721+ self .log .debug ("Pod phase is terminal, suppressing request error: %s" , exception )
722+ return True
723+
724+ if (
725+ not self ._connection_error_seen
726+ and not self ._debounced_this_run
727+ and ("Connection refused" in str (exception ) or "No route to host" in str (exception ))
728+ ):
729+ self ._connection_error_seen = True
730+ self ._debounced_this_run = True
731+ self .log .warning (
732+ "Connection failed. Suppressing error once to ensure driver is running. Error: %s" ,
733+ exception ,
734+ )
735+ return True
736+
737+ return False
738+
739+ def _is_pod_in_terminal_state (self , tags ):
740+ pod_phase = self ._get_pod_phase (tags )
741+ return pod_phase in ('failed' , 'succeeded' , 'unknown' ) if pod_phase is not None else False
742+
743+ @staticmethod
744+ def _get_pod_phase (tags ):
745+ for tag in tags or []:
746+ if tag .startswith ('pod_phase:' ):
747+ return tag .split (':' , 1 )[1 ].strip ().lower ()
748+ return None
749+
671750 @classmethod
672751 def _join_url_dir (cls , url , * args ):
673752 """
0 commit comments