@@ -1187,6 +1187,124 @@ def test_do_not_crash_on_version_collection_failure():
     assert not c._collect_version(running_apps, [])
 
 
+@pytest.mark.unit
+def test_driver_startup_message_default_retries(aggregator, caplog):
+    """Default behavior (startup_wait_retries=3): retry 3 times then raise."""
+    from simplejson import JSONDecodeError
+
+    check = SparkCheck('spark', {}, [DRIVER_CONFIG])
+    response = MockResponse(content="Spark is starting up. Please wait a while until it's ready.")
+
+    with caplog.at_level(logging.DEBUG):
+        with mock.patch.object(check, '_rest_request', return_value=response):
+            # First 3 attempts should return None (default is 3 retries)
+            for i in range(3):
+                result = check._rest_request_to_json(
+                    DRIVER_CONFIG['spark_url'], SPARK_REST_PATH, SPARK_DRIVER_SERVICE_CHECK, []
+                )
+                assert result is None, f"Attempt {i + 1} should return None"
+
+            # 4th attempt should raise
+            with pytest.raises(JSONDecodeError):
+                check._rest_request_to_json(DRIVER_CONFIG['spark_url'], SPARK_REST_PATH, SPARK_DRIVER_SERVICE_CHECK, [])
+
+    assert 'spark driver not ready yet' in caplog.text.lower()
+    assert 'retries exhausted' in caplog.text.lower()
+
+    aggregator.assert_service_check(
+        SPARK_DRIVER_SERVICE_CHECK,
+        status=SparkCheck.CRITICAL,
+        tags=['url:{}'.format(DRIVER_CONFIG['spark_url'])],
+    )
+
+
+@pytest.mark.unit
+@pytest.mark.parametrize("retries_value", [0, -1, -5])
+def test_driver_startup_message_disabled(aggregator, retries_value):
+    """When startup_wait_retries<=0, treat startup messages as errors immediately."""
+    from simplejson import JSONDecodeError
+
+    config = DRIVER_CONFIG.copy()
+    config['startup_wait_retries'] = retries_value
+    check = SparkCheck('spark', {}, [config])
+    response = MockResponse(content="Spark is starting up. Please wait a while until it's ready.")
+
+    with mock.patch.object(check, '_rest_request', return_value=response):
+        with pytest.raises(JSONDecodeError):
+            check._rest_request_to_json(config['spark_url'], SPARK_REST_PATH, SPARK_DRIVER_SERVICE_CHECK, [])
+
+    aggregator.assert_service_check(
+        SPARK_DRIVER_SERVICE_CHECK,
+        status=SparkCheck.CRITICAL,
+        tags=['url:{}'.format(config['spark_url'])],
+    )
+
+
+@pytest.mark.unit
+def test_driver_startup_message_limited_retries(aggregator, caplog):
+    """When startup_wait_retries>0, retry N times then raise."""
+    from simplejson import JSONDecodeError
+
+    config = DRIVER_CONFIG.copy()
+    config['startup_wait_retries'] = 3
+    check = SparkCheck('spark', {}, [config])
+    response = MockResponse(content="Spark is starting up. Please wait a while until it's ready.")
+
+    with caplog.at_level(logging.DEBUG):
+        with mock.patch.object(check, '_rest_request', return_value=response):
+            # First 3 attempts should return None
+            for i in range(3):
+                result = check._rest_request_to_json(
+                    config['spark_url'], SPARK_REST_PATH, SPARK_DRIVER_SERVICE_CHECK, []
+                )
+                assert result is None, f"Attempt {i + 1} should return None"
+
+            # 4th attempt should raise
+            with pytest.raises(JSONDecodeError):
+                check._rest_request_to_json(config['spark_url'], SPARK_REST_PATH, SPARK_DRIVER_SERVICE_CHECK, [])
+
+    assert 'attempt 1/3' in caplog.text.lower()
+    assert 'attempt 3/3' in caplog.text.lower()
+    assert 'retries exhausted' in caplog.text.lower()
+
+    aggregator.assert_service_check(
+        SPARK_DRIVER_SERVICE_CHECK,
+        status=SparkCheck.CRITICAL,
+        tags=['url:{}'.format(config['spark_url'])],
+    )
+
+
+@pytest.mark.unit
+def test_driver_startup_retry_counter_resets_on_success(caplog):
+    """Verify the retry counter resets after a successful JSON response."""
+    config = DRIVER_CONFIG.copy()
+    config['startup_wait_retries'] = 2
+    check = SparkCheck('spark', {}, [config])
+    startup_response = MockResponse(content="Spark is starting up. Please wait a while until it's ready.")
+    success_response = MockResponse(json_data=[{"id": "app_001", "name": "TestApp"}])
+
+    with caplog.at_level(logging.DEBUG):
+        with mock.patch.object(check, '_rest_request', return_value=startup_response):
+            # A single startup response consumes one retry
+            result = check._rest_request_to_json(config['spark_url'], SPARK_REST_PATH, SPARK_DRIVER_SERVICE_CHECK, [])
+            assert result is None
+            assert check._startup_retry_count == 1
+
+        # A successful response resets the counter
+        with mock.patch.object(check, '_rest_request', return_value=success_response):
+            result = check._rest_request_to_json(config['spark_url'], SPARK_REST_PATH, SPARK_DRIVER_SERVICE_CHECK, [])
+            assert result == [{"id": "app_001", "name": "TestApp"}]
+            assert check._startup_retry_count == 0
+
+        # After the reset, 2 retries should be available again
+        with mock.patch.object(check, '_rest_request', return_value=startup_response):
+            for _ in range(2):
+                result = check._rest_request_to_json(
+                    config['spark_url'], SPARK_REST_PATH, SPARK_DRIVER_SERVICE_CHECK, []
+                )
+                assert result is None
+
+
 @pytest.mark.unit
 def test_ssl(dd_run_check):
     run_ssl_server()
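
For context, here is a minimal sketch of the check-side behavior these tests exercise, assuming the check detects the startup banner in a non-JSON response body, tracks consecutive startup responses in _startup_retry_count, and only re-raises the decode error once startup_wait_retries is exhausted. The class and method names below are hypothetical stand-ins for illustration, not the integration's actual implementation (which the tests reach through SparkCheck._rest_request_to_json).

    import json
    import logging

    STARTUP_MESSAGE = "Spark is starting up."
    log = logging.getLogger(__name__)


    class StartupRetryState:
        # Hypothetical stand-in for the state the tests assert on.

        def __init__(self, startup_wait_retries=3):
            self.startup_wait_retries = startup_wait_retries
            self._startup_retry_count = 0

        def parse_response(self, text):
            # Return parsed JSON, None while the driver is still starting up,
            # or re-raise the decode error once retries are disabled or exhausted.
            try:
                parsed = json.loads(text)
            except ValueError:
                if STARTUP_MESSAGE in text and self.startup_wait_retries > 0:
                    self._startup_retry_count += 1
                    if self._startup_retry_count <= self.startup_wait_retries:
                        log.debug(
                            "Spark driver not ready yet, attempt %d/%d",
                            self._startup_retry_count,
                            self.startup_wait_retries,
                        )
                        return None
                    log.debug("Spark driver not ready yet, retries exhausted")
                raise
            # A successful parse resets the counter, as the reset test expects.
            self._startup_retry_count = 0
            return parsed

With startup_wait_retries=3 this returns None for the first three startup responses and re-raises on the fourth, matching the default and limited-retry tests; with a non-positive setting it re-raises immediately, matching the disabled case. (The sketch uses the stdlib json module, whereas the tests expect simplejson's JSONDecodeError from the real request path.)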