
Commit 95960be: Address comments

1 parent 8714121 commit 95960be

File tree: 6 files changed (+42, -29 lines)


spark/assets/configuration/spec.yaml

Lines changed: 8 additions & 4 deletions
@@ -137,12 +137,16 @@ files:
         example: true
     - name: startup_wait_retries
       description: |
-        How many times to retry when Spark returns a startup message instead of JSON.
-        Set to -1 to disable, 0 to retry forever, or a positive number to limit retries.
+        Controls how to handle "Spark is starting up" responses from the driver.
+        When the Spark driver is not yet ready, it returns a non-JSON startup message
+        instead of the expected JSON response.
+
+        Set to 0 or less to disable this feature and treat startup messages as errors immediately.
+        Set to a positive integer to retry that many times before marking the check as failed.
       value:
         type: integer
-        display_default: 0
-        example: 0
+        display_default: 3
+        example: 3
   - template: instances/http
     overrides:
       auth_token.description: |

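To make the new option concrete, here is a minimal usage sketch in Python, following the pattern the tests below use to build a check from an instance dict. The instance keys other than `spark_url` and `startup_wait_retries` are illustrative assumptions about a driver-mode configuration and are not part of this commit.

from datadog_checks.spark import SparkCheck

# Illustrative instance; only `spark_url` and `startup_wait_retries` appear in this commit,
# the remaining keys are assumed for a typical driver-mode setup.
instance = {
    'spark_url': 'http://localhost:4040',
    'cluster_name': 'spark_driver',
    'spark_cluster_mode': 'spark_driver_mode',
    # Retry up to 5 times when the driver answers with its startup message.
    # Omit the key to get the new default of 3, or set 0 (or less) to fail immediately.
    'startup_wait_retries': 5,
}

check = SparkCheck('spark', {}, [instance])
print(check._startup_wait_retries)  # 5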
spark/changelog.d/22252.added

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-Handle Spark driver startup message gracefully instead of raising a JSON parse error.
+Add `startup_wait_retries` option to handle Spark driver startup messages gracefully.

spark/datadog_checks/spark/config_models/defaults.py

Lines changed: 1 addition & 1 deletion
@@ -101,7 +101,7 @@ def instance_spark_proxy_enabled():
 
 
 def instance_startup_wait_retries():
-    return 0
+    return 3
 
 
 def instance_streaming_metrics():

spark/datadog_checks/spark/data/conf.yaml.example

Lines changed: 8 additions & 4 deletions
@@ -155,11 +155,15 @@ instances:
     #
     # disable_spark_stage_metrics: true
 
-    ## @param startup_wait_retries - integer - optional - default: 0
-    ## How many times to retry when Spark returns a startup message instead of JSON.
-    ## Set to -1 to disable, 0 to retry forever, or a positive number to limit retries.
+    ## @param startup_wait_retries - integer - optional - default: 3
+    ## Controls how to handle "Spark is starting up" responses from the driver.
+    ## When the Spark driver is not yet ready, it returns a non-JSON startup message
+    ## instead of the expected JSON response.
+    ##
+    ## Set to 0 or less to disable this feature and treat startup messages as errors immediately.
+    ## Set to a positive integer to retry that many times before marking the check as failed.
     #
-    # startup_wait_retries: 0
+    # startup_wait_retries: 3
 
     ## @param proxy - mapping - optional
     ## This overrides the `proxy` setting in `init_config`.

spark/datadog_checks/spark/spark.py

Lines changed: 2 additions & 11 deletions
@@ -88,9 +88,8 @@ def __init__(self, name, init_config, instances):
 
         # Startup retry configuration:
         # -1: disable (treat startup messages as JSON parse errors immediately)
-        # 0: retry forever (always skip startup messages)
         # >0: retry N times before marking as broken
-        self._startup_wait_retries = int(self.instance.get('startup_wait_retries', 0))
+        self._startup_wait_retries = int(self.instance.get('startup_wait_retries', 3))
         self._startup_retry_count = 0
 
     def check(self, _):
@@ -711,15 +710,7 @@ def _rest_request_to_json(self, address, object_path, service_name, tags, *args,
         response_text = response.text.strip()
         if response_text and 'spark is starting up' in response_text.lower():
             # Handle startup message based on retry configuration
-            if self._startup_wait_retries == -1:
-                # Disabled: treat as error immediately
-                pass
-            elif self._startup_wait_retries == 0:
-                # Retry forever
-                self.log.debug("Spark driver not ready yet at %s: %s", self._get_url_base(address), response_text)
-                return None
-            else:
-                # Retry N times before marking as broken
+            if self._startup_wait_retries > 0:
                 self._startup_retry_count += 1
                 if self._startup_retry_count <= self._startup_wait_retries:
                     self.log.debug(

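The second hunk above is cut off right after the `self.log.debug(` call, so the rest of the retry branch (the debug message, the 'retries exhausted' log, and the fall-through to normal JSON parsing) is not visible here. The following self-contained Python sketch approximates the decision flow that the visible code and the updated tests imply; it is not the commit's actual code, and the function name, the `state` dict, and the log messages are placeholders.

import json
import logging

log = logging.getLogger(__name__)


def parse_spark_response(response_text, startup_wait_retries, state):
    # Approximation of the retry handling implied by the hunk above.
    # `state` stands in for the check's `_startup_retry_count` attribute.
    if response_text and 'spark is starting up' in response_text.lower():
        if startup_wait_retries > 0:
            state['retry_count'] += 1
            if state['retry_count'] <= startup_wait_retries:
                # Driver not ready yet: skip this run and try again on the next check.
                log.debug("Spark driver not ready yet: %s", response_text)
                return None
            # Retries exhausted: fall through and let JSON parsing fail.
            log.debug("Spark driver startup retries exhausted")
        # startup_wait_retries <= 0: treat the startup message as an error immediately.
    return json.loads(response_text)  # raises on the non-JSON startup message


state = {'retry_count': 0}
msg = "Spark is starting up. Please wait a while until it's ready."
for attempt in range(1, 5):
    try:
        print(attempt, parse_spark_response(msg, 3, state))  # attempts 1-3 print None
    except json.JSONDecodeError:
        print(attempt, "JSONDecodeError")  # attempt 4 raises; the check would go CRITICAL

With the default of 3, the first three calls skip the run and the fourth raises, which is what the updated `test_driver_startup_message_default_retries` asserts.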
spark/tests/test_spark.py

Lines changed: 22 additions & 8 deletions
@@ -1188,30 +1188,44 @@ def test_do_not_crash_on_version_collection_failure():
 
 
 @pytest.mark.unit
-def test_driver_startup_message_retry_forever(caplog):
-    """Default behavior (startup_wait_retries=0): retry forever, never raise."""
+def test_driver_startup_message_default_retries(aggregator, caplog):
+    """Default behavior (startup_wait_retries=3): retry 3 times then raise."""
+    from simplejson import JSONDecodeError
+
     check = SparkCheck('spark', {}, [DRIVER_CONFIG])
     response = MockResponse(content="Spark is starting up. Please wait a while until it's ready.")
 
     with caplog.at_level(logging.DEBUG):
         with mock.patch.object(check, '_rest_request', return_value=response):
-            # Call multiple times - should always return None
-            for _ in range(5):
+            # First 3 attempts should return None (default is 3 retries)
+            for i in range(3):
                 result = check._rest_request_to_json(
                     DRIVER_CONFIG['spark_url'], SPARK_REST_PATH, SPARK_DRIVER_SERVICE_CHECK, []
                 )
-                assert result is None
+                assert result is None, f"Attempt {i + 1} should return None"
+
+            # 4th attempt should raise
+            with pytest.raises(JSONDecodeError):
+                check._rest_request_to_json(DRIVER_CONFIG['spark_url'], SPARK_REST_PATH, SPARK_DRIVER_SERVICE_CHECK, [])
 
     assert 'spark driver not ready yet' in caplog.text.lower()
+    assert 'retries exhausted' in caplog.text.lower()
+
+    aggregator.assert_service_check(
+        SPARK_DRIVER_SERVICE_CHECK,
+        status=SparkCheck.CRITICAL,
+        tags=['url:{}'.format(DRIVER_CONFIG['spark_url'])],
+    )
 
 
 @pytest.mark.unit
-def test_driver_startup_message_disabled(aggregator):
-    """When startup_wait_retries=-1, treat startup messages as errors immediately."""
+@pytest.mark.parametrize("retries_value", [0, -1, -5])
+def test_driver_startup_message_disabled(aggregator, retries_value):
+    """When startup_wait_retries<=0, treat startup messages as errors immediately."""
     from simplejson import JSONDecodeError
 
     config = DRIVER_CONFIG.copy()
-    config['startup_wait_retries'] = -1
+    config['startup_wait_retries'] = retries_value
     check = SparkCheck('spark', {}, [config])
     response = MockResponse(content="Spark is starting up. Please wait a while until it's ready.")
