From 60538a265681cc4cd1288cdc967f2b0ab9aecbe2 Mon Sep 17 00:00:00 2001 From: Andrew Kenworthy Date: Mon, 28 Jul 2025 11:30:55 +0200 Subject: [PATCH 1/4] fix: Spark-connect test timing issues --- .../kuttl/spark-connect/10-assert.yaml | 19 +------------------ .../kuttl/spark-connect/11-assert.yaml | 14 ++++++++++++++ .../kuttl/spark-connect/20-assert.yaml | 2 +- .../20-run-connect-client.yaml.j2 | 2 +- 4 files changed, 17 insertions(+), 20 deletions(-) create mode 100644 tests/templates/kuttl/spark-connect/11-assert.yaml diff --git a/tests/templates/kuttl/spark-connect/10-assert.yaml b/tests/templates/kuttl/spark-connect/10-assert.yaml index 15886034..6e2bd2a2 100644 --- a/tests/templates/kuttl/spark-connect/10-assert.yaml +++ b/tests/templates/kuttl/spark-connect/10-assert.yaml @@ -1,7 +1,7 @@ --- apiVersion: kuttl.dev/v1beta1 kind: TestAssert -timeout: 300 +timeout: 600 --- apiVersion: apps/v1 kind: StatefulSet @@ -23,20 +23,3 @@ metadata: name: spark-connect-server-headless spec: type: ClusterIP ---- -apiVersion: kuttl.dev/v1beta1 -kind: TestAssert -timeout: 300 -commands: - # Test that spark connect executors are running. - # Sleep to prevent the following spark connect app from failing - # while the spark-connect server is busy setting up the executors. - - script: | - # wait for the spark-connect CR to become available - kubectl wait --for=condition=Available sparkconnectservers/spark-connect --namespace "$NAMESPACE" --timeout=3m - - EXECUTOR_COUNT=$(kubectl get pods -n "$NAMESPACE" --selector 'spark-app-name=spark-connect-server' --field-selector='status.phase=Running' -o NAME|wc -l) - test 1 -eq "$EXECUTOR_COUNT" - - # wait a little longer to increase the chance apps being able to connect - sleep 5 diff --git a/tests/templates/kuttl/spark-connect/11-assert.yaml b/tests/templates/kuttl/spark-connect/11-assert.yaml new file mode 100644 index 00000000..395360e5 --- /dev/null +++ b/tests/templates/kuttl/spark-connect/11-assert.yaml @@ -0,0 +1,14 @@ +--- +apiVersion: kuttl.dev/v1beta1 +kind: TestAssert +timeout: 600 +commands: + # Test that spark connect executors are running. + # Sleep to prevent the following spark connect app from failing + # while the spark-connect server is busy setting up the executors. + - script: | + # wait for the spark-connect CR to become available + kubectl wait --for=condition=Ready pod -l spark-app-name=spark-connect-server -n "$NAMESPACE" --timeout=10m + + # wait a little longer to increase the chance apps being able to connect + sleep 10 diff --git a/tests/templates/kuttl/spark-connect/20-assert.yaml b/tests/templates/kuttl/spark-connect/20-assert.yaml index 930e3383..c403a5a9 100644 --- a/tests/templates/kuttl/spark-connect/20-assert.yaml +++ b/tests/templates/kuttl/spark-connect/20-assert.yaml @@ -3,7 +3,7 @@ apiVersion: kuttl.dev/v1beta1 kind: TestAssert metadata: name: simple-connect-app -timeout: 300 +timeout: 600 --- apiVersion: batch/v1 kind: Job diff --git a/tests/templates/kuttl/spark-connect/20-run-connect-client.yaml.j2 b/tests/templates/kuttl/spark-connect/20-run-connect-client.yaml.j2 index e6950b5a..85aff8e7 100644 --- a/tests/templates/kuttl/spark-connect/20-run-connect-client.yaml.j2 +++ b/tests/templates/kuttl/spark-connect/20-run-connect-client.yaml.j2 @@ -54,7 +54,7 @@ spec: template: spec: restartPolicy: OnFailure - activeDeadlineSeconds: 100 + activeDeadlineSeconds: 600 containers: - name: simple-connect-app {% if test_scenario['values']['spark-connect-client'].find(",") > 0 %} From 9ce56192f33dee605623fd55d2188c401f4fcad9 Mon Sep 17 00:00:00 2001 From: Andrew Kenworthy <1712947+adwk67@users.noreply.github.com> Date: Mon, 28 Jul 2025 14:44:47 +0200 Subject: [PATCH 2/4] Update tests/templates/kuttl/spark-connect/20-assert.yaml Co-authored-by: Sebastian Bernauer --- tests/templates/kuttl/spark-connect/20-assert.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/templates/kuttl/spark-connect/20-assert.yaml b/tests/templates/kuttl/spark-connect/20-assert.yaml index c403a5a9..c37bc28e 100644 --- a/tests/templates/kuttl/spark-connect/20-assert.yaml +++ b/tests/templates/kuttl/spark-connect/20-assert.yaml @@ -1,8 +1,6 @@ --- apiVersion: kuttl.dev/v1beta1 kind: TestAssert -metadata: - name: simple-connect-app timeout: 600 --- apiVersion: batch/v1 From f58a52461a6d19b8aedd5a6ac127ed59505b5f12 Mon Sep 17 00:00:00 2001 From: Andrew Kenworthy <1712947+adwk67@users.noreply.github.com> Date: Mon, 28 Jul 2025 14:48:45 +0200 Subject: [PATCH 3/4] Update tests/templates/kuttl/spark-connect/11-assert.yaml I left out the sparkconnectservers check as we check the statefulset, and that waits for the driver. We can keep this check in, but, just for my understanding, do we *need* both? Co-authored-by: Sebastian Bernauer --- tests/templates/kuttl/spark-connect/11-assert.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/templates/kuttl/spark-connect/11-assert.yaml b/tests/templates/kuttl/spark-connect/11-assert.yaml index 395360e5..2a0f4043 100644 --- a/tests/templates/kuttl/spark-connect/11-assert.yaml +++ b/tests/templates/kuttl/spark-connect/11-assert.yaml @@ -8,6 +8,10 @@ commands: # while the spark-connect server is busy setting up the executors. - script: | # wait for the spark-connect CR to become available + kubectl wait --for=condition=Available sparkconnectservers/spark-connect --namespace "$NAMESPACE" --timeout=3m + + # FIXME: As the status currently does not respect the executors state, we wait for them to be ready ourselves + # (see TODO comment in code): kubectl wait --for=condition=Ready pod -l spark-app-name=spark-connect-server -n "$NAMESPACE" --timeout=10m # wait a little longer to increase the chance apps being able to connect From ecdb51bed2cb2365ef69903c553a73ff18e54fdc Mon Sep 17 00:00:00 2001 From: Andrew Kenworthy Date: Mon, 28 Jul 2025 14:59:04 +0200 Subject: [PATCH 4/4] added comment about checking executors as well as driver --- rust/operator-binary/src/connect/controller.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/rust/operator-binary/src/connect/controller.rs b/rust/operator-binary/src/connect/controller.rs index 345bac6d..5544fb39 100644 --- a/rust/operator-binary/src/connect/controller.rs +++ b/rust/operator-binary/src/connect/controller.rs @@ -315,6 +315,12 @@ pub async fn reconcile( let cluster_operation_cond_builder = ClusterOperationsConditionBuilder::new(&scs.spec.cluster_operation); + // TODO: This StatefulSet only contains the driver. We should probably also + // consider the state of the executors to determine if the + // SparkConnectServer is ready. This depends on the availability and + // resilience properties of Spark and could e.g. be "driver and more than + // 75% of the executors ready". Special care needs to be taken about + // auto-scaling executors in this case (if/once supported). let status = SparkConnectServerStatus { conditions: compute_conditions(scs, &[&ss_cond_builder, &cluster_operation_cond_builder]), };