Skip to content

Commit 0de1249

Browse files
adwk67 and sbernauer authored
fix: Spark-connect test timing issues (#597)
fix: Spark-connect test timing issues (#597)

* fix: Spark-connect test timing issues

* Update tests/templates/kuttl/spark-connect/20-assert.yaml

  Co-authored-by: Sebastian Bernauer <[email protected]>

* Update tests/templates/kuttl/spark-connect/11-assert.yaml

  I left out the sparkconnectservers check as we check the statefulset, and that waits for the driver. We can keep this check in, but, just for my understanding, do we *need* both?

  Co-authored-by: Sebastian Bernauer <[email protected]>

* added comment about checking executors as well as driver

---------

Co-authored-by: Sebastian Bernauer <[email protected]>
1 parent f8be252 commit 0de1249

File tree

5 files changed

+27
-22
lines changed

5 files changed

+27
-22
lines changed

rust/operator-binary/src/connect/controller.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -315,6 +315,12 @@ pub async fn reconcile(
315315
let cluster_operation_cond_builder =
316316
ClusterOperationsConditionBuilder::new(&scs.spec.cluster_operation);
317317

318+
// TODO: This StatefulSet only contains the driver. We should probably also
319+
// consider the state of the executors to determine if the
320+
// SparkConnectServer is ready. This depends on the availability and
321+
// resilience properties of Spark and could e.g. be "driver and more than
322+
// 75% of the executors ready". Special care needs to be taken about
323+
// auto-scaling executors in this case (if/once supported).
318324
let status = SparkConnectServerStatus {
319325
conditions: compute_conditions(scs, &[&ss_cond_builder, &cluster_operation_cond_builder]),
320326
};
Lines changed: 1 addition & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
---
22
apiVersion: kuttl.dev/v1beta1
33
kind: TestAssert
4-
timeout: 300
4+
timeout: 600
55
---
66
apiVersion: apps/v1
77
kind: StatefulSet
@@ -23,20 +23,3 @@ metadata:
2323
name: spark-connect-server-headless
2424
spec:
2525
type: ClusterIP
26-
---
27-
apiVersion: kuttl.dev/v1beta1
28-
kind: TestAssert
29-
timeout: 300
30-
commands:
31-
# Test that spark connect executors are running.
32-
# Sleep to prevent the following spark connect app from failing
33-
# while the spark-connect server is busy setting up the executors.
34-
- script: |
35-
# wait for the spark-connect CR to become available
36-
kubectl wait --for=condition=Available sparkconnectservers/spark-connect --namespace "$NAMESPACE" --timeout=3m
37-
38-
EXECUTOR_COUNT=$(kubectl get pods -n "$NAMESPACE" --selector 'spark-app-name=spark-connect-server' --field-selector='status.phase=Running' -o NAME|wc -l)
39-
test 1 -eq "$EXECUTOR_COUNT"
40-
41-
# wait a little longer to increase the chance apps being able to connect
42-
sleep 5
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
---
2+
apiVersion: kuttl.dev/v1beta1
3+
kind: TestAssert
4+
timeout: 600
5+
commands:
6+
# Test that spark connect executors are running.
7+
# Sleep to prevent the following spark connect app from failing
8+
# while the spark-connect server is busy setting up the executors.
9+
- script: |
10+
# wait for the spark-connect CR to become available
11+
kubectl wait --for=condition=Available sparkconnectservers/spark-connect --namespace "$NAMESPACE" --timeout=3m
12+
13+
# FIXME: As the status currently does not respect the executors state, we wait for them to be ready ourselves
14+
# (see TODO comment in code):
15+
kubectl wait --for=condition=Ready pod -l spark-app-name=spark-connect-server -n "$NAMESPACE" --timeout=10m
16+
17+
# wait a little longer to increase the chance apps being able to connect
18+
sleep 10

tests/templates/kuttl/spark-connect/20-assert.yaml

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,7 @@
11
---
22
apiVersion: kuttl.dev/v1beta1
33
kind: TestAssert
4-
metadata:
5-
name: simple-connect-app
6-
timeout: 300
4+
timeout: 600
75
---
86
apiVersion: batch/v1
97
kind: Job

tests/templates/kuttl/spark-connect/20-run-connect-client.yaml.j2

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ spec:
5454
template:
5555
spec:
5656
restartPolicy: OnFailure
57-
activeDeadlineSeconds: 100
57+
activeDeadlineSeconds: 600
5858
containers:
5959
- name: simple-connect-app
6060
{% if test_scenario['values']['spark-connect-client'].find(",") > 0 %}

0 commit comments

Comments
 (0)