Skip to content

Commit 017e58f

Browse files
authored
[Feature] Support configurable RayCluster deletion delay in RayService (#3864)
Signed-off-by: machichima <[email protected]>
1 parent db42cc5 commit 017e58f

File tree

10 files changed

+135
-2
lines changed

10 files changed

+135
-2
lines changed

docs/reference/api.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,7 @@ _Appears in:_
294294

295295
| Field | Description | Default | Validation |
296296
| --- | --- | --- | --- |
297+
| `rayClusterDeletionDelaySeconds` _integer_ | RayClusterDeletionDelaySeconds specifies the delay, in seconds, before deleting old RayClusters.<br />The default value is 60 seconds. | | Minimum: 0 <br /> |
297298
| `serviceUnhealthySecondThreshold` _integer_ | Deprecated: This field is not used anymore. ref: https://github.com/ray-project/kuberay/issues/1685 | | |
298299
| `deploymentUnhealthySecondThreshold` _integer_ | Deprecated: This field is not used anymore. ref: https://github.com/ray-project/kuberay/issues/1685 | | |
299300
| `serveService` _[Service](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#service-v1-core)_ | ServeService is the Kubernetes service for head node and worker nodes who have healthy http proxy to serve traffics. | | |

helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml

Lines changed: 4 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ray-operator/apis/ray/v1/rayservice_types.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,11 @@ type RayServiceUpgradeStrategy struct {
6565

6666
// RayServiceSpec defines the desired state of RayService
6767
type RayServiceSpec struct {
68+
// RayClusterDeletionDelaySeconds specifies the delay, in seconds, before deleting old RayClusters.
69+
// The default value is 60 seconds.
70+
// +kubebuilder:validation:Minimum=0
71+
// +optional
72+
RayClusterDeletionDelaySeconds *int32 `json:"rayClusterDeletionDelaySeconds,omitempty"`
6873
// Deprecated: This field is not used anymore. ref: https://github.com/ray-project/kuberay/issues/1685
6974
// +optional
7075
ServiceUnhealthySecondThreshold *int32 `json:"serviceUnhealthySecondThreshold,omitempty"`

ray-operator/apis/ray/v1/zz_generated.deepcopy.go

Lines changed: 5 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ray-operator/config/crd/bases/ray.io_rayservices.yaml

Lines changed: 4 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ray-operator/controllers/ray/rayservice_controller.go

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -471,16 +471,22 @@ func (r *RayServiceReconciler) cleanUpRayClusterInstance(ctx context.Context, ra
471471
return err
472472
}
473473

474-
// Clean up RayCluster instances. Each instance is deleted 60 seconds
474+
// Determine the ray cluster deletion delay seconds
475+
deletionDelay := RayClusterDeletionDelayDuration
476+
if rayServiceInstance.Spec.RayClusterDeletionDelaySeconds != nil {
477+
deletionDelay = time.Duration(*rayServiceInstance.Spec.RayClusterDeletionDelaySeconds) * time.Second
478+
}
479+
// Clean up RayCluster instances. Each instance is deleted after the configured deletion delay.
475480
for _, rayClusterInstance := range rayClusterList.Items {
476481
if rayClusterInstance.Name != rayServiceInstance.Status.ActiveServiceStatus.RayClusterName && rayClusterInstance.Name != rayServiceInstance.Status.PendingServiceStatus.RayClusterName {
477482
cachedTimestamp, exists := r.RayClusterDeletionTimestamps.Get(rayClusterInstance.Name)
478483
if !exists {
479-
deletionTimestamp := metav1.Now().Add(RayClusterDeletionDelayDuration)
484+
deletionTimestamp := metav1.Now().Add(deletionDelay)
480485
r.RayClusterDeletionTimestamps.Set(rayClusterInstance.Name, deletionTimestamp)
481486
logger.Info(
482487
"Scheduled dangling RayCluster for deletion",
483488
"rayClusterName", rayClusterInstance.Name,
489+
"deletionDelay", deletionDelay.String(),
484490
"deletionTimestamp", deletionTimestamp,
485491
)
486492
} else {

ray-operator/controllers/ray/rayservice_controller_unit_test.go

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,9 @@ import (
77
"reflect"
88
"strconv"
99
"testing"
10+
"time"
1011

12+
cmap "github.com/orcaman/concurrent-map/v2"
1113
"github.com/stretchr/testify/assert"
1214
"github.com/stretchr/testify/require"
1315
corev1 "k8s.io/api/core/v1"
@@ -1232,3 +1234,86 @@ func TestIsZeroDowntimeUpgradeEnabled(t *testing.T) {
12321234
})
12331235
}
12341236
}
1237+
1238+
func TestRayClusterDeletionDelaySeconds(t *testing.T) {
1239+
namespace := "test-namespace"
1240+
rayClusterName := "test-cluster"
1241+
rayServiceName := "test-rayservice"
1242+
1243+
// Helper to create a RayService with optional RayClusterDeletionDelaySeconds
1244+
createRayService := func(delaySeconds *int32) *rayv1.RayService {
1245+
return &rayv1.RayService{
1246+
ObjectMeta: metav1.ObjectMeta{
1247+
Name: rayServiceName,
1248+
Namespace: namespace,
1249+
},
1250+
Spec: rayv1.RayServiceSpec{
1251+
RayClusterDeletionDelaySeconds: delaySeconds,
1252+
},
1253+
}
1254+
}
1255+
1256+
rayCluster := rayv1.RayCluster{
1257+
ObjectMeta: metav1.ObjectMeta{
1258+
Name: rayClusterName,
1259+
Namespace: namespace,
1260+
Labels: map[string]string{
1261+
utils.RayOriginatedFromCRNameLabelKey: rayServiceName,
1262+
utils.RayOriginatedFromCRDLabelKey: utils.RayOriginatedFromCRDLabelValue(utils.RayServiceCRD),
1263+
},
1264+
},
1265+
}
1266+
1267+
tests := []struct {
1268+
delaySeconds *int32
1269+
name string
1270+
expectedDuration time.Duration
1271+
}{
1272+
{
1273+
name: "Use default delay when not set",
1274+
delaySeconds: nil,
1275+
expectedDuration: RayClusterDeletionDelayDuration,
1276+
},
1277+
{
1278+
name: "Use custom delay when set to 0",
1279+
delaySeconds: ptr.To[int32](0),
1280+
expectedDuration: 0 * time.Second,
1281+
},
1282+
{
1283+
name: "Use custom delay when set to positive",
1284+
delaySeconds: ptr.To[int32](5),
1285+
expectedDuration: 5 * time.Second,
1286+
},
1287+
}
1288+
1289+
for _, tc := range tests {
1290+
t.Run(tc.name, func(t *testing.T) {
1291+
newScheme := runtime.NewScheme()
1292+
_ = rayv1.AddToScheme(newScheme)
1293+
ctx := context.TODO()
1294+
1295+
rayService := createRayService(tc.delaySeconds)
1296+
1297+
// Initialize a fake client with newScheme and runtimeObjects.
1298+
runtimeObjects := []runtime.Object{rayService, &rayCluster}
1299+
fakeClient := clientFake.NewClientBuilder().WithScheme(newScheme).WithRuntimeObjects(runtimeObjects...).Build()
1300+
r := RayServiceReconciler{
1301+
Client: fakeClient,
1302+
Scheme: newScheme,
1303+
Recorder: record.NewFakeRecorder(1),
1304+
RayClusterDeletionTimestamps: cmap.New[time.Time](),
1305+
}
1306+
1307+
now := time.Now()
1308+
err := r.cleanUpRayClusterInstance(ctx, rayService)
1309+
require.NoError(t, err)
1310+
1311+
// Check that the deletion timestamp is set and equals to the expected value
1312+
ts, exists := r.RayClusterDeletionTimestamps.Get(rayClusterName)
1313+
assert.True(t, exists, "Deletion timestamp should be set for the cluster")
1314+
expectedTs := now.Add(tc.expectedDuration)
1315+
1316+
assert.InDelta(t, expectedTs.Unix(), ts.Unix(), 1, "Deletion timestamp should be within 1 second of expected timestamp")
1317+
})
1318+
}
1319+
}

ray-operator/controllers/ray/utils/validation.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,5 +256,11 @@ func ValidateRayServiceSpec(rayService *rayv1.RayService) error {
256256
*rayService.Spec.UpgradeStrategy.Type != rayv1.NewCluster {
257257
return fmt.Errorf("Spec.UpgradeStrategy.Type value %s is invalid, valid options are %s or %s", *rayService.Spec.UpgradeStrategy.Type, rayv1.NewCluster, rayv1.None)
258258
}
259+
260+
if rayService.Spec.RayClusterDeletionDelaySeconds != nil &&
261+
*rayService.Spec.RayClusterDeletionDelaySeconds < 0 {
262+
return fmt.Errorf("Spec.RayClusterDeletionDelaySeconds should be a non-negative integer, got %d", *rayService.Spec.RayClusterDeletionDelaySeconds)
263+
}
264+
259265
return nil
260266
}

ray-operator/controllers/ray/utils/validation_test.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1084,6 +1084,14 @@ func TestValidateRayServiceSpec(t *testing.T) {
10841084
spec: rayv1.RayServiceSpec{},
10851085
expectError: true,
10861086
},
1087+
{
1088+
name: "Spec.RayClusterDeletionDelaySeconds is negative",
1089+
spec: rayv1.RayServiceSpec{
1090+
RayClusterSpec: *createBasicRayClusterSpec(),
1091+
RayClusterDeletionDelaySeconds: ptr.To[int32](-1),
1092+
},
1093+
expectError: true,
1094+
},
10871095
}
10881096

10891097
for _, tt := range tests {

ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicespec.go

Lines changed: 9 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)