@@ -21,9 +21,8 @@ import (
21
21
"time"
22
22
23
23
"github.com/ray-project/kuberay/ray-operator/controllers/ray/common"
24
- "github.com/ray-project/kuberay/ray-operator/test/support"
25
-
26
24
"github.com/ray-project/kuberay/ray-operator/controllers/ray/utils"
25
+ "github.com/ray-project/kuberay/ray-operator/test/support"
27
26
28
27
. "github.com/onsi/ginkgo/v2"
29
28
. "github.com/onsi/gomega"
@@ -233,6 +232,84 @@ var _ = Context("RayService env tests", func() {
233
232
})
234
233
})
235
234
235
+ Describe ("Autoscaler updates RayCluster should not trigger zero downtime upgrade" , Ordered , func () {
236
+ // If Autoscaler scales up the pending or active RayCluster, zero downtime upgrade should not be triggered.
237
+ ctx := context .Background ()
238
+ namespace := "default"
239
+ serveAppName := "app1"
240
+ rayService := rayServiceTemplate ("test-autoscaler" , namespace , serveAppName )
241
+ rayCluster := & rayv1.RayCluster {}
242
+
243
+ It ("Create a RayService custom resource" , func () {
244
+ err := k8sClient .Create (ctx , rayService )
245
+ Expect (err ).NotTo (HaveOccurred (), "failed to create RayService resource" )
246
+ Eventually (
247
+ getResourceFunc (ctx , client.ObjectKey {Name : rayService .Name , Namespace : namespace }, rayService ),
248
+ time .Second * 3 , time .Millisecond * 500 ).Should (BeNil (), "RayService: %v" , rayService .Name )
249
+ })
250
+
251
+ It ("Should create a pending RayCluster" , func () {
252
+ Eventually (
253
+ getPreparingRayClusterNameFunc (ctx , rayService ),
254
+ time .Second * 15 , time .Millisecond * 500 ).Should (Not (BeEmpty ()), "Pending RayCluster name: %v" , rayService .Status .PendingServiceStatus .RayClusterName )
255
+ })
256
+
257
+ It ("Autoscaler updates the pending RayCluster and should not switch to a new RayCluster" , func () {
258
+ // Simulate autoscaler by updating the pending RayCluster directly. Note that the autoscaler
259
+ // will not update the RayService directly.
260
+ clusterName , _ := getPreparingRayClusterNameFunc (ctx , rayService )()
261
+ err := retry .RetryOnConflict (retry .DefaultRetry , func () error {
262
+ Eventually (
263
+ getResourceFunc (ctx , client.ObjectKey {Name : clusterName , Namespace : namespace }, rayCluster ),
264
+ time .Second * 3 , time .Millisecond * 500 ).Should (BeNil (), "Pending RayCluster: %v" , rayCluster .Name )
265
+ * rayCluster .Spec .WorkerGroupSpecs [0 ].Replicas ++
266
+ return k8sClient .Update (ctx , rayCluster )
267
+ })
268
+ Expect (err ).NotTo (HaveOccurred (), "Failed to update the pending RayCluster." )
269
+
270
+ // Confirm not switch to a new RayCluster
271
+ Consistently (
272
+ getPreparingRayClusterNameFunc (ctx , rayService ),
273
+ time .Second * 5 , time .Millisecond * 500 ).Should (Equal (clusterName ), "Pending RayCluster: %v" , rayService .Status .PendingServiceStatus .RayClusterName )
274
+ })
275
+
276
+ It ("Promote the pending RayCluster to the active RayCluster" , func () {
277
+ // Update the status of the head Pod to Running. Note that the default fake dashboard client
278
+ // will return a healthy serve application status.
279
+ pendingRayClusterName := rayService .Status .PendingServiceStatus .RayClusterName
280
+ updateHeadPodToRunningAndReady (ctx , pendingRayClusterName , namespace )
281
+
282
+ // Make sure the pending RayCluster becomes the active RayCluster.
283
+ Eventually (
284
+ getRayClusterNameFunc (ctx , rayService ),
285
+ time .Second * 15 , time .Millisecond * 500 ).Should (Equal (pendingRayClusterName ), "Active RayCluster name: %v" , rayService .Status .ActiveServiceStatus .RayClusterName )
286
+
287
+ // Initialize RayCluster for the following tests.
288
+ Eventually (
289
+ getResourceFunc (ctx , client.ObjectKey {Name : rayService .Status .ActiveServiceStatus .RayClusterName , Namespace : namespace }, rayCluster ),
290
+ time .Second * 3 , time .Millisecond * 500 ).Should (BeNil (), "RayCluster: %v" , rayCluster .Name )
291
+ })
292
+
293
+ It ("Autoscaler updates the active RayCluster and should not switch to a new RayCluster" , func () {
294
+ // Simulate autoscaler by updating the active RayCluster directly. Note that the autoscaler
295
+ // will not update the RayService directly.
296
+ clusterName , _ := getRayClusterNameFunc (ctx , rayService )()
297
+ err := retry .RetryOnConflict (retry .DefaultRetry , func () error {
298
+ Eventually (
299
+ getResourceFunc (ctx , client.ObjectKey {Name : clusterName , Namespace : namespace }, rayCluster ),
300
+ time .Second * 3 , time .Millisecond * 500 ).Should (BeNil (), "Active RayCluster: %v" , rayCluster .Name )
301
+ * rayCluster .Spec .WorkerGroupSpecs [0 ].Replicas ++
302
+ return k8sClient .Update (ctx , rayCluster )
303
+ })
304
+ Expect (err ).NotTo (HaveOccurred (), "Failed to update the active RayCluster." )
305
+
306
+ // Confirm not switch to a new RayCluster
307
+ Consistently (
308
+ getRayClusterNameFunc (ctx , rayService ),
309
+ time .Second * 5 , time .Millisecond * 500 ).Should (Equal (clusterName ), "Active RayCluster: %v" , rayService .Status .ActiveServiceStatus .RayClusterName )
310
+ })
311
+ })
312
+
236
313
Describe ("When creating a rayservice" , Ordered , func () {
237
314
ctx := context .TODO ()
238
315
var workerPods corev1.PodList
@@ -343,85 +420,6 @@ var _ = Context("RayService env tests", func() {
343
420
Expect (meta .IsStatusConditionFalse (rayService .Status .Conditions , string (rayv1 .UpgradeInProgress ))).Should (BeTrue ())
344
421
})
345
422
346
- It ("Autoscaler updates the active RayCluster and should not switch to a new RayCluster" , func () {
347
- // Simulate autoscaler by updating the active RayCluster directly. Note that the autoscaler
348
- // will not update the RayService directly.
349
- initialClusterName , _ := getRayClusterNameFunc (ctx , rayService )()
350
- err := retry .RetryOnConflict (retry .DefaultRetry , func () error {
351
- Eventually (
352
- getResourceFunc (ctx , client.ObjectKey {Name : initialClusterName , Namespace : "default" }, myRayCluster ),
353
- time .Second * 3 , time .Millisecond * 500 ).Should (BeNil (), "Active RayCluster = %v" , myRayCluster .Name )
354
- podToDelete := workerPods .Items [0 ]
355
- * myRayCluster .Spec .WorkerGroupSpecs [0 ].Replicas ++
356
- myRayCluster .Spec .WorkerGroupSpecs [0 ].ScaleStrategy .WorkersToDelete = []string {podToDelete .Name }
357
- return k8sClient .Update (ctx , myRayCluster )
358
- })
359
- Expect (err ).NotTo (HaveOccurred (), "failed to update test RayCluster" )
360
-
361
- // Confirm not switch to a new RayCluster
362
- Consistently (
363
- getRayClusterNameFunc (ctx , rayService ),
364
- time .Second * 5 , time .Millisecond * 500 ).Should (Equal (initialClusterName ), "My current RayCluster name = %v" , rayService .Status .ActiveServiceStatus .RayClusterName )
365
- Eventually (
366
- getResourceFunc (ctx , client.ObjectKey {Name : rayService .Status .ActiveServiceStatus .RayClusterName , Namespace : "default" }, myRayCluster ),
367
- time .Second * 3 , time .Millisecond * 500 ).Should (BeNil (), "My myRayCluster = %v" , myRayCluster .Name )
368
-
369
- cleanUpWorkersToDelete (ctx , myRayCluster )
370
- })
371
-
372
- It ("Autoscaler updates the pending RayCluster and should not switch to a new RayCluster" , func () {
373
- // Simulate autoscaler by updating the pending RayCluster directly. Note that the autoscaler
374
- // will not update the RayService directly.
375
-
376
- // Trigger a new RayCluster preparation by updating the RayVersion.
377
- oldRayVersion := rayService .Spec .RayClusterSpec .RayVersion
378
- newRayVersion := "2.200.0"
379
- Expect (oldRayVersion ).ShouldNot (Equal (newRayVersion ))
380
- err := retry .RetryOnConflict (retry .DefaultRetry , func () error {
381
- Eventually (
382
- getResourceFunc (ctx , client.ObjectKey {Name : rayService .Name , Namespace : "default" }, rayService ),
383
- time .Second * 3 , time .Millisecond * 500 ).Should (BeNil (), "My myRayService = %v" , rayService .Name )
384
- rayService .Spec .RayClusterSpec .RayVersion = newRayVersion
385
- return k8sClient .Update (ctx , rayService )
386
- })
387
- Expect (err ).NotTo (HaveOccurred (), "failed to update test RayService resource" )
388
- Eventually (
389
- getPreparingRayClusterNameFunc (ctx , rayService ),
390
- time .Second * 60 , time .Millisecond * 500 ).Should (Not (BeEmpty ()), "New pending RayCluster name = %v" , rayService .Status .PendingServiceStatus .RayClusterName )
391
- initialPendingClusterName , _ := getPreparingRayClusterNameFunc (ctx , rayService )()
392
-
393
- // Simulate that the pending RayCluster is updated by the autoscaler.
394
- err = retry .RetryOnConflict (retry .DefaultRetry , func () error {
395
- Eventually (
396
- getResourceFunc (ctx , client.ObjectKey {Name : initialPendingClusterName , Namespace : "default" }, myRayCluster ),
397
- time .Second * 15 , time .Millisecond * 500 ).Should (BeNil (), "Pending RayCluster = %v" , myRayCluster .Name )
398
- podToDelete := workerPods .Items [0 ]
399
- * myRayCluster .Spec .WorkerGroupSpecs [0 ].Replicas ++
400
- myRayCluster .Spec .WorkerGroupSpecs [0 ].ScaleStrategy .WorkersToDelete = []string {podToDelete .Name }
401
- return k8sClient .Update (ctx , myRayCluster )
402
- })
403
- Expect (err ).NotTo (HaveOccurred (), "Failed to update the pending RayCluster." )
404
-
405
- // Confirm not switch to a new RayCluster when the pending RayCluster triggers autoscaler.
406
- Consistently (
407
- getPreparingRayClusterNameFunc (ctx , rayService ),
408
- time .Second * 5 , time .Millisecond * 500 ).Should (Equal (initialPendingClusterName ), "Pending RayCluster name = %v" , rayService .Status .PendingServiceStatus .RayClusterName )
409
-
410
- // The pending RayCluster will become the active RayCluster after:
411
- // (1) The pending RayCluster's head Pod becomes Running and Ready
412
- // (2) The pending RayCluster's Serve Deployments are HEALTHY.
413
- updateHeadPodToRunningAndReady (ctx , initialPendingClusterName , "default" )
414
- healthyStatus := generateServeStatus (rayv1 .DeploymentStatusEnum .HEALTHY , rayv1 .ApplicationStatusEnum .RUNNING )
415
- fakeRayDashboardClient .SetMultiApplicationStatuses (map [string ]* utils.ServeApplicationStatus {serveAppName : & healthyStatus })
416
- Eventually (
417
- getPreparingRayClusterNameFunc (ctx , rayService ),
418
- time .Second * 15 , time .Millisecond * 500 ).Should (BeEmpty (), "Pending RayCluster name = %v" , rayService .Status .PendingServiceStatus .RayClusterName )
419
- Eventually (
420
- getRayClusterNameFunc (ctx , rayService ),
421
- time .Second * 15 , time .Millisecond * 500 ).Should (Equal (initialPendingClusterName ), "New active RayCluster name = %v" , rayService .Status .ActiveServiceStatus .RayClusterName )
422
-
423
- cleanUpWorkersToDelete (ctx , myRayCluster )
424
- })
425
423
It ("should update the active RayCluster in place when WorkerGroupSpecs are modified by the user in RayServiceSpec" , func () {
426
424
initialClusterName , _ := getRayClusterNameFunc (ctx , rayService )()
427
425
oldNumWorkerGroupSpecs := len (rayService .Spec .RayClusterSpec .WorkerGroupSpecs )
@@ -553,29 +551,6 @@ var _ = Context("RayService env tests", func() {
553
551
time .Second * 3 , time .Millisecond * 500 ).Should (BeTrue (), "myRayService status = %v" , rayService .Status )
554
552
})
555
553
556
- It ("Update workerGroup.replicas in RayService and should not switch to new Ray Cluster" , func () {
557
- // Certain field updates should not trigger new RayCluster preparation, such as updates
558
- // to `Replicas` and `WorkersToDelete` triggered by the autoscaler during scaling up/down.
559
- // See the function `generateRayClusterJsonHash` for more details.
560
- initialClusterName , _ := getRayClusterNameFunc (ctx , rayService )()
561
- err := retry .RetryOnConflict (retry .DefaultRetry , func () error {
562
- Eventually (
563
- getResourceFunc (ctx , client.ObjectKey {Name : rayService .Name , Namespace : "default" }, rayService ),
564
- time .Second * 3 , time .Millisecond * 500 ).Should (BeNil (), "My myRayService = %v" , rayService .Name )
565
- * rayService .Spec .RayClusterSpec .WorkerGroupSpecs [0 ].Replicas ++
566
- return k8sClient .Update (ctx , rayService )
567
- })
568
- Expect (err ).NotTo (HaveOccurred (), "failed to update test RayService resource" )
569
-
570
- // Confirm not switch to a new RayCluster
571
- Consistently (
572
- getRayClusterNameFunc (ctx , rayService ),
573
- time .Second * 5 , time .Millisecond * 500 ).Should (Equal (initialClusterName ), "My current RayCluster name = %v" , rayService .Status .ActiveServiceStatus .RayClusterName )
574
- Eventually (
575
- getResourceFunc (ctx , client.ObjectKey {Name : rayService .Status .ActiveServiceStatus .RayClusterName , Namespace : "default" }, myRayCluster ),
576
- time .Second * 3 , time .Millisecond * 500 ).Should (BeNil (), "My myRayCluster = %v" , myRayCluster .Name )
577
- })
578
-
579
554
It ("should perform a zero-downtime update after a code change." , func () {
580
555
initialClusterName , _ := getRayClusterNameFunc (ctx , rayService )()
581
556
0 commit comments