Commit c807790

[RayService] Add an envtest for autoscaler (#2872)
1 parent 77a1023 commit c807790
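
The new tests simulate the autoscaler by mutating the RayCluster directly (incrementing `Replicas`) inside `retry.RetryOnConflict`, so a concurrent write from the RayService controller only forces a retry instead of failing the test. Below is a minimal standalone sketch of that retry pattern; the conflict is faked with a canned error, and the resource/name strings are illustrative only (the real test re-reads the cluster via `getResourceFunc` before each mutation):

    package main

    import (
        "fmt"

        apierrors "k8s.io/apimachinery/pkg/api/errors"
        "k8s.io/apimachinery/pkg/runtime/schema"
        "k8s.io/client-go/util/retry"
    )

    func main() {
        attempts := 0
        replicas := int32(1)

        // RetryOnConflict re-runs the closure whenever it returns a Conflict error,
        // which is how the test absorbs optimistic-concurrency failures while the
        // RayService controller writes to the same RayCluster object.
        err := retry.RetryOnConflict(retry.DefaultRetry, func() error {
            attempts++
            current := replicas // stands in for re-reading the object on each attempt
            current++
            if attempts == 1 {
                // Fake a stale-resourceVersion rejection on the first write.
                return apierrors.NewConflict(
                    schema.GroupResource{Group: "ray.io", Resource: "rayclusters"},
                    "test-autoscaler",
                    fmt.Errorf("the object has been modified"))
            }
            replicas = current // stands in for k8sClient.Update succeeding
            return nil
        })
        fmt.Printf("err=%v attempts=%d replicas=%d\n", err, attempts, replicas)
    }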

File tree

1 file changed: +79 -104 lines changed

ray-operator/controllers/ray/rayservice_controller_test.go

Lines changed: 79 additions & 104 deletions
@@ -21,9 +21,8 @@ import (
 	"time"
 
 	"github.com/ray-project/kuberay/ray-operator/controllers/ray/common"
-	"github.com/ray-project/kuberay/ray-operator/test/support"
-
 	"github.com/ray-project/kuberay/ray-operator/controllers/ray/utils"
+	"github.com/ray-project/kuberay/ray-operator/test/support"
 
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"

@@ -233,6 +232,84 @@ var _ = Context("RayService env tests", func() {
 		})
 	})
 
+	Describe("Autoscaler updates RayCluster should not trigger zero downtime upgrade", Ordered, func() {
+		// If Autoscaler scales up the pending or active RayCluster, zero downtime upgrade should not be triggered.
+		ctx := context.Background()
+		namespace := "default"
+		serveAppName := "app1"
+		rayService := rayServiceTemplate("test-autoscaler", namespace, serveAppName)
+		rayCluster := &rayv1.RayCluster{}
+
+		It("Create a RayService custom resource", func() {
+			err := k8sClient.Create(ctx, rayService)
+			Expect(err).NotTo(HaveOccurred(), "failed to create RayService resource")
+			Eventually(
+				getResourceFunc(ctx, client.ObjectKey{Name: rayService.Name, Namespace: namespace}, rayService),
+				time.Second*3, time.Millisecond*500).Should(BeNil(), "RayService: %v", rayService.Name)
+		})
+
+		It("Should create a pending RayCluster", func() {
+			Eventually(
+				getPreparingRayClusterNameFunc(ctx, rayService),
+				time.Second*15, time.Millisecond*500).Should(Not(BeEmpty()), "Pending RayCluster name: %v", rayService.Status.PendingServiceStatus.RayClusterName)
+		})
+
+		It("Autoscaler updates the pending RayCluster and should not switch to a new RayCluster", func() {
+			// Simulate autoscaler by updating the pending RayCluster directly. Note that the autoscaler
+			// will not update the RayService directly.
+			clusterName, _ := getPreparingRayClusterNameFunc(ctx, rayService)()
+			err := retry.RetryOnConflict(retry.DefaultRetry, func() error {
+				Eventually(
+					getResourceFunc(ctx, client.ObjectKey{Name: clusterName, Namespace: namespace}, rayCluster),
+					time.Second*3, time.Millisecond*500).Should(BeNil(), "Pending RayCluster: %v", rayCluster.Name)
+				*rayCluster.Spec.WorkerGroupSpecs[0].Replicas++
+				return k8sClient.Update(ctx, rayCluster)
+			})
+			Expect(err).NotTo(HaveOccurred(), "Failed to update the pending RayCluster.")
+
+			// Confirm not switch to a new RayCluster
+			Consistently(
+				getPreparingRayClusterNameFunc(ctx, rayService),
+				time.Second*5, time.Millisecond*500).Should(Equal(clusterName), "Pending RayCluster: %v", rayService.Status.PendingServiceStatus.RayClusterName)
+		})
+
+		It("Promote the pending RayCluster to the active RayCluster", func() {
+			// Update the status of the head Pod to Running. Note that the default fake dashboard client
+			// will return a healthy serve application status.
+			pendingRayClusterName := rayService.Status.PendingServiceStatus.RayClusterName
+			updateHeadPodToRunningAndReady(ctx, pendingRayClusterName, namespace)
+
+			// Make sure the pending RayCluster becomes the active RayCluster.
+			Eventually(
+				getRayClusterNameFunc(ctx, rayService),
+				time.Second*15, time.Millisecond*500).Should(Equal(pendingRayClusterName), "Active RayCluster name: %v", rayService.Status.ActiveServiceStatus.RayClusterName)
+
+			// Initialize RayCluster for the following tests.
+			Eventually(
+				getResourceFunc(ctx, client.ObjectKey{Name: rayService.Status.ActiveServiceStatus.RayClusterName, Namespace: namespace}, rayCluster),
+				time.Second*3, time.Millisecond*500).Should(BeNil(), "RayCluster: %v", rayCluster.Name)
+		})
+
+		It("Autoscaler updates the active RayCluster and should not switch to a new RayCluster", func() {
+			// Simulate autoscaler by updating the active RayCluster directly. Note that the autoscaler
+			// will not update the RayService directly.
+			clusterName, _ := getRayClusterNameFunc(ctx, rayService)()
+			err := retry.RetryOnConflict(retry.DefaultRetry, func() error {
+				Eventually(
+					getResourceFunc(ctx, client.ObjectKey{Name: clusterName, Namespace: namespace}, rayCluster),
+					time.Second*3, time.Millisecond*500).Should(BeNil(), "Active RayCluster: %v", rayCluster.Name)
+				*rayCluster.Spec.WorkerGroupSpecs[0].Replicas++
+				return k8sClient.Update(ctx, rayCluster)
+			})
+			Expect(err).NotTo(HaveOccurred(), "Failed to update the active RayCluster.")
+
+			// Confirm not switch to a new RayCluster
+			Consistently(
+				getRayClusterNameFunc(ctx, rayService),
+				time.Second*5, time.Millisecond*500).Should(Equal(clusterName), "Active RayCluster: %v", rayService.Status.ActiveServiceStatus.RayClusterName)
+		})
+	})
+
 	Describe("When creating a rayservice", Ordered, func() {
 		ctx := context.TODO()
 		var workerPods corev1.PodList

@@ -343,85 +420,6 @@ var _ = Context("RayService env tests", func() {
 			Expect(meta.IsStatusConditionFalse(rayService.Status.Conditions, string(rayv1.UpgradeInProgress))).Should(BeTrue())
 		})
 
-		It("Autoscaler updates the active RayCluster and should not switch to a new RayCluster", func() {
-			// Simulate autoscaler by updating the active RayCluster directly. Note that the autoscaler
-			// will not update the RayService directly.
-			initialClusterName, _ := getRayClusterNameFunc(ctx, rayService)()
-			err := retry.RetryOnConflict(retry.DefaultRetry, func() error {
-				Eventually(
-					getResourceFunc(ctx, client.ObjectKey{Name: initialClusterName, Namespace: "default"}, myRayCluster),
-					time.Second*3, time.Millisecond*500).Should(BeNil(), "Active RayCluster = %v", myRayCluster.Name)
-				podToDelete := workerPods.Items[0]
-				*myRayCluster.Spec.WorkerGroupSpecs[0].Replicas++
-				myRayCluster.Spec.WorkerGroupSpecs[0].ScaleStrategy.WorkersToDelete = []string{podToDelete.Name}
-				return k8sClient.Update(ctx, myRayCluster)
-			})
-			Expect(err).NotTo(HaveOccurred(), "failed to update test RayCluster")
-
-			// Confirm not switch to a new RayCluster
-			Consistently(
-				getRayClusterNameFunc(ctx, rayService),
-				time.Second*5, time.Millisecond*500).Should(Equal(initialClusterName), "My current RayCluster name = %v", rayService.Status.ActiveServiceStatus.RayClusterName)
-			Eventually(
-				getResourceFunc(ctx, client.ObjectKey{Name: rayService.Status.ActiveServiceStatus.RayClusterName, Namespace: "default"}, myRayCluster),
-				time.Second*3, time.Millisecond*500).Should(BeNil(), "My myRayCluster = %v", myRayCluster.Name)
-
-			cleanUpWorkersToDelete(ctx, myRayCluster)
-		})
-
-		It("Autoscaler updates the pending RayCluster and should not switch to a new RayCluster", func() {
-			// Simulate autoscaler by updating the pending RayCluster directly. Note that the autoscaler
-			// will not update the RayService directly.
-
-			// Trigger a new RayCluster preparation by updating the RayVersion.
-			oldRayVersion := rayService.Spec.RayClusterSpec.RayVersion
-			newRayVersion := "2.200.0"
-			Expect(oldRayVersion).ShouldNot(Equal(newRayVersion))
-			err := retry.RetryOnConflict(retry.DefaultRetry, func() error {
-				Eventually(
-					getResourceFunc(ctx, client.ObjectKey{Name: rayService.Name, Namespace: "default"}, rayService),
-					time.Second*3, time.Millisecond*500).Should(BeNil(), "My myRayService = %v", rayService.Name)
-				rayService.Spec.RayClusterSpec.RayVersion = newRayVersion
-				return k8sClient.Update(ctx, rayService)
-			})
-			Expect(err).NotTo(HaveOccurred(), "failed to update test RayService resource")
-			Eventually(
-				getPreparingRayClusterNameFunc(ctx, rayService),
-				time.Second*60, time.Millisecond*500).Should(Not(BeEmpty()), "New pending RayCluster name = %v", rayService.Status.PendingServiceStatus.RayClusterName)
-			initialPendingClusterName, _ := getPreparingRayClusterNameFunc(ctx, rayService)()
-
-			// Simulate that the pending RayCluster is updated by the autoscaler.
-			err = retry.RetryOnConflict(retry.DefaultRetry, func() error {
-				Eventually(
-					getResourceFunc(ctx, client.ObjectKey{Name: initialPendingClusterName, Namespace: "default"}, myRayCluster),
-					time.Second*15, time.Millisecond*500).Should(BeNil(), "Pending RayCluster = %v", myRayCluster.Name)
-				podToDelete := workerPods.Items[0]
-				*myRayCluster.Spec.WorkerGroupSpecs[0].Replicas++
-				myRayCluster.Spec.WorkerGroupSpecs[0].ScaleStrategy.WorkersToDelete = []string{podToDelete.Name}
-				return k8sClient.Update(ctx, myRayCluster)
-			})
-			Expect(err).NotTo(HaveOccurred(), "Failed to update the pending RayCluster.")
-
-			// Confirm not switch to a new RayCluster when the pending RayCluster triggers autoscaler.
-			Consistently(
-				getPreparingRayClusterNameFunc(ctx, rayService),
-				time.Second*5, time.Millisecond*500).Should(Equal(initialPendingClusterName), "Pending RayCluster name = %v", rayService.Status.PendingServiceStatus.RayClusterName)
-
-			// The pending RayCluster will become the active RayCluster after:
-			// (1) The pending RayCluster's head Pod becomes Running and Ready
-			// (2) The pending RayCluster's Serve Deployments are HEALTHY.
-			updateHeadPodToRunningAndReady(ctx, initialPendingClusterName, "default")
-			healthyStatus := generateServeStatus(rayv1.DeploymentStatusEnum.HEALTHY, rayv1.ApplicationStatusEnum.RUNNING)
-			fakeRayDashboardClient.SetMultiApplicationStatuses(map[string]*utils.ServeApplicationStatus{serveAppName: &healthyStatus})
-			Eventually(
-				getPreparingRayClusterNameFunc(ctx, rayService),
-				time.Second*15, time.Millisecond*500).Should(BeEmpty(), "Pending RayCluster name = %v", rayService.Status.PendingServiceStatus.RayClusterName)
-			Eventually(
-				getRayClusterNameFunc(ctx, rayService),
-				time.Second*15, time.Millisecond*500).Should(Equal(initialPendingClusterName), "New active RayCluster name = %v", rayService.Status.ActiveServiceStatus.RayClusterName)
-
-			cleanUpWorkersToDelete(ctx, myRayCluster)
-		})
 		It("should update the active RayCluster in place when WorkerGroupSpecs are modified by the user in RayServiceSpec", func() {
 			initialClusterName, _ := getRayClusterNameFunc(ctx, rayService)()
 			oldNumWorkerGroupSpecs := len(rayService.Spec.RayClusterSpec.WorkerGroupSpecs)

@@ -553,29 +551,6 @@ var _ = Context("RayService env tests", func() {
 				time.Second*3, time.Millisecond*500).Should(BeTrue(), "myRayService status = %v", rayService.Status)
 		})
 
-		It("Update workerGroup.replicas in RayService and should not switch to new Ray Cluster", func() {
-			// Certain field updates should not trigger new RayCluster preparation, such as updates
-			// to `Replicas` and `WorkersToDelete` triggered by the autoscaler during scaling up/down.
-			// See the function `generateRayClusterJsonHash` for more details.
-			initialClusterName, _ := getRayClusterNameFunc(ctx, rayService)()
-			err := retry.RetryOnConflict(retry.DefaultRetry, func() error {
-				Eventually(
-					getResourceFunc(ctx, client.ObjectKey{Name: rayService.Name, Namespace: "default"}, rayService),
-					time.Second*3, time.Millisecond*500).Should(BeNil(), "My myRayService = %v", rayService.Name)
-				*rayService.Spec.RayClusterSpec.WorkerGroupSpecs[0].Replicas++
-				return k8sClient.Update(ctx, rayService)
-			})
-			Expect(err).NotTo(HaveOccurred(), "failed to update test RayService resource")
-
-			// Confirm not switch to a new RayCluster
-			Consistently(
-				getRayClusterNameFunc(ctx, rayService),
-				time.Second*5, time.Millisecond*500).Should(Equal(initialClusterName), "My current RayCluster name = %v", rayService.Status.ActiveServiceStatus.RayClusterName)
-			Eventually(
-				getResourceFunc(ctx, client.ObjectKey{Name: rayService.Status.ActiveServiceStatus.RayClusterName, Namespace: "default"}, myRayCluster),
-				time.Second*3, time.Millisecond*500).Should(BeNil(), "My myRayCluster = %v", myRayCluster.Name)
-		})
-
 		It("should perform a zero-downtime update after a code change.", func() {
 			initialClusterName, _ := getRayClusterNameFunc(ctx, rayService)()
 
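A note on the two Gomega helpers these tests alternate between: `Eventually` polls until the matcher first succeeds within the timeout, while `Consistently` requires the matcher to hold for the entire window, which is how the tests prove that a `Replicas` bump never flips the pending or active cluster name. A small self-contained sketch of the distinction (the cluster name below is a made-up placeholder):

    package rayservice_test

    import (
        "testing"
        "time"

        "github.com/onsi/gomega"
    )

    func TestEventuallyVsConsistently(t *testing.T) {
        g := gomega.NewWithT(t)

        start := time.Now()
        // Eventually: poll every 100ms and pass as soon as the condition holds,
        // analogous to waiting for the pending RayCluster name to become non-empty.
        g.Eventually(func() bool {
            return time.Since(start) > 500*time.Millisecond
        }, time.Second*3, time.Millisecond*100).Should(gomega.BeTrue())

        clusterName := "test-autoscaler-abcde" // hypothetical generated name
        // Consistently: the matcher must keep passing for the whole 2s window,
        // analogous to asserting the cluster name never changes, i.e. the
        // Replicas update did not trigger a zero-downtime upgrade.
        g.Consistently(func() string {
            return clusterName
        }, time.Second*2, time.Millisecond*100).Should(gomega.Equal(clusterName))
    }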