
Commit 7e07ea4

[Cherry-pick][Bug] All worker Pods are deleted if using KubeRay v1.0.0 CRD with KubeRay operator v1.1.0 image (#2087) (#2120)
1 parent c75f3c4 commit 7e07ea4

File tree: 2 files changed, +45 -1 lines

ray-operator/controllers/ray/raycluster_controller.go

Lines changed: 6 additions & 1 deletion

@@ -796,10 +796,15 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv
             }
         }
         // A replica can contain multiple hosts, so we need to calculate this based on the number of hosts per replica.
+        // If the user doesn't install the CRD with `NumOfHosts`, the zero value of `NumOfHosts`, which is 0, will be used.
+        // Hence, all workers will be deleted. Here, we set `NumOfHosts` to max(1, `NumOfHosts`) to avoid this situation.
+        if worker.NumOfHosts <= 0 {
+            worker.NumOfHosts = 1
+        }
         numExpectedPods := workerReplicas * worker.NumOfHosts
         diff := numExpectedPods - int32(len(runningPods.Items))
 
-        logger.Info("reconcilePods", "workerReplicas", workerReplicas, "runningPods", len(runningPods.Items), "diff", diff)
+        logger.Info("reconcilePods", "workerReplicas", workerReplicas, "NumOfHosts", worker.NumOfHosts, "runningPods", len(runningPods.Items), "diff", diff)
 
         if diff > 0 {
             // pods need to be added
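In short, the bug is plain Go zero-value arithmetic: with the v1.0.0 CRD the `NumOfHosts` field does not exist, so it decodes to 0, `numExpectedPods = workerReplicas * 0 = 0`, and the resulting negative `diff` tells the reconciler to delete every running worker Pod. A minimal standalone sketch of that arithmetic and the clamp above (illustrative only, not the operator's actual code):

package main

import "fmt"

func main() {
    // With a v1.0.0 CRD, `NumOfHosts` is absent and decodes to Go's zero value.
    var numOfHosts int32 = 0
    workerReplicas := int32(3) // replicas in the worker group
    runningPods := int32(3)    // worker Pods currently running

    // Without the guard: numExpectedPods = 3 * 0 = 0 and diff = -3,
    // so the reconciler would delete all three running worker Pods.
    fmt.Println(workerReplicas*numOfHosts - runningPods) // -3

    // With the guard from the diff above, NumOfHosts becomes max(1, NumOfHosts):
    if numOfHosts <= 0 {
        numOfHosts = 1
    }
    fmt.Println(workerReplicas*numOfHosts - runningPods) // 0: nothing to delete
}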

ray-operator/controllers/ray/raycluster_controller_test.go

Lines changed: 39 additions & 0 deletions

@@ -686,4 +686,43 @@ var _ = Context("Inside the default namespace", func() {
             Expect(rayCluster.Status.DesiredCPU).To(Equal(desiredCPU))
         })
     })
+
+    Describe("RayCluster with invalid NumOfHosts", func() {
+        // Some users only upgrade the KubeRay image without upgrading the CRD. For example, when a
+        // user upgrades the KubeRay operator from v1.0.0 to v1.1.0 without upgrading the CRD, the
+        // KubeRay operator will use the zero value of `NumOfHosts` in the CRD. Hence, all worker
+        // Pods will be deleted. This test case is designed to prevent Pods from being deleted.
+        ctx := context.Background()
+        namespace := "default"
+        rayCluster := rayClusterTemplate("raycluster-invalid-numofhosts", namespace)
+        numOfHosts := int32(0)
+        rayCluster.Spec.WorkerGroupSpecs[0].NumOfHosts = numOfHosts
+        workerPods := corev1.PodList{}
+        workerFilters := common.RayClusterGroupPodsAssociationOptions(rayCluster, rayCluster.Spec.WorkerGroupSpecs[0].GroupName).ToListOptions()
+
+        It("Verify RayCluster spec", func() {
+            // These tests are designed based on the following assumptions:
+            // (1) There is only one worker group, its `replicas` is set to 3, and `workersToDelete` is empty.
+            // (2) The worker group has an invalid `numOfHosts` value of 0.
+            Expect(len(rayCluster.Spec.WorkerGroupSpecs)).To(Equal(1))
+            Expect(rayCluster.Spec.WorkerGroupSpecs[0].NumOfHosts).To(Equal(numOfHosts))
+            Expect(rayCluster.Spec.WorkerGroupSpecs[0].Replicas).To(Equal(pointer.Int32(3)))
+            Expect(rayCluster.Spec.WorkerGroupSpecs[0].ScaleStrategy.WorkersToDelete).To(BeEmpty())
+        })
+
+        It("Create a RayCluster custom resource", func() {
+            err := k8sClient.Create(ctx, rayCluster)
+            Expect(err).NotTo(HaveOccurred(), "Failed to create RayCluster")
+            Eventually(
+                getResourceFunc(ctx, client.ObjectKey{Name: rayCluster.Name, Namespace: namespace}, rayCluster),
+                time.Second*3, time.Millisecond*500).Should(BeNil(), "Should be able to see RayCluster: %v", rayCluster.Name)
+        })
+
+        It("Check the number of worker Pods", func() {
+            numWorkerPods := 3 // replicas (3) * NumOfHosts clamped from 0 to 1
+            Eventually(
+                listResourceFunc(ctx, &workerPods, workerFilters...),
+                time.Second*3, time.Millisecond*500).Should(Equal(numWorkerPods), fmt.Sprintf("workerGroup %v", workerPods.Items))
+        })
+    })
 })
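The scenario this test encodes comes down to Go's JSON decoding: a RayCluster written against the v1.0.0 CRD simply has no `numOfHosts` key, so the field decodes to 0. A minimal sketch of that mechanism, assuming a simplified stand-in struct (`workerGroupSpec` below is hypothetical; the real type lives in KubeRay's API package):

package main

import (
    "encoding/json"
    "fmt"
)

// workerGroupSpec is a simplified, hypothetical stand-in for the relevant
// fields of KubeRay's WorkerGroupSpec; it is not the real type.
type workerGroupSpec struct {
    Replicas   int32 `json:"replicas"`
    NumOfHosts int32 `json:"numOfHosts"`
}

func main() {
    // A spec created under the v1.0.0 CRD has no `numOfHosts` key,
    // so the field keeps Go's zero value after decoding.
    oldSpec := []byte(`{"replicas": 3}`)

    var spec workerGroupSpec
    if err := json.Unmarshal(oldSpec, &spec); err != nil {
        panic(err)
    }
    fmt.Println(spec.NumOfHosts)                 // 0
    fmt.Println(spec.Replicas * spec.NumOfHosts) // 0 expected Pods -> all workers deleted
}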
