
Commit ead62eb

[Cherry-pick][Bug] Ray operator crashes when specifying RayCluster with resources.limits but no resources.requests (#2077) (#2119)
1 parent 8adc538 commit ead62eb
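
The crash in the title comes down to a nil-map write: `corev1.ResourceList` is a Go map, and a container that sets `resources.limits` without `resources.requests` leaves `container.Resources.Requests` nil, so back-filling missing requests from the limits panics with `assignment to entry in nil map` and brings the reconciler down. Below is a minimal, self-contained sketch of the failure mode and of the guard this commit adds (types simplified; this is not the operator's actual code):

```go
package main

import "fmt"

// ResourceList stands in for corev1.ResourceList: a map from resource
// name to quantity (quantities are plain strings here to keep the
// sketch dependency-free).
type ResourceList map[string]string

func main() {
	// A container with resources.limits but no resources.requests:
	// the requests map is nil.
	var requests ResourceList // nil
	limits := ResourceList{"cpu": "1", "memory": "1Gi"}

	// The guard added by this commit. Without it, the assignment in the
	// loop below panics with "assignment to entry in nil map".
	if requests == nil {
		requests = ResourceList{}
	}

	// Back-fill any resource that only has a limit, mirroring what the
	// operator does when accounting for a Pod's resources.
	for name, quantity := range limits {
		if _, ok := requests[name]; !ok {
			requests[name] = quantity
		}
	}

	fmt.Println(requests) // map[cpu:1 memory:1Gi]
}
```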

File tree: 2 files changed, +91 -0 lines changed


ray-operator/controllers/ray/raycluster_controller_test.go

Lines changed: 88 additions & 0 deletions
```diff
@@ -31,6 +31,7 @@ import (
 	rbacv1 "k8s.io/api/rbac/v1"
 	"k8s.io/utils/pointer"
 
+	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/client-go/util/retry"
@@ -598,4 +599,91 @@ var _ = Context("Inside the default namespace", func() {
 			Expect(len(headPods.Items)).Should(Equal(1), "headPods: %v", headPods.Items)
 		})
 	})
+
+	Describe("RayCluster without resource request", func() {
+		ctx := context.Background()
+		namespace := "default"
+		rayCluster := rayClusterTemplate("no-resource-req", namespace)
+		rayCluster.Spec.HeadGroupSpec.Template.Spec.Containers[0].Resources.Limits = corev1.ResourceList{
+			corev1.ResourceCPU:    resource.MustParse("1"),
+			corev1.ResourceMemory: resource.MustParse("1Gi"),
+		}
+		rayCluster.Spec.WorkerGroupSpecs[0].Template.Spec.Containers[0].Resources.Limits = corev1.ResourceList{
+			corev1.ResourceCPU:    resource.MustParse("1"),
+			corev1.ResourceMemory: resource.MustParse("1Gi"),
+		}
+		headPods := corev1.PodList{}
+		workerPods := corev1.PodList{}
+		workerFilters := common.RayClusterGroupPodsAssociationOptions(rayCluster, rayCluster.Spec.WorkerGroupSpecs[0].GroupName).ToListOptions()
+		headFilters := common.RayClusterHeadPodsAssociationOptions(rayCluster).ToListOptions()
+
+		It("Verify RayCluster spec", func() {
+			// These tests are designed based on the following assumptions:
+			// (1) Both head and worker Pods do not have resource requests, but they have resource limits.
+			// (2) There is only one worker group, and its `replicas` is set to 3.
+			Expect(rayCluster.Spec.HeadGroupSpec.Template.Spec.Containers[0].Resources.Requests).To(BeNil())
+			Expect(rayCluster.Spec.WorkerGroupSpecs[0].Template.Spec.Containers[0].Resources.Requests).To(BeNil())
+			Expect(rayCluster.Spec.HeadGroupSpec.Template.Spec.Containers[0].Resources.Limits).NotTo(BeNil())
+			Expect(rayCluster.Spec.WorkerGroupSpecs[0].Template.Spec.Containers[0].Resources.Limits).NotTo(BeNil())
+			Expect(len(rayCluster.Spec.WorkerGroupSpecs)).To(Equal(1))
+			Expect(rayCluster.Spec.WorkerGroupSpecs[0].Replicas).To(Equal(pointer.Int32(3)))
+		})
+
+		It("Create a RayCluster custom resource", func() {
+			err := k8sClient.Create(ctx, rayCluster)
+			Expect(err).NotTo(HaveOccurred(), "Failed to create RayCluster")
+			Eventually(
+				getResourceFunc(ctx, client.ObjectKey{Name: rayCluster.Name, Namespace: namespace}, rayCluster),
+				time.Second*3, time.Millisecond*500).Should(BeNil(), "Should be able to see RayCluster: %v", rayCluster.Name)
+		})
+
+		It("Check the number of worker Pods", func() {
+			numWorkerPods := 3
+			Eventually(
+				listResourceFunc(ctx, &workerPods, workerFilters...),
+				time.Second*3, time.Millisecond*500).Should(Equal(numWorkerPods), fmt.Sprintf("workerGroup %v", workerPods.Items))
+		})
+
+		It("Create a head Pod", func() {
+			err := k8sClient.List(ctx, &headPods, headFilters...)
+			Expect(err).NotTo(HaveOccurred(), "Failed to list head Pods")
+			Expect(len(headPods.Items)).Should(Equal(1), "headPods: %v", headPods.Items)
+		})
+
+		It("Update all Pods to Running", func() {
+			for _, headPod := range headPods.Items {
+				headPod.Status.Phase = corev1.PodRunning
+				Expect(k8sClient.Status().Update(ctx, &headPod)).Should(BeNil())
+			}
+
+			Eventually(
+				isAllPodsRunningByFilters(ctx, headPods, headFilters...),
+				time.Second*3, time.Millisecond*500).Should(Equal(true), "Head Pod should be running.")
+
+			for _, workerPod := range workerPods.Items {
+				workerPod.Status.Phase = corev1.PodRunning
+				Expect(k8sClient.Status().Update(ctx, &workerPod)).Should(BeNil())
+			}
+
+			Eventually(
+				isAllPodsRunningByFilters(ctx, workerPods, workerFilters...),
+				time.Second*3, time.Millisecond*500).Should(Equal(true), "All worker Pods should be running.")
+		})
+
+		It("RayCluster's .status.state should be updated to 'ready' shortly after all Pods are Running", func() {
+			Eventually(
+				getClusterState(ctx, namespace, rayCluster.Name),
+				time.Second*3, time.Millisecond*500).Should(Equal(rayv1.Ready))
+		})
+
+		It("Check DesiredMemory and DesiredCPU", func() {
+			Eventually(
+				getResourceFunc(ctx, client.ObjectKey{Name: rayCluster.Name, Namespace: namespace}, rayCluster),
+				time.Second*3, time.Millisecond*500).Should(BeNil(), "Should be able to see RayCluster: %v", rayCluster.Name)
+			desiredMemory := resource.MustParse("4Gi")
+			desiredCPU := resource.MustParse("4")
+			Expect(rayCluster.Status.DesiredMemory).To(Equal(desiredMemory))
+			Expect(rayCluster.Status.DesiredCPU).To(Equal(desiredCPU))
+		})
+	})
 })
```
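
The final `It` block pins down the arithmetic: one head Pod plus three worker replicas, each accounted at its 1 CPU / 1Gi limit because no requests are set, should yield `DesiredCPU` of `4` and `DesiredMemory` of `4Gi`. A standalone check of that sum with the same `resource` package (an illustration only, not part of the test suite):

```go
package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	// Per-Pod accounting for the test cluster: limits stand in for the
	// missing requests, so each Pod counts as 1 CPU and 1Gi of memory.
	perPodCPU := resource.MustParse("1")
	perPodMem := resource.MustParse("1Gi")

	// Start with the head Pod, then add the three worker replicas.
	totalCPU := perPodCPU.DeepCopy()
	totalMem := perPodMem.DeepCopy()
	for i := 0; i < 3; i++ {
		totalCPU.Add(perPodCPU)
		totalMem.Add(perPodMem)
	}

	// Matches the expectations on Status.DesiredCPU and Status.DesiredMemory.
	fmt.Println(totalCPU.String(), totalMem.String()) // 4 4Gi
}
```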

ray-operator/controllers/ray/utils/util.go

Lines changed: 3 additions & 0 deletions
```diff
@@ -338,6 +338,9 @@ func calculatePodResource(podSpec corev1.PodSpec) corev1.ResourceList {
 	podResource := corev1.ResourceList{}
 	for _, container := range podSpec.Containers {
 		containerResource := container.Resources.Requests
+		if containerResource == nil {
+			containerResource = corev1.ResourceList{}
+		}
 		for name, quantity := range container.Resources.Limits {
 			if _, ok := containerResource[name]; !ok {
 				containerResource[name] = quantity
```
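
For context, here is a trimmed-down, self-contained rendering of the patched accounting path (it mirrors `calculatePodResource` rather than reproducing it verbatim): requests are used when present, the nil-map guard added above protects limits-only containers, and limits fill in any resource that has no request.

```go
package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// podResourceSketch mirrors the patched logic: start from each container's
// requests (guarding against a nil map), back-fill from limits, and sum
// the per-container totals into one ResourceList for the Pod.
func podResourceSketch(podSpec corev1.PodSpec) corev1.ResourceList {
	podResource := corev1.ResourceList{}
	for _, container := range podSpec.Containers {
		containerResource := container.Resources.Requests
		if containerResource == nil { // the guard added by this commit
			containerResource = corev1.ResourceList{}
		}
		for name, quantity := range container.Resources.Limits {
			if _, ok := containerResource[name]; !ok {
				containerResource[name] = quantity
			}
		}
		for name, quantity := range containerResource {
			if total, ok := podResource[name]; ok {
				total.Add(quantity)
				podResource[name] = total
			} else {
				podResource[name] = quantity
			}
		}
	}
	return podResource
}

func main() {
	// A limits-only container: the shape that used to crash the operator.
	spec := corev1.PodSpec{
		Containers: []corev1.Container{{
			Name: "ray-head",
			Resources: corev1.ResourceRequirements{
				Limits: corev1.ResourceList{
					corev1.ResourceCPU:    resource.MustParse("1"),
					corev1.ResourceMemory: resource.MustParse("1Gi"),
				},
			},
		}},
	}

	got := podResourceSketch(spec)
	cpu, mem := got[corev1.ResourceCPU], got[corev1.ResourceMemory]
	fmt.Println(cpu.String(), mem.String()) // 1 1Gi
}
```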
