Skip to content

Commit 04f5b71

Browse files
authored
[RayCluster] Toggle usage of deterministic/non-deterministic head pod name with feature flag (#3873)
* feat: use gen name for ray version > 2.48 Signed-off-by: machichima <[email protected]> * feat: head pod name to non deterministic Signed-off-by: machichima <[email protected]> * test: compare head pod name between old and new Signed-off-by: machichima <[email protected]> * feat: add feature flag for deterministic head pod name Signed-off-by: machichima <[email protected]> * feat: use deterministic if feature flag set to true Signed-off-by: machichima <[email protected]> * test: head pod name with/without feature flag == true Signed-off-by: machichima <[email protected]> * test: sep test for head/worker pod name Signed-off-by: machichima <[email protected]> * docs: add feature flag desc to helm chart Signed-off-by: machichima <[email protected]> * fix: feature flag name Signed-off-by: machichima <[email protected]> * refactor: simplify check if the feature flag is set Signed-off-by: machichima <[email protected]> --------- Signed-off-by: machichima <[email protected]>
1 parent 62ad934 commit 04f5b71

File tree

6 files changed

+89
-19
lines changed

6 files changed

+89
-19
lines changed

helm-chart/kuberay-operator/values.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,9 @@ env:
176176
# If set to true, the RayJob CR itself will be deleted if shutdownAfterJobFinishes is set to true. Note that all resources created by the RayJob CR will be deleted, including the K8s Job. Otherwise, only the RayCluster CR will be deleted. Default is false.
177177
# - name: DELETE_RAYJOB_CR_AFTER_JOB_FINISHES
178178
# value: "false"
179+
# If set to true, we will use deterministic name for head pod. Otherwise, the non-deterministic name is used.
180+
# - name: ENABLE_DETERMINISTIC_HEAD_POD_NAME
181+
# value: "false"
179182

180183
# -- Resource requests and limits for containers.
181184
resources:

ray-operator/controllers/ray/common/pod.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,11 @@ func DefaultHeadPodTemplate(ctx context.Context, instance rayv1.RayCluster, head
164164
// headPort is passed into setMissingRayStartParams but unused there for the head pod.
165165
// To mitigate this awkwardness and reduce code redundancy, unify head and worker pod configuration logic.
166166
podTemplate := headSpec.Template
167-
podTemplate.GenerateName = podName
167+
if utils.IsDeterministicHeadPodNameEnabled() {
168+
podTemplate.Name = podName
169+
} else {
170+
podTemplate.GenerateName = podName
171+
}
168172
// Pods created by RayCluster should be restricted to the namespace of the RayCluster.
169173
// This ensures privilege of KubeRay users are contained within the namespace of the RayCluster.
170174
podTemplate.ObjectMeta.Namespace = instance.Namespace

ray-operator/controllers/ray/raycluster_controller.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -980,7 +980,7 @@ func (r *RayClusterReconciler) createWorkerPod(ctx context.Context, instance ray
980980
// Build head instance pod(s).
981981
func (r *RayClusterReconciler) buildHeadPod(ctx context.Context, instance rayv1.RayCluster) corev1.Pod {
982982
logger := ctrl.LoggerFrom(ctx)
983-
podName := utils.PodName(instance.Name, rayv1.HeadNode, true)
983+
podName := utils.PodName(instance.Name, rayv1.HeadNode, !utils.IsDeterministicHeadPodNameEnabled())
984984
fqdnRayIP := utils.GenerateFQDNServiceName(ctx, instance, instance.Namespace) // Fully Qualified Domain Name
985985

986986
// The Ray head port used by workers to connect to the cluster (GCS server port for Ray >= 1.11.0, Redis port for older Ray.)

ray-operator/controllers/ray/utils/constant.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,9 @@ const (
164164
// by default starting with v1.4.0.
165165
ENABLE_LOGIN_SHELL = "ENABLE_LOGIN_SHELL"
166166

167+
// If set to true, we will use deterministic name for head pod. Otherwise, the non-deterministic name is used.
168+
ENABLE_DETERMINISTIC_HEAD_POD_NAME = "ENABLE_DETERMINISTIC_HEAD_POD_NAME"
169+
167170
// Ray core default configurations
168171
DefaultWorkerRayGcsReconnectTimeoutS = "600"
169172

ray-operator/controllers/ray/utils/util.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -712,3 +712,8 @@ func GetContainerCommand(additionalOptions []string) []string {
712712
bashOptionsStr := strings.Join(bashOptions, "")
713713
return []string{"/bin/bash", "-" + bashOptionsStr, "--"}
714714
}
715+
716+
// GetEnableDeterministicHeadName returns true if deterministic head pod name is enabled.
717+
func IsDeterministicHeadPodNameEnabled() bool {
718+
return strings.ToLower(os.Getenv(ENABLE_DETERMINISTIC_HEAD_POD_NAME)) == "true"
719+
}

ray-operator/controllers/ray/utils/util_test.go

Lines changed: 72 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -104,42 +104,97 @@ func TestCheckAllPodsRunning(t *testing.T) {
104104
}
105105
}
106106

107-
func TestPodName(t *testing.T) {
107+
func TestWorkerPodName(t *testing.T) {
108108
tests := []struct {
109109
name string
110110
prefix string
111-
nodeType rayv1.RayNodeType
112111
expected string
113112
}{
114-
{
115-
name: "short cluster name, head pod",
116-
prefix: "ray-cluster-01",
117-
nodeType: rayv1.HeadNode,
118-
expected: "ray-cluster-01-head-",
119-
},
120113
{
121114
name: "short cluster name, worker pod",
122115
prefix: "ray-cluster-group-name-01",
123-
nodeType: rayv1.WorkerNode,
124116
expected: "ray-cluster-group-name-01-worker-",
125117
},
126-
{
127-
name: "long cluster name, head pod",
128-
prefix: "ray-cluster-0000000000000000000000011111111122222233333333333333",
129-
nodeType: rayv1.HeadNode,
130-
expected: "ray-cluster-00000000000000000000000111111111222222-head-",
131-
},
132118
{
133119
name: "long cluster name, worker pod",
134120
prefix: "ray-cluster-0000000000000000000000011111111122222233333333333333-group-name",
135-
nodeType: rayv1.WorkerNode,
136121
expected: "ray-cluster-00000000000000000000000111111111222222-worker-",
137122
},
138123
}
139124

140125
for _, test := range tests {
141126
t.Run(test.name, func(t *testing.T) {
142-
str := PodName(test.prefix, test.nodeType, true)
127+
str := PodName(test.prefix, rayv1.WorkerNode, true)
128+
if str != test.expected {
129+
t.Logf("expected: %q", test.expected)
130+
t.Logf("actual: %q", str)
131+
t.Error("PodName returned an unexpected string")
132+
}
133+
134+
// 63 (max pod name length) - 5 random hexadecimal characters from generateName
135+
if len(str) > 58 {
136+
t.Error("Generated pod name is too long")
137+
}
138+
})
139+
}
140+
}
141+
142+
func TestHeadPodName(t *testing.T) {
143+
defer os.Unsetenv(ENABLE_DETERMINISTIC_HEAD_POD_NAME)
144+
145+
tests := []struct {
146+
name string
147+
prefix string
148+
enableDeterministicHeadPod string
149+
expected string
150+
}{
151+
{
152+
name: "short cluster name, deterministic head pod name",
153+
prefix: "ray-cluster-01",
154+
enableDeterministicHeadPod: "true",
155+
expected: "ray-cluster-01-head",
156+
},
157+
{
158+
name: "short cluster name, non-deterministic head pod name",
159+
prefix: "ray-cluster-01",
160+
enableDeterministicHeadPod: "false",
161+
expected: "ray-cluster-01-head-",
162+
},
163+
{
164+
name: "short cluster name, feature flag not set",
165+
prefix: "ray-cluster-01",
166+
enableDeterministicHeadPod: "unset",
167+
expected: "ray-cluster-01-head-",
168+
},
169+
{
170+
name: "long cluster name, deterministic head pod name",
171+
prefix: "ray-cluster-0000000000000000000000011111111122222233333333333333",
172+
enableDeterministicHeadPod: "true",
173+
expected: "ray-cluster-00000000000000000000000111111111222222-head",
174+
},
175+
{
176+
name: "long cluster name, non-deterministic head pod name",
177+
prefix: "ray-cluster-0000000000000000000000011111111122222233333333333333",
178+
enableDeterministicHeadPod: "false",
179+
expected: "ray-cluster-00000000000000000000000111111111222222-head-",
180+
},
181+
{
182+
name: "long cluster name, feature flag not set",
183+
prefix: "ray-cluster-0000000000000000000000011111111122222233333333333333",
184+
enableDeterministicHeadPod: "unset",
185+
expected: "ray-cluster-00000000000000000000000111111111222222-head-",
186+
},
187+
}
188+
189+
for _, test := range tests {
190+
t.Run(test.name, func(t *testing.T) {
191+
if test.enableDeterministicHeadPod == "unset" {
192+
os.Unsetenv(ENABLE_DETERMINISTIC_HEAD_POD_NAME)
193+
} else {
194+
os.Setenv(ENABLE_DETERMINISTIC_HEAD_POD_NAME, test.enableDeterministicHeadPod)
195+
}
196+
197+
str := PodName(test.prefix, rayv1.HeadNode, !IsDeterministicHeadPodNameEnabled())
143198
if str != test.expected {
144199
t.Logf("expected: %q", test.expected)
145200
t.Logf("actual: %q", str)

0 commit comments

Comments
 (0)