Skip to content

Commit 64da63b

Browse files
aviadshimoni and rueian authored
[RayService] Trim Redis Cleanup job less than 63 chars (#2846)
* redis cleanup
* small comments change
* 1. Trim label values as done here: https://github.com/ray-project/kuberay/blob/master/ray-operator/controllers/ray/common/service.go#L27
  2. Interface function TrimJobName for clarity
  3. Keep job for 3 minutes after completion for debugging
* Update ray-operator/controllers/ray/raycluster_controller.go
  Co-authored-by: Rueian <[email protected]>
  Signed-off-by: Aviad Shimoni <[email protected]>
* Update ray-operator/controllers/ray/raycluster_controller.go
  Co-authored-by: Rueian <[email protected]>
  Signed-off-by: Aviad Shimoni <[email protected]>
* remove empty line
---------
Signed-off-by: Aviad Shimoni <[email protected]>
Co-authored-by: Rueian <[email protected]>
1 parent f123a44 commit 64da63b

File tree

3 files changed

+28
-5
lines changed

3 files changed

+28
-5
lines changed

ray-operator/controllers/ray/raycluster_controller.go

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1143,14 +1143,15 @@ func (r *RayClusterReconciler) buildWorkerPod(ctx context.Context, instance rayv
11431143
func (r *RayClusterReconciler) buildRedisCleanupJob(ctx context.Context, instance rayv1.RayCluster) batchv1.Job {
11441144
logger := ctrl.LoggerFrom(ctx)
11451145

1146+
// Build the head pod
11461147
pod := r.buildHeadPod(ctx, instance)
11471148
pod.Labels[utils.RayNodeTypeLabelKey] = string(rayv1.RedisCleanupNode)
11481149

11491150
// Only keep the Ray container in the Redis cleanup Job.
11501151
pod.Spec.Containers = []corev1.Container{pod.Spec.Containers[utils.RayContainerIndex]}
11511152
pod.Spec.Containers[utils.RayContainerIndex].Command = []string{"/bin/bash", "-lc", "--"}
11521153
pod.Spec.Containers[utils.RayContainerIndex].Args = []string{
1153-
"echo \"To get more information about manually delete the storage namespace in Redis and remove the RayCluster's finalizer, please check https://docs.ray.io/en/master/cluster/kubernetes/user-guides/kuberay-gcs-ft.html for more details.\" && " +
1154+
"echo \"To get more information about manually deleting the storage namespace in Redis and removing the RayCluster's finalizer, please check https://docs.ray.io/en/master/cluster/kubernetes/user-guides/kuberay-gcs-ft.html for more details.\" && " +
11541155
"python -c " +
11551156
"\"from ray._private.gcs_utils import cleanup_redis_storage; " +
11561157
"from urllib.parse import urlparse; " +
@@ -1180,8 +1181,8 @@ func (r *RayClusterReconciler) buildRedisCleanupJob(ctx context.Context, instanc
11801181
Value: "500",
11811182
})
11821183

1183-
// The container's resource consumption remains constant. so hard-coding the resources is acceptable.
1184-
// In addition, avoid using the GPU for the Redis cleanup Job.
1184+
// The container's resource consumption remains constant. Hard-coding the resources is acceptable.
1185+
// Avoid using the GPU for the Redis cleanup Job.
11851186
pod.Spec.Containers[utils.RayContainerIndex].Resources = corev1.ResourceRequirements{
11861187
Limits: corev1.ResourceList{
11871188
corev1.ResourceCPU: resource.MustParse("200m"),
@@ -1196,9 +1197,12 @@ func (r *RayClusterReconciler) buildRedisCleanupJob(ctx context.Context, instanc
11961197
// For Kubernetes Job, the valid values for Pod's `RestartPolicy` are `Never` and `OnFailure`.
11971198
pod.Spec.RestartPolicy = corev1.RestartPolicyNever
11981199

1200+
// Trim the job name to ensure it is within the 63-character limit.
1201+
jobName := utils.TrimJobName(fmt.Sprintf("%s-%s", instance.Name, "redis-cleanup"))
1202+
11991203
redisCleanupJob := batchv1.Job{
12001204
ObjectMeta: metav1.ObjectMeta{
1201-
Name: fmt.Sprintf("%s-%s", instance.Name, "redis-cleanup"),
1205+
Name: jobName,
12021206
Namespace: instance.Namespace,
12031207
Labels: pod.Labels,
12041208
Annotations: pod.Annotations,
@@ -1209,7 +1213,7 @@ func (r *RayClusterReconciler) buildRedisCleanupJob(ctx context.Context, instanc
12091213
ObjectMeta: pod.ObjectMeta,
12101214
Spec: pod.Spec,
12111215
},
1212-
// make this job be best-effort only for 5 minutes.
1216+
// Make this job best-effort only for 5 minutes.
12131217
ActiveDeadlineSeconds: ptr.To[int64](300),
12141218
},
12151219
}

ray-operator/controllers/ray/utils/util.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,11 @@ func CheckName(s string) string {
205205
return s
206206
}
207207

208+
// TrimJobName uses CheckLabel to trim a Kubernetes Job name so that it satisfies the same constraints as a label value.
209+
func TrimJobName(jobName string) string {
210+
return CheckLabel(jobName)
211+
}
212+
208213
// CheckLabel makes sure the label value does not start with punctuation and that the total length is < 63 characters
209214
func CheckLabel(s string) string {
210215
maxLenght := 63

ray-operator/test/e2e/raycluster_gcsft_test.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,20 @@ func TestGcsFaultToleranceOptions(t *testing.T) {
8989
},
9090
createSecret: true,
9191
},
92+
{
93+
name: "Long RayCluster Name",
94+
redisPassword: "",
95+
rayClusterFn: func(namespace string) *rayv1ac.RayClusterApplyConfiguration {
96+
// Intentionally using a long name to test job name trimming
97+
return rayv1ac.RayCluster("raycluster-with-a-very-long-name-exceeding-k8s-limit", namespace).WithSpec(
98+
newRayClusterSpec().WithGcsFaultToleranceOptions(
99+
rayv1ac.GcsFaultToleranceOptions().
100+
WithRedisAddress("redis:6379"),
101+
),
102+
)
103+
},
104+
createSecret: false,
105+
},
92106
}
93107

94108
for _, tc := range testCases {

0 commit comments

Comments
 (0)