Skip to content

Commit af02d7b

Browse files
committed
RHOAIENG-30720: Remove GCS FT for Lifecycled RayClusters
1 parent 434027d commit af02d7b

File tree

2 files changed

+2
-175
lines changed

2 files changed

+2
-175
lines changed

src/codeflare_sdk/ray/rayjobs/config.py

Lines changed: 2 additions & 154 deletions
Original file line number | Diff line number | Diff line change
@@ -139,20 +139,12 @@ class ManagedClusterConfig:
139139
A list of V1Volume objects to add to the Cluster
140140
volume_mounts:
141141
A list of V1VolumeMount objects to add to the Cluster
142-
enable_gcs_ft:
143-
A boolean indicating whether to enable GCS fault tolerance.
144-
redis_address:
145-
The address of the Redis server to use for GCS fault tolerance, required when enable_gcs_ft is True.
146-
redis_password_secret:
147-
Kubernetes secret reference containing Redis password. ex: {"name": "secret-name", "key": "password-key"}
148-
external_storage_namespace:
149-
The storage namespace to use for GCS fault tolerance. By default, KubeRay sets it to the UID of RayCluster.
150142
"""
151143

152144
head_cpu_requests: Union[int, str] = 2
153145
head_cpu_limits: Union[int, str] = 2
154-
head_memory_requests: Union[int, str] = 8
155-
head_memory_limits: Union[int, str] = 8
146+
head_memory_requests: Union[int, str] = 2
147+
head_memory_limits: Union[int, str] = 2
156148
head_accelerators: Dict[str, Union[str, int]] = field(default_factory=dict)
157149
head_tolerations: Optional[List[V1Toleration]] = None
158150
worker_cpu_requests: Union[int, str] = 1
@@ -173,35 +165,10 @@ class ManagedClusterConfig:
173165
annotations: Dict[str, str] = field(default_factory=dict)
174166
volumes: list[V1Volume] = field(default_factory=list)
175167
volume_mounts: list[V1VolumeMount] = field(default_factory=list)
176-
enable_gcs_ft: bool = False
177-
redis_address: Optional[str] = None
178-
redis_password_secret: Optional[Dict[str, str]] = None
179-
external_storage_namespace: Optional[str] = None
180168

181169
def __post_init__(self):
182170
self.envs["RAY_USAGE_STATS_ENABLED"] = "0"
183171

184-
if self.enable_gcs_ft:
185-
if not self.redis_address:
186-
raise ValueError(
187-
"redis_address must be provided when enable_gcs_ft is True"
188-
)
189-
190-
if self.redis_password_secret and not isinstance(
191-
self.redis_password_secret, dict
192-
):
193-
raise ValueError(
194-
"redis_password_secret must be a dictionary with 'name' and 'key' fields"
195-
)
196-
197-
if self.redis_password_secret and (
198-
"name" not in self.redis_password_secret
199-
or "key" not in self.redis_password_secret
200-
):
201-
raise ValueError(
202-
"redis_password_secret must contain both 'name' and 'key' fields"
203-
)
204-
205172
self._validate_types()
206173
self._memory_to_string()
207174
self._validate_gpu_config(self.head_accelerators)
@@ -286,11 +253,6 @@ def build_ray_cluster_spec(self, cluster_name: str) -> Dict[str, Any]:
286253
"workerGroupSpecs": [self._build_worker_group_spec(cluster_name)],
287254
}
288255

289-
# Add GCS fault tolerance if enabled
290-
if self.enable_gcs_ft:
291-
gcs_ft_options = self._build_gcs_ft_options()
292-
ray_cluster_spec["gcsFaultToleranceOptions"] = gcs_ft_options
293-
294256
return ray_cluster_spec
295257

296258
def _build_head_group_spec(self) -> Dict[str, Any]:
@@ -493,117 +455,3 @@ def _generate_volumes(self) -> list:
493455
def _build_env_vars(self) -> list:
494456
"""Build environment variables list."""
495457
return [V1EnvVar(name=key, value=value) for key, value in self.envs.items()]
496-
497-
def _build_gcs_ft_options(self) -> Dict[str, Any]:
498-
"""Build GCS fault tolerance options."""
499-
gcs_ft_options = {"redisAddress": self.redis_address}
500-
501-
if (
502-
hasattr(self, "external_storage_namespace")
503-
and self.external_storage_namespace
504-
):
505-
gcs_ft_options["externalStorageNamespace"] = self.external_storage_namespace
506-
507-
if hasattr(self, "redis_password_secret") and self.redis_password_secret:
508-
gcs_ft_options["redisPassword"] = {
509-
"valueFrom": {
510-
"secretKeyRef": {
511-
"name": self.redis_password_secret["name"],
512-
"key": self.redis_password_secret["key"],
513-
}
514-
}
515-
}
516-
517-
return gcs_ft_options
518-
519-
def add_script_volumes(
520-
self, configmap_name: str, mount_path: str = "/home/ray/scripts"
521-
):
522-
"""
523-
Add script volume and mount references to cluster configuration.
524-
525-
Args:
526-
configmap_name: Name of the ConfigMap containing scripts
527-
mount_path: Where to mount scripts in containers (default: /home/ray/scripts)
528-
"""
529-
# Check if script volume already exists
530-
volume_name = "ray-job-scripts"
531-
existing_volume = next(
532-
(v for v in self.volumes if getattr(v, "name", None) == volume_name), None
533-
)
534-
if existing_volume:
535-
logger.debug(f"Script volume '{volume_name}' already exists, skipping...")
536-
return
537-
538-
# Check if script mount already exists
539-
existing_mount = next(
540-
(m for m in self.volume_mounts if getattr(m, "name", None) == volume_name),
541-
None,
542-
)
543-
if existing_mount:
544-
logger.debug(
545-
f"Script volume mount '{volume_name}' already exists, skipping..."
546-
)
547-
return
548-
549-
# Add script volume to cluster configuration
550-
script_volume = V1Volume(
551-
name=volume_name, config_map=V1ConfigMapVolumeSource(name=configmap_name)
552-
)
553-
self.volumes.append(script_volume)
554-
555-
# Add script volume mount to cluster configuration
556-
script_mount = V1VolumeMount(name=volume_name, mount_path=mount_path)
557-
self.volume_mounts.append(script_mount)
558-
559-
logger.info(
560-
f"Added script volume '{configmap_name}' to cluster config: mount_path={mount_path}"
561-
)
562-
563-
def validate_configmap_size(self, scripts: Dict[str, str]) -> None:
564-
total_size = sum(len(content.encode("utf-8")) for content in scripts.values())
565-
if total_size > 1024 * 1024: # 1MB
566-
raise ValueError(
567-
f"ConfigMap size exceeds 1MB limit. Total size: {total_size} bytes"
568-
)
569-
570-
def build_script_configmap_spec(
571-
self, job_name: str, namespace: str, scripts: Dict[str, str]
572-
) -> Dict[str, Any]:
573-
"""
574-
Build ConfigMap specification for scripts
575-
576-
Args:
577-
job_name: Name of the RayJob (used for ConfigMap naming)
578-
namespace: Kubernetes namespace
579-
scripts: Dictionary of script_name -> script_content
580-
581-
Returns:
582-
Dict: ConfigMap specification ready for Kubernetes API
583-
"""
584-
configmap_name = f"{job_name}-scripts"
585-
return {
586-
"apiVersion": "v1",
587-
"kind": "ConfigMap",
588-
"metadata": {"name": configmap_name, "namespace": namespace},
589-
"data": scripts,
590-
}
591-
592-
def build_script_volume_specs(
593-
self, configmap_name: str, mount_path: str = "/home/ray/scripts"
594-
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
595-
"""
596-
Build volume and mount specifications for scripts
597-
598-
Args:
599-
configmap_name: Name of the ConfigMap containing scripts
600-
mount_path: Where to mount scripts in containers
601-
602-
Returns:
603-
Tuple of (volume_spec, mount_spec) as dictionaries
604-
"""
605-
volume_spec = {"name": "ray-job-scripts", "configMap": {"name": configmap_name}}
606-
607-
mount_spec = {"name": "ray-job-scripts", "mountPath": mount_path}
608-
609-
return volume_spec, mount_spec

src/codeflare_sdk/ray/rayjobs/test_rayjob.py

Lines changed: 0 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -1007,27 +1007,6 @@ def test_rayjob_user_override_shutdown_behavior(mocker):
10071007
assert rayjob_override_priority.shutdown_after_job_finishes is True
10081008

10091009

1010-
def test_build_ray_cluster_spec_with_gcs_ft(mocker):
1011-
"""Test build_ray_cluster_spec with GCS fault tolerance enabled."""
1012-
from codeflare_sdk.ray.rayjobs.config import ManagedClusterConfig
1013-
1014-
# Create a test cluster config with GCS FT enabled
1015-
cluster_config = ManagedClusterConfig(
1016-
enable_gcs_ft=True,
1017-
redis_address="redis://redis-service:6379",
1018-
external_storage_namespace="storage-ns",
1019-
)
1020-
1021-
# Build the spec using the method on the cluster config
1022-
spec = cluster_config.build_ray_cluster_spec("test-cluster")
1023-
1024-
# Verify GCS fault tolerance options
1025-
assert "gcsFaultToleranceOptions" in spec
1026-
gcs_ft = spec["gcsFaultToleranceOptions"]
1027-
assert gcs_ft["redisAddress"] == "redis://redis-service:6379"
1028-
assert gcs_ft["externalStorageNamespace"] == "storage-ns"
1029-
1030-
10311010
class TestRayVersionValidation:
10321011
"""Test Ray version validation in RayJob."""
10331012

0 commit comments

Comments
 (0)