Skip to content

Commit 641653b

Browse files
committed
RHOAIENG-30720: Remove GCS FT for Lifecycled RayClusters
1 parent 3489a6b commit 641653b

File tree

2 files changed: +2 additions, -83 deletions

src/codeflare_sdk/ray/rayjobs/config.py

Lines changed: 2 additions & 62 deletions
Original file line number | Diff line number | Diff line change
@@ -139,20 +139,12 @@ class ManagedClusterConfig:
139139
A list of V1Volume objects to add to the Cluster
140140
volume_mounts:
141141
A list of V1VolumeMount objects to add to the Cluster
142-
enable_gcs_ft:
143-
A boolean indicating whether to enable GCS fault tolerance.
144-
redis_address:
145-
The address of the Redis server to use for GCS fault tolerance, required when enable_gcs_ft is True.
146-
redis_password_secret:
147-
Kubernetes secret reference containing Redis password. ex: {"name": "secret-name", "key": "password-key"}
148-
external_storage_namespace:
149-
The storage namespace to use for GCS fault tolerance. By default, KubeRay sets it to the UID of RayCluster.
150142
"""
151143

152144
head_cpu_requests: Union[int, str] = 2
153145
head_cpu_limits: Union[int, str] = 2
154-
head_memory_requests: Union[int, str] = 8
155-
head_memory_limits: Union[int, str] = 8
146+
head_memory_requests: Union[int, str] = 2
147+
head_memory_limits: Union[int, str] = 2
156148
head_accelerators: Dict[str, Union[str, int]] = field(default_factory=dict)
157149
head_tolerations: Optional[List[V1Toleration]] = None
158150
worker_cpu_requests: Union[int, str] = 1
@@ -173,35 +165,10 @@ class ManagedClusterConfig:
173165
annotations: Dict[str, str] = field(default_factory=dict)
174166
volumes: list[V1Volume] = field(default_factory=list)
175167
volume_mounts: list[V1VolumeMount] = field(default_factory=list)
176-
enable_gcs_ft: bool = False
177-
redis_address: Optional[str] = None
178-
redis_password_secret: Optional[Dict[str, str]] = None
179-
external_storage_namespace: Optional[str] = None
180168

181169
def __post_init__(self):
182170
self.envs["RAY_USAGE_STATS_ENABLED"] = "0"
183171

184-
if self.enable_gcs_ft:
185-
if not self.redis_address:
186-
raise ValueError(
187-
"redis_address must be provided when enable_gcs_ft is True"
188-
)
189-
190-
if self.redis_password_secret and not isinstance(
191-
self.redis_password_secret, dict
192-
):
193-
raise ValueError(
194-
"redis_password_secret must be a dictionary with 'name' and 'key' fields"
195-
)
196-
197-
if self.redis_password_secret and (
198-
"name" not in self.redis_password_secret
199-
or "key" not in self.redis_password_secret
200-
):
201-
raise ValueError(
202-
"redis_password_secret must contain both 'name' and 'key' fields"
203-
)
204-
205172
self._validate_types()
206173
self._memory_to_string()
207174
self._validate_gpu_config(self.head_accelerators)
@@ -286,11 +253,6 @@ def build_ray_cluster_spec(self, cluster_name: str) -> Dict[str, Any]:
286253
"workerGroupSpecs": [self._build_worker_group_spec(cluster_name)],
287254
}
288255

289-
# Add GCS fault tolerance if enabled
290-
if self.enable_gcs_ft:
291-
gcs_ft_options = self._build_gcs_ft_options()
292-
ray_cluster_spec["gcsFaultToleranceOptions"] = gcs_ft_options
293-
294256
return ray_cluster_spec
295257

296258
def _build_head_group_spec(self) -> Dict[str, Any]:
@@ -493,25 +455,3 @@ def _generate_volumes(self) -> list:
493455
def _build_env_vars(self) -> list:
494456
"""Build environment variables list."""
495457
return [V1EnvVar(name=key, value=value) for key, value in self.envs.items()]
496-
497-
def _build_gcs_ft_options(self) -> Dict[str, Any]:
498-
"""Build GCS fault tolerance options."""
499-
gcs_ft_options = {"redisAddress": self.redis_address}
500-
501-
if (
502-
hasattr(self, "external_storage_namespace")
503-
and self.external_storage_namespace
504-
):
505-
gcs_ft_options["externalStorageNamespace"] = self.external_storage_namespace
506-
507-
if hasattr(self, "redis_password_secret") and self.redis_password_secret:
508-
gcs_ft_options["redisPassword"] = {
509-
"valueFrom": {
510-
"secretKeyRef": {
511-
"name": self.redis_password_secret["name"],
512-
"key": self.redis_password_secret["key"],
513-
}
514-
}
515-
}
516-
517-
return gcs_ft_options

src/codeflare_sdk/ray/rayjobs/test_rayjob.py

Lines changed: 0 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -974,27 +974,6 @@ def test_rayjob_user_override_shutdown_behavior(mocker):
974974
assert rayjob_override_priority.shutdown_after_job_finishes is True
975975

976976

977-
def test_build_ray_cluster_spec_with_gcs_ft(mocker):
978-
"""Test build_ray_cluster_spec with GCS fault tolerance enabled."""
979-
from codeflare_sdk.ray.rayjobs.config import ManagedClusterConfig
980-
981-
# Create a test cluster config with GCS FT enabled
982-
cluster_config = ManagedClusterConfig(
983-
enable_gcs_ft=True,
984-
redis_address="redis://redis-service:6379",
985-
external_storage_namespace="storage-ns",
986-
)
987-
988-
# Build the spec using the method on the cluster config
989-
spec = cluster_config.build_ray_cluster_spec("test-cluster")
990-
991-
# Verify GCS fault tolerance options
992-
assert "gcsFaultToleranceOptions" in spec
993-
gcs_ft = spec["gcsFaultToleranceOptions"]
994-
assert gcs_ft["redisAddress"] == "redis://redis-service:6379"
995-
assert gcs_ft["externalStorageNamespace"] == "storage-ns"
996-
997-
998977
class TestRayVersionValidation:
999978
"""Test Ray version validation in RayJob."""
1000979

0 commit comments

Comments
 (0)