Skip to content

Commit f4913e9

Browse files
committed
RHOAIENG-30720: Remove GCS FT for Lifecycled RayClusters
1 parent 434027d commit f4913e9

File tree

4 files changed

+8 -88 lines changed

4 files changed

+8 -88 lines changed

pyproject.toml

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
2-
name = "codeflare-sdk"
3-
version = "0.0.0-dev"
2+
name = "codeflare-sdk-dev"
3+
version = "0.0.1.dev1"
44
description = "Python SDK for codeflare client"
55

66
license = "Apache-2.0"
@@ -19,6 +19,8 @@ homepage = "https://github.com/project-codeflare/codeflare-sdk"
1919

2020
keywords = ['codeflare', 'python', 'sdk', 'client', 'batch', 'scale']
2121

22+
packages = [{include = "codeflare_sdk", from = "src"}]
23+
2224
[tool.poetry.dependencies]
2325
python = "^3.11"
2426
openshift-client = "1.0.18"
@@ -29,7 +31,8 @@ cryptography = "43.0.3"
2931
executing = "1.2.0"
3032
pydantic = "< 2"
3133
ipywidgets = "8.1.2"
32-
python-client = { git = "https://github.com/ray-project/kuberay.git", subdirectory = "clients/python-client", rev = "d1e750d9beac612ad455b951c1a789f971409ab3" }
34+
# Temporarily commented out for test.pypi publishing (git dependencies not allowed)
35+
# python-client = { git = "https://github.com/ray-project/kuberay.git", subdirectory = "clients/python-client", rev = "4ff766f695cb66c5538072a07723b5e1df3deaa9" }
3336

3437
[[tool.poetry.source]]
3538
name = "pypi"

src/codeflare_sdk/ray/rayjobs/config.py

Lines changed: 2 additions & 62 deletions
Original file line number | Diff line number | Diff line change
@@ -139,20 +139,12 @@ class ManagedClusterConfig:
139139
A list of V1Volume objects to add to the Cluster
140140
volume_mounts:
141141
A list of V1VolumeMount objects to add to the Cluster
142-
enable_gcs_ft:
143-
A boolean indicating whether to enable GCS fault tolerance.
144-
redis_address:
145-
The address of the Redis server to use for GCS fault tolerance, required when enable_gcs_ft is True.
146-
redis_password_secret:
147-
Kubernetes secret reference containing Redis password. ex: {"name": "secret-name", "key": "password-key"}
148-
external_storage_namespace:
149-
The storage namespace to use for GCS fault tolerance. By default, KubeRay sets it to the UID of RayCluster.
150142
"""
151143

152144
head_cpu_requests: Union[int, str] = 2
153145
head_cpu_limits: Union[int, str] = 2
154-
head_memory_requests: Union[int, str] = 8
155-
head_memory_limits: Union[int, str] = 8
146+
head_memory_requests: Union[int, str] = 2
147+
head_memory_limits: Union[int, str] = 2
156148
head_accelerators: Dict[str, Union[str, int]] = field(default_factory=dict)
157149
head_tolerations: Optional[List[V1Toleration]] = None
158150
worker_cpu_requests: Union[int, str] = 1
@@ -173,35 +165,10 @@ class ManagedClusterConfig:
173165
annotations: Dict[str, str] = field(default_factory=dict)
174166
volumes: list[V1Volume] = field(default_factory=list)
175167
volume_mounts: list[V1VolumeMount] = field(default_factory=list)
176-
enable_gcs_ft: bool = False
177-
redis_address: Optional[str] = None
178-
redis_password_secret: Optional[Dict[str, str]] = None
179-
external_storage_namespace: Optional[str] = None
180168

181169
def __post_init__(self):
182170
self.envs["RAY_USAGE_STATS_ENABLED"] = "0"
183171

184-
if self.enable_gcs_ft:
185-
if not self.redis_address:
186-
raise ValueError(
187-
"redis_address must be provided when enable_gcs_ft is True"
188-
)
189-
190-
if self.redis_password_secret and not isinstance(
191-
self.redis_password_secret, dict
192-
):
193-
raise ValueError(
194-
"redis_password_secret must be a dictionary with 'name' and 'key' fields"
195-
)
196-
197-
if self.redis_password_secret and (
198-
"name" not in self.redis_password_secret
199-
or "key" not in self.redis_password_secret
200-
):
201-
raise ValueError(
202-
"redis_password_secret must contain both 'name' and 'key' fields"
203-
)
204-
205172
self._validate_types()
206173
self._memory_to_string()
207174
self._validate_gpu_config(self.head_accelerators)
@@ -286,11 +253,6 @@ def build_ray_cluster_spec(self, cluster_name: str) -> Dict[str, Any]:
286253
"workerGroupSpecs": [self._build_worker_group_spec(cluster_name)],
287254
}
288255

289-
# Add GCS fault tolerance if enabled
290-
if self.enable_gcs_ft:
291-
gcs_ft_options = self._build_gcs_ft_options()
292-
ray_cluster_spec["gcsFaultToleranceOptions"] = gcs_ft_options
293-
294256
return ray_cluster_spec
295257

296258
def _build_head_group_spec(self) -> Dict[str, Any]:
@@ -494,28 +456,6 @@ def _build_env_vars(self) -> list:
494456
"""Build environment variables list."""
495457
return [V1EnvVar(name=key, value=value) for key, value in self.envs.items()]
496458

497-
def _build_gcs_ft_options(self) -> Dict[str, Any]:
498-
"""Build GCS fault tolerance options."""
499-
gcs_ft_options = {"redisAddress": self.redis_address}
500-
501-
if (
502-
hasattr(self, "external_storage_namespace")
503-
and self.external_storage_namespace
504-
):
505-
gcs_ft_options["externalStorageNamespace"] = self.external_storage_namespace
506-
507-
if hasattr(self, "redis_password_secret") and self.redis_password_secret:
508-
gcs_ft_options["redisPassword"] = {
509-
"valueFrom": {
510-
"secretKeyRef": {
511-
"name": self.redis_password_secret["name"],
512-
"key": self.redis_password_secret["key"],
513-
}
514-
}
515-
}
516-
517-
return gcs_ft_options
518-
519459
def add_script_volumes(
520460
self, configmap_name: str, mount_path: str = "/home/ray/scripts"
521461
):

src/codeflare_sdk/ray/rayjobs/rayjob.py

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -541,8 +541,6 @@ def _update_existing_cluster_for_scripts(
541541
config_builder: ManagedClusterConfig instance for building specs
542542
"""
543543

544-
# Get existing RayCluster
545-
api_instance = client.CustomObjectsApi(get_api_client())
546544
try:
547545
ray_cluster = self._cluster_api.get_ray_cluster(
548546
name=self.cluster_name,

src/codeflare_sdk/ray/rayjobs/test_rayjob.py

Lines changed: 0 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -1007,27 +1007,6 @@ def test_rayjob_user_override_shutdown_behavior(mocker):
10071007
assert rayjob_override_priority.shutdown_after_job_finishes is True
10081008

10091009

1010-
def test_build_ray_cluster_spec_with_gcs_ft(mocker):
1011-
"""Test build_ray_cluster_spec with GCS fault tolerance enabled."""
1012-
from codeflare_sdk.ray.rayjobs.config import ManagedClusterConfig
1013-
1014-
# Create a test cluster config with GCS FT enabled
1015-
cluster_config = ManagedClusterConfig(
1016-
enable_gcs_ft=True,
1017-
redis_address="redis://redis-service:6379",
1018-
external_storage_namespace="storage-ns",
1019-
)
1020-
1021-
# Build the spec using the method on the cluster config
1022-
spec = cluster_config.build_ray_cluster_spec("test-cluster")
1023-
1024-
# Verify GCS fault tolerance options
1025-
assert "gcsFaultToleranceOptions" in spec
1026-
gcs_ft = spec["gcsFaultToleranceOptions"]
1027-
assert gcs_ft["redisAddress"] == "redis://redis-service:6379"
1028-
assert gcs_ft["externalStorageNamespace"] == "storage-ns"
1029-
1030-
10311010
class TestRayVersionValidation:
10321011
"""Test Ray version validation in RayJob."""
10331012

0 commit comments

Comments (0)