project-codeflare
diff --git a/‎src/codeflare_sdk/ray/rayjobs/__init__.py‎
Lines changed: 1 addition & 0 deletions b/‎src/codeflare_sdk/ray/rayjobs/__init__.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/codeflare_sdk/ray/rayjobs/cluster_config.py‎
Lines changed: 286 additions & 0 deletions b/‎src/codeflare_sdk/ray/rayjobs/cluster_config.py‎
Lines changed: 286 additions & 0 deletions
diff --git a/‎src/codeflare_sdk/ray/rayjobs/config.py‎
Lines changed: 69 additions & 1 deletion b/‎src/codeflare_sdk/ray/rayjobs/config.py‎
Lines changed: 69 additions & 1 deletion
@@ -1,2 +1,3 @@
 from .rayjob import RayJob, RayJobClusterConfig
 from .status import RayJobDeploymentStatus, CodeflareRayJobStatus, RayJobInfo
+from .cluster_config import RayJobClusterConfig
@@ -0,0 +1,286 @@
+# Copyright 2025 IBM, Red Hat
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+RayJobClusterConfig provides a focused configuration class for creating Ray clusters
+as part of RayJob submissions. This class maps directly to the KubeRay RayClusterSpec
+structure and removes legacy fields that aren't relevant for RayJob-based cluster creation.
+"""
+
+from dataclasses import dataclass, field
+from typing import Dict, Optional, Union
+
+from codeflare_sdk.common.utils.constants import CUDA_RUNTIME_IMAGE
+
+
+@dataclass
+class RayJobClusterConfig:
+    """
+    Configuration for the RayCluster created as part of a RayJob submission.
+
+    ``to_dict()`` renders this configuration as a plain dictionary holding
+    ``rayVersion`` and ``headGroupSpec``, plus ``workerGroupSpecs`` when
+    ``num_workers > 0`` and ``gcsFaultToleranceOptions`` when ``enable_gcs_ft``
+    is set.
+
+    Args:
+        num_workers: Number of worker replicas; must not be negative. With 0,
+            no worker group is emitted by ``to_dict()``.
+        head_cpu: CPU requests/limits for head node (e.g., "2" or 2).
+            Integers are converted to strings.
+        head_memory: Memory requests/limits for head node (e.g., "8Gi" or 8).
+            Integers are interpreted as Gi.
+        gpu: GPU resources for head node (e.g., {"nvidia.com/gpu": 1}).
+            Counts must be non-negative ints or ASCII digit strings.
+        worker_cpu: CPU requests/limits for each worker (e.g., "1" or 1).
+            Integers are converted to strings.
+        worker_memory: Memory requests/limits for each worker (e.g., "4Gi" or 4).
+            Integers are interpreted as Gi.
+        worker_gpu: GPU resources for each worker (e.g., {"nvidia.com/gpu": 1}).
+            Same rules as ``gpu``.
+        image: Container image for all nodes (defaults to CUDA_RUNTIME_IMAGE)
+
+        # Advanced options (optional)
+        envs: Additional environment variables to set on all pods. The mapping
+            is copied on construction, so the caller's dict is never modified.
+        enable_gcs_ft: Whether to enable GCS fault tolerance
+        redis_address: Redis address for GCS fault tolerance (required if enable_gcs_ft=True)
+
+        # Lifecycle management
+        shutdown_after_job_finishes: Whether to automatically cleanup the cluster after job completion
+        ttl_seconds_after_finished: Seconds to wait before cleanup after job finishes
+        active_deadline_seconds: Maximum time the job can run before being terminated
+
+    Note:
+        The three lifecycle fields are stored on the instance but are not part
+        of the dictionary returned by ``to_dict()``.
+
+    Raises:
+        ValueError: If ``enable_gcs_ft`` is set without ``redis_address``, if
+            ``num_workers`` is negative, or if a GPU count is invalid.
+    """
+
+    num_workers: int = 1
+    head_cpu: Union[int, str] = 2
+    head_memory: Union[int, str] = 8
+    gpu: Dict[str, Union[int, str]] = field(default_factory=dict)
+    worker_cpu: Union[int, str] = 1
+    worker_memory: Union[int, str] = 4
+    worker_gpu: Dict[str, Union[int, str]] = field(default_factory=dict)
+    image: str = CUDA_RUNTIME_IMAGE  # Use CUDA-enabled Ray image
+
+    # Advanced options
+    envs: Dict[str, str] = field(default_factory=dict)
+    enable_gcs_ft: bool = False
+    redis_address: Optional[str] = None
+
+    # Lifecycle management
+    shutdown_after_job_finishes: bool = True
+    ttl_seconds_after_finished: int = 0
+    active_deadline_seconds: Optional[int] = None
+
+    def __post_init__(self):
+        """Validate the configuration, then derive the normalized state."""
+        self._validate_config()
+        # _setup_gcs_ft() adds entries to envs; work on a private copy so a
+        # dict passed in by the caller is left untouched (None means "none").
+        self.envs = dict(self.envs or {})
+        self._normalize_resources()
+        self._setup_gcs_ft()
+        self._setup_default_storage()
+        self._setup_default_labels()
+        self._setup_default_ray_params()
+
+    def _validate_gpu_resources(self):
+        """
+        Validate the head (``gpu``) and worker (``worker_gpu``) GPU mappings.
+
+        Raises:
+            ValueError: If a count is not an int or str, is a bool, is a
+                negative int, or is a string that is not made of ASCII digits.
+        """
+        for resources in (self.gpu, self.worker_gpu):
+            for gpu_type, gpu_count in resources.items():
+                # bool is a subclass of int, so it has to be excluded explicitly;
+                # it would otherwise be rendered as "True"/"False".
+                if isinstance(gpu_count, bool) or not isinstance(
+                    gpu_count, (int, str)
+                ):
+                    raise ValueError(
+                        f"GPU count for {gpu_type} must be int or str, got {type(gpu_count)}"
+                    )
+                if isinstance(gpu_count, str):
+                    # str.isdigit() alone also accepts characters such as "²".
+                    if not (gpu_count.isascii() and gpu_count.isdigit()):
+                        raise ValueError(
+                            f"GPU count string for {gpu_type} must be numeric, got {gpu_count}"
+                        )
+                elif gpu_count < 0:
+                    raise ValueError(
+                        f"GPU count for {gpu_type} cannot be negative, got {gpu_count}"
+                    )
+
+    def _validate_config(self):
+        """
+        Validate configuration parameters.
+
+        Raises:
+            ValueError: If GCS fault tolerance is enabled without a Redis
+                address, if ``num_workers`` is negative, or if the GPU
+                mappings are invalid.
+        """
+        # For GCS fault tolerance only redis_address is required
+        if self.enable_gcs_ft and not self.redis_address:
+            raise ValueError(
+                "redis_address must be provided when enable_gcs_ft is True"
+            )
+
+        if self.num_workers < 0:
+            raise ValueError("num_workers cannot be negative")
+
+        self._validate_gpu_resources()
+
+    def _normalize_resources(self):
+        """Convert integer CPU values to strings and integer memory values to "<n>Gi"."""
+        # Convert head resources
+        if isinstance(self.head_cpu, int):
+            self.head_cpu = str(self.head_cpu)
+        if isinstance(self.head_memory, int):
+            self.head_memory = f"{self.head_memory}Gi"
+
+        # Convert worker resources
+        if isinstance(self.worker_cpu, int):
+            self.worker_cpu = str(self.worker_cpu)
+        if isinstance(self.worker_memory, int):
+            self.worker_memory = f"{self.worker_memory}Gi"
+
+    def _setup_gcs_ft(self):
+        """Add the GCS fault tolerance environment variables to ``envs`` when enabled."""
+        if not self.enable_gcs_ft:
+            return
+
+        self.envs["RAY_GCS_FT_ENABLED"] = "true"
+        # NOTE(review): to_dict() also emits gcsFaultToleranceOptions.redisAddress;
+        # confirm KubeRay accepts RAY_REDIS_ADDRESS being set alongside it.
+        if self.redis_address:
+            self.envs["RAY_REDIS_ADDRESS"] = self.redis_address
+
+    def _setup_default_storage(self):
+        """Placeholder hook: no volumes or mounts are configured."""
+
+    def _setup_default_labels(self):
+        """Placeholder hook: no labels are configured."""
+
+    def _setup_default_ray_params(self):
+        """Store the default ``rayStartParams`` used for head and worker groups."""
+        # These are copied into the specs built by to_dict()
+        self._default_head_params = {
+            "dashboard-host": "0.0.0.0",
+            "dashboard-port": "8265",
+            "block": "true",
+        }
+
+        self._default_worker_params = {
+            "block": "true",
+        }
+
+    def to_dict(self) -> Dict:
+        """
+        Convert the configuration to a dictionary that can be used
+        to create the RayClusterSpec for a RayJob.
+
+        Returns:
+            Dictionary with ``rayVersion`` and ``headGroupSpec``, plus
+            ``workerGroupSpecs`` when ``num_workers > 0`` and
+            ``gcsFaultToleranceOptions`` when ``enable_gcs_ft`` is set.
+        """
+        config_dict = {
+            # NOTE(review): hard-coded; confirm it matches the Ray version in `image`.
+            "rayVersion": "2.9.0",
+            "headGroupSpec": self._build_head_group_spec(),
+        }
+
+        if self.num_workers > 0:
+            config_dict["workerGroupSpecs"] = [self._build_worker_group_spec()]
+
+        if self.enable_gcs_ft:
+            config_dict["gcsFaultToleranceOptions"] = self._build_gcs_ft_options()
+
+        return config_dict
+
+    def _build_head_group_spec(self) -> Dict:
+        """Build the HeadGroupSpec for the RayCluster."""
+        return {
+            "template": self._build_pod_template(
+                cpu=self.head_cpu,
+                memory=self.head_memory,
+                gpu=self.gpu,
+                image=self.image,
+                is_head=True,
+            ),
+            # Copy so that edits to the returned spec cannot alter the defaults
+            "rayStartParams": dict(self._default_head_params),
+            "serviceType": "ClusterIP",  # Always use ClusterIP
+        }
+
+    def _build_worker_group_spec(self) -> Dict:
+        """Build the WorkerGroupSpec for the RayCluster."""
+        return {
+            "groupName": "default-worker-group",
+            "replicas": self.num_workers,
+            "template": self._build_pod_template(
+                cpu=self.worker_cpu,
+                memory=self.worker_memory,
+                gpu=self.worker_gpu,
+                image=self.image,
+                is_head=False,
+            ),
+            # Copy so that edits to the returned spec cannot alter the defaults
+            "rayStartParams": dict(self._default_worker_params),
+        }
+
+    def _build_pod_template(
+        self,
+        cpu: str,
+        memory: str,
+        gpu: Dict[str, Union[int, str]],
+        image: str,
+        is_head: bool,
+    ) -> Dict:
+        """
+        Build a pod template specification with a single Ray container.
+
+        Args:
+            cpu: CPU quantity used for both requests and limits.
+            memory: Memory quantity used for both requests and limits.
+            gpu: Mapping of GPU resource name to count; each entry is added to
+                both requests and limits with the count rendered as a string.
+            image: Container image.
+            is_head: True for the head pod, which is named "ray-head" and also
+                gets the gcs, dashboard and client container ports.
+
+        Returns:
+            Dictionary of the form ``{"spec": {"containers": [...], "restartPolicy": "Never"}}``.
+        """
+        # Requests and limits are always identical
+        resources = {
+            "requests": {"cpu": cpu, "memory": memory},
+            "limits": {"cpu": cpu, "memory": memory},
+        }
+
+        # Add GPU resources if specified
+        for gpu_type, gpu_count in gpu.items():
+            resources["requests"][gpu_type] = str(gpu_count)
+            resources["limits"][gpu_type] = str(gpu_count)
+
+        container = {
+            "name": "ray-head" if is_head else "ray-worker",
+            "image": image,
+            "imagePullPolicy": "IfNotPresent",
+            "resources": resources,
+            "env": [{"name": k, "value": v} for k, v in self.envs.items()],
+        }
+
+        # Only the head container declares ports
+        if is_head:
+            container["ports"] = [
+                {"name": "gcs", "containerPort": 6379},
+                {"name": "dashboard", "containerPort": 8265},
+                {"name": "client", "containerPort": 10001},
+            ]
+
+        # Head and worker containers share the same preStop hook
+        container["lifecycle"] = {
+            "preStop": {"exec": {"command": ["/bin/sh", "-c", "ray stop"]}}
+        }
+
+        return {
+            "spec": {
+                "containers": [container],
+                "restartPolicy": "Never",  # RayJobs manage lifecycle, so Never is appropriate
+            }
+        }
+
+    def _build_gcs_ft_options(self) -> Dict:
+        """Build GCS fault tolerance options for the RayCluster."""
+        return {
+            "redisAddress": self.redis_address,
+        }
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 """
-The config sub-module contains the definition of the RayJobClusterConfigV2 dataclass,
+The config sub-module contains the definition of the RayJobClusterConfig dataclass,
 which is used to specify resource requirements and other details when creating a
 Cluster object.
 """
@@ -139,6 +139,16 @@ class RayJobClusterConfig:
             A list of V1Volume objects to add to the Cluster
         volume_mounts:
             A list of V1VolumeMount objects to add to the Cluster
+        enable_gcs_ft:
+            A boolean indicating whether to enable GCS fault tolerance.
+        enable_usage_stats:
+            A boolean indicating whether to capture and send Ray usage stats externally.
+        redis_address:
+            The address of the Redis server to use for GCS fault tolerance, required when enable_gcs_ft is True.
+        redis_password_secret:
+            Kubernetes secret reference containing Redis password. ex: {"name": "secret-name", "key": "password-key"}
+        external_storage_namespace:
+            The storage namespace to use for GCS fault tolerance. By default, KubeRay sets it to the UID of RayCluster.
     """
 
     head_cpu_requests: Union[int, str] = 2
@@ -165,8 +175,39 @@ class RayJobClusterConfig:
     annotations: Dict[str, str] = field(default_factory=dict)
     volumes: list[V1Volume] = field(default_factory=list)
     volume_mounts: list[V1VolumeMount] = field(default_factory=list)
+    enable_gcs_ft: bool = False
+    enable_usage_stats: bool = False
+    redis_address: Optional[str] = None
+    redis_password_secret: Optional[Dict[str, str]] = None
+    external_storage_namespace: Optional[str] = None
 
     def __post_init__(self):
+        if self.enable_usage_stats:
+            self.envs["RAY_USAGE_STATS_ENABLED"] = "1"
+        else:
+            self.envs["RAY_USAGE_STATS_ENABLED"] = "0"
+
+        if self.enable_gcs_ft:
+            if not self.redis_address:
+                raise ValueError(
+                    "redis_address must be provided when enable_gcs_ft is True"
+                )
+
+            if self.redis_password_secret and not isinstance(
+                self.redis_password_secret, dict
+            ):
+                raise ValueError(
+                    "redis_password_secret must be a dictionary with 'name' and 'key' fields"
+                )
+
+            if self.redis_password_secret and (
+                "name" not in self.redis_password_secret
+                or "key" not in self.redis_password_secret
+            ):
+                raise ValueError(
+                    "redis_password_secret must contain both 'name' and 'key' fields"
+                )
+
         self._validate_types()
         self._memory_to_string()
         self._validate_gpu_config(self.head_accelerators)
@@ -251,6 +292,11 @@ def build_ray_cluster_spec(self, cluster_name: str) -> Dict[str, Any]:
             "workerGroupSpecs": [self._build_worker_group_spec(cluster_name)],
         }
 
+        # Add GCS fault tolerance if enabled
+        if self.enable_gcs_ft:
+            gcs_ft_options = self._build_gcs_ft_options()
+            ray_cluster_spec["gcsFaultToleranceOptions"] = gcs_ft_options
+
         return ray_cluster_spec
 
     def _build_head_group_spec(self) -> Dict[str, Any]:
@@ -453,3 +499,25 @@ def _generate_volumes(self) -> list:
     def _build_env_vars(self) -> list:
         """Return one ``V1EnvVar`` per entry of ``self.envs``."""
         env_vars = [
             V1EnvVar(name=env_name, value=env_value)
             for env_name, env_value in self.envs.items()
         ]
         return env_vars
+
+    def _build_gcs_ft_options(self) -> Dict[str, Any]:
+        """
+        Build the ``gcsFaultToleranceOptions`` mapping.
+
+        Returns:
+            A dictionary that always contains ``redisAddress``. It also holds
+            ``externalStorageNamespace`` when ``external_storage_namespace`` is
+            set, and ``redisPassword`` (a ``secretKeyRef`` built from the
+            ``name`` and ``key`` entries) when ``redis_password_secret`` is set.
+
+        Raises:
+            KeyError: If ``redis_password_secret`` is set but lacks ``name``
+                or ``key``.
+        """
+        gcs_ft_options: Dict[str, Any] = {"redisAddress": self.redis_address}
+
+        # Both attributes are declared dataclass fields, so they always exist;
+        # a truthiness check is all that is needed.
+        if self.external_storage_namespace:
+            gcs_ft_options["externalStorageNamespace"] = self.external_storage_namespace
+
+        if self.redis_password_secret:
+            secret = self.redis_password_secret
+            gcs_ft_options["redisPassword"] = {
+                "valueFrom": {
+                    "secretKeyRef": {
+                        "name": secret["name"],
+                        "key": secret["key"],
+                    }
+                }
+            }
+
+        return gcs_ft_options
Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,3 @@`
`1`	`1`	`from .rayjob import RayJob, RayJobClusterConfig`
`2`	`2`	`from .status import RayJobDeploymentStatus, CodeflareRayJobStatus, RayJobInfo`
	`3`	`+from .cluster_config import RayJobClusterConfig`