@@ -139,20 +139,12 @@ class ManagedClusterConfig:
139139 A list of V1Volume objects to add to the Cluster
140140 volume_mounts:
141141 A list of V1VolumeMount objects to add to the Cluster
142- enable_gcs_ft:
143- A boolean indicating whether to enable GCS fault tolerance.
144- redis_address:
145- The address of the Redis server to use for GCS fault tolerance, required when enable_gcs_ft is True.
146- redis_password_secret:
147- Kubernetes secret reference containing Redis password. ex: {"name": "secret-name", "key": "password-key"}
148- external_storage_namespace:
149- The storage namespace to use for GCS fault tolerance. By default, KubeRay sets it to the UID of RayCluster.
150142 """
151143
152144 head_cpu_requests : Union [int , str ] = 2
153145 head_cpu_limits : Union [int , str ] = 2
154- head_memory_requests : Union [int , str ] = 8
155- head_memory_limits : Union [int , str ] = 8
146+ head_memory_requests : Union [int , str ] = 2
147+ head_memory_limits : Union [int , str ] = 2
156148 head_accelerators : Dict [str , Union [str , int ]] = field (default_factory = dict )
157149 head_tolerations : Optional [List [V1Toleration ]] = None
158150 worker_cpu_requests : Union [int , str ] = 1
@@ -173,35 +165,10 @@ class ManagedClusterConfig:
173165 annotations : Dict [str , str ] = field (default_factory = dict )
174166 volumes : list [V1Volume ] = field (default_factory = list )
175167 volume_mounts : list [V1VolumeMount ] = field (default_factory = list )
176- enable_gcs_ft : bool = False
177- redis_address : Optional [str ] = None
178- redis_password_secret : Optional [Dict [str , str ]] = None
179- external_storage_namespace : Optional [str ] = None
180168
181169 def __post_init__ (self ):
182170 self .envs ["RAY_USAGE_STATS_ENABLED" ] = "0"
183171
184- if self .enable_gcs_ft :
185- if not self .redis_address :
186- raise ValueError (
187- "redis_address must be provided when enable_gcs_ft is True"
188- )
189-
190- if self .redis_password_secret and not isinstance (
191- self .redis_password_secret , dict
192- ):
193- raise ValueError (
194- "redis_password_secret must be a dictionary with 'name' and 'key' fields"
195- )
196-
197- if self .redis_password_secret and (
198- "name" not in self .redis_password_secret
199- or "key" not in self .redis_password_secret
200- ):
201- raise ValueError (
202- "redis_password_secret must contain both 'name' and 'key' fields"
203- )
204-
205172 self ._validate_types ()
206173 self ._memory_to_string ()
207174 self ._validate_gpu_config (self .head_accelerators )
@@ -286,11 +253,6 @@ def build_ray_cluster_spec(self, cluster_name: str) -> Dict[str, Any]:
286253 "workerGroupSpecs" : [self ._build_worker_group_spec (cluster_name )],
287254 }
288255
289- # Add GCS fault tolerance if enabled
290- if self .enable_gcs_ft :
291- gcs_ft_options = self ._build_gcs_ft_options ()
292- ray_cluster_spec ["gcsFaultToleranceOptions" ] = gcs_ft_options
293-
294256 return ray_cluster_spec
295257
296258 def _build_head_group_spec (self ) -> Dict [str , Any ]:
@@ -493,117 +455,3 @@ def _generate_volumes(self) -> list:
493455 def _build_env_vars (self ) -> list :
494456 """Build environment variables list."""
495457 return [V1EnvVar (name = key , value = value ) for key , value in self .envs .items ()]
496-
497- def _build_gcs_ft_options (self ) -> Dict [str , Any ]:
498- """Build GCS fault tolerance options."""
499- gcs_ft_options = {"redisAddress" : self .redis_address }
500-
501- if (
502- hasattr (self , "external_storage_namespace" )
503- and self .external_storage_namespace
504- ):
505- gcs_ft_options ["externalStorageNamespace" ] = self .external_storage_namespace
506-
507- if hasattr (self , "redis_password_secret" ) and self .redis_password_secret :
508- gcs_ft_options ["redisPassword" ] = {
509- "valueFrom" : {
510- "secretKeyRef" : {
511- "name" : self .redis_password_secret ["name" ],
512- "key" : self .redis_password_secret ["key" ],
513- }
514- }
515- }
516-
517- return gcs_ft_options
518-
519- def add_script_volumes (
520- self , configmap_name : str , mount_path : str = "/home/ray/scripts"
521- ):
522- """
523- Add script volume and mount references to cluster configuration.
524-
525- Args:
526- configmap_name: Name of the ConfigMap containing scripts
527- mount_path: Where to mount scripts in containers (default: /home/ray/scripts)
528- """
529- # Check if script volume already exists
530- volume_name = "ray-job-scripts"
531- existing_volume = next (
532- (v for v in self .volumes if getattr (v , "name" , None ) == volume_name ), None
533- )
534- if existing_volume :
535- logger .debug (f"Script volume '{ volume_name } ' already exists, skipping..." )
536- return
537-
538- # Check if script mount already exists
539- existing_mount = next (
540- (m for m in self .volume_mounts if getattr (m , "name" , None ) == volume_name ),
541- None ,
542- )
543- if existing_mount :
544- logger .debug (
545- f"Script volume mount '{ volume_name } ' already exists, skipping..."
546- )
547- return
548-
549- # Add script volume to cluster configuration
550- script_volume = V1Volume (
551- name = volume_name , config_map = V1ConfigMapVolumeSource (name = configmap_name )
552- )
553- self .volumes .append (script_volume )
554-
555- # Add script volume mount to cluster configuration
556- script_mount = V1VolumeMount (name = volume_name , mount_path = mount_path )
557- self .volume_mounts .append (script_mount )
558-
559- logger .info (
560- f"Added script volume '{ configmap_name } ' to cluster config: mount_path={ mount_path } "
561- )
562-
563- def validate_configmap_size (self , scripts : Dict [str , str ]) -> None :
564- total_size = sum (len (content .encode ("utf-8" )) for content in scripts .values ())
565- if total_size > 1024 * 1024 : # 1MB
566- raise ValueError (
567- f"ConfigMap size exceeds 1MB limit. Total size: { total_size } bytes"
568- )
569-
570- def build_script_configmap_spec (
571- self , job_name : str , namespace : str , scripts : Dict [str , str ]
572- ) -> Dict [str , Any ]:
573- """
574- Build ConfigMap specification for scripts
575-
576- Args:
577- job_name: Name of the RayJob (used for ConfigMap naming)
578- namespace: Kubernetes namespace
579- scripts: Dictionary of script_name -> script_content
580-
581- Returns:
582- Dict: ConfigMap specification ready for Kubernetes API
583- """
584- configmap_name = f"{ job_name } -scripts"
585- return {
586- "apiVersion" : "v1" ,
587- "kind" : "ConfigMap" ,
588- "metadata" : {"name" : configmap_name , "namespace" : namespace },
589- "data" : scripts ,
590- }
591-
592- def build_script_volume_specs (
593- self , configmap_name : str , mount_path : str = "/home/ray/scripts"
594- ) -> Tuple [Dict [str , Any ], Dict [str , Any ]]:
595- """
596- Build volume and mount specifications for scripts
597-
598- Args:
599- configmap_name: Name of the ConfigMap containing scripts
600- mount_path: Where to mount scripts in containers
601-
602- Returns:
603- Tuple of (volume_spec, mount_spec) as dictionaries
604- """
605- volume_spec = {"name" : "ray-job-scripts" , "configMap" : {"name" : configmap_name }}
606-
607- mount_spec = {"name" : "ray-job-scripts" , "mountPath" : mount_path }
608-
609- return volume_spec , mount_spec
0 commit comments