@@ -663,26 +663,31 @@ def run_job_with_managed_cluster(
663663 raise ValueError ("job_config.entrypoint must be specified." )
664664
665665 # Warn if Pydantic V1/V2 specific fields in RayJobSpec are set, as they are not used for RayJob CR.
666- if job_config .entrypoint_num_cpus is not None or \
667- job_config .entrypoint_num_gpus is not None or \
668- job_config .entrypoint_memory is not None :
666+ if (
667+ job_config .entrypoint_num_cpus is not None
668+ or job_config .entrypoint_num_gpus is not None
669+ or job_config .entrypoint_memory is not None
670+ ):
669671 warnings .warn (
670672 "RayJobSpec fields 'entrypoint_num_cpus', 'entrypoint_num_gpus', 'entrypoint_memory' "
671673 "are not directly used when creating a RayJob CR. They are primarily for the Ray Job Submission Client. "
672674 "Resource requests for the job driver pod should be configured in the RayCluster head node spec via ClusterConfiguration." ,
673- UserWarning
675+ UserWarning ,
674676 )
675677
676678 # Generate rayClusterSpec from ClusterConfiguration
677679 temp_config_for_spec = copy .deepcopy (cluster_config )
678680 temp_config_for_spec .appwrapper = False
679-
681+
680682 with warnings .catch_warnings ():
681683 warnings .simplefilter ("ignore" , UserWarning )
682684 dummy_cluster_for_spec = Cluster (temp_config_for_spec )
683685
684686 ray_cluster_cr_dict = dummy_cluster_for_spec .resource_yaml
685- if not isinstance (ray_cluster_cr_dict , dict ) or "spec" not in ray_cluster_cr_dict :
687+ if (
688+ not isinstance (ray_cluster_cr_dict , dict )
689+ or "spec" not in ray_cluster_cr_dict
690+ ):
686691 raise ValueError (
687692 "Failed to generate RayCluster CR dictionary from ClusterConfiguration. "
688693 f"Got: { type (ray_cluster_cr_dict )} "
@@ -691,13 +696,15 @@ def run_job_with_managed_cluster(
691696
692697 # Prepare RayJob CR
693698 actual_job_cr_name = job_cr_name or f"rayjob-{ uuid .uuid4 ().hex [:10 ]} "
694-
699+
695700 runtime_env_yaml_str = ""
696701 if job_config .runtime_env :
697702 try :
698703 runtime_env_yaml_str = yaml .dump (job_config .runtime_env )
699704 except yaml .YAMLError as e :
700- raise ValueError (f"Invalid job_config.runtime_env, failed to dump to YAML: { e } " )
705+ raise ValueError (
706+ f"Invalid job_config.runtime_env, failed to dump to YAML: { e } "
707+ )
701708
702709 ray_job_cr_spec = {
703710 "entrypoint" : job_config .entrypoint ,
@@ -735,7 +742,9 @@ def run_job_with_managed_cluster(
735742 ray_cluster_name_actual = None
736743
737744 try :
738- print (f"Submitting RayJob '{ actual_job_cr_name } ' to namespace '{ namespace } '..." )
745+ print (
746+ f"Submitting RayJob '{ actual_job_cr_name } ' to namespace '{ namespace } '..."
747+ )
739748 k8s_co_api .create_namespaced_custom_object (
740749 group = "ray.io" ,
741750 version = "v1" ,
@@ -750,27 +759,37 @@ def run_job_with_managed_cluster(
750759 start_time = time .time ()
751760 while True :
752761 try :
753- ray_job_status_cr = k8s_co_api .get_namespaced_custom_object_status (
754- group = "ray.io" ,
755- version = "v1" ,
756- namespace = namespace ,
757- plural = "rayjobs" ,
758- name = actual_job_cr_name ,
762+ ray_job_status_cr = (
763+ k8s_co_api .get_namespaced_custom_object_status (
764+ group = "ray.io" ,
765+ version = "v1" ,
766+ namespace = namespace ,
767+ plural = "rayjobs" ,
768+ name = actual_job_cr_name ,
769+ )
759770 )
760771 except ApiException as e :
761772 if e .status == 404 :
762- print (f"RayJob '{ actual_job_cr_name } ' status not found yet, retrying..." )
773+ print (
774+ f"RayJob '{ actual_job_cr_name } ' status not found yet, retrying..."
775+ )
763776 time .sleep (job_polling_interval_seconds )
764777 continue
765778 raise
766779
767780 status_field = ray_job_status_cr .get ("status" , {})
768- job_deployment_status = status_field .get ("jobDeploymentStatus" , "UNKNOWN" )
781+ job_deployment_status = status_field .get (
782+ "jobDeploymentStatus" , "UNKNOWN"
783+ )
769784 current_job_status = status_field .get ("jobStatus" , "PENDING" )
770-
785+
771786 dashboard_url = status_field .get ("dashboardURL" , dashboard_url )
772- ray_cluster_name_actual = status_field .get ("rayClusterName" , ray_cluster_name_actual )
773- returned_job_submission_id = status_field .get ("jobId" , job_config .submission_id )
787+ ray_cluster_name_actual = status_field .get (
788+ "rayClusterName" , ray_cluster_name_actual
789+ )
790+ returned_job_submission_id = status_field .get (
791+ "jobId" , job_config .submission_id
792+ )
774793
775794 final_job_status = current_job_status
776795 print (
@@ -779,41 +798,72 @@ def run_job_with_managed_cluster(
779798
780799 if current_job_status in ["SUCCEEDED" , "FAILED" , "STOPPED" ]:
781800 break
782-
783- if job_timeout_seconds and (time .time () - start_time ) > job_timeout_seconds :
801+
802+ if (
803+ job_timeout_seconds
804+ and (time .time () - start_time ) > job_timeout_seconds
805+ ):
784806 try :
785- ray_job_status_cr_final = k8s_co_api .get_namespaced_custom_object_status (
786- group = "ray.io" , version = "v1" , namespace = namespace , plural = "rayjobs" , name = actual_job_cr_name
807+ ray_job_status_cr_final = (
808+ k8s_co_api .get_namespaced_custom_object_status (
809+ group = "ray.io" ,
810+ version = "v1" ,
811+ namespace = namespace ,
812+ plural = "rayjobs" ,
813+ name = actual_job_cr_name ,
814+ )
815+ )
816+ status_field_final = ray_job_status_cr_final .get (
817+ "status" , {}
818+ )
819+ final_job_status = status_field_final .get (
820+ "jobStatus" , final_job_status
821+ )
822+ returned_job_submission_id = status_field_final .get (
823+ "jobId" , returned_job_submission_id
824+ )
825+ dashboard_url = status_field_final .get (
826+ "dashboardURL" , dashboard_url
827+ )
828+ ray_cluster_name_actual = status_field_final .get (
829+ "rayClusterName" , ray_cluster_name_actual
787830 )
788- status_field_final = ray_job_status_cr_final .get ("status" , {})
789- final_job_status = status_field_final .get ("jobStatus" , final_job_status )
790- returned_job_submission_id = status_field_final .get ("jobId" , returned_job_submission_id )
791- dashboard_url = status_field_final .get ("dashboardURL" , dashboard_url )
792- ray_cluster_name_actual = status_field_final .get ("rayClusterName" , ray_cluster_name_actual )
793831 except Exception :
794832 pass
795833 raise TimeoutError (
796834 f"RayJob '{ actual_job_cr_name } ' timed out after { job_timeout_seconds } seconds. Last status: { final_job_status } "
797835 )
798836
799837 time .sleep (job_polling_interval_seconds )
800-
801- print (f"RayJob '{ actual_job_cr_name } ' finished with status: { final_job_status } " )
838+
839+ print (
840+ f"RayJob '{ actual_job_cr_name } ' finished with status: { final_job_status } "
841+ )
802842 else :
803843 try :
804844 ray_job_status_cr = k8s_co_api .get_namespaced_custom_object_status (
805- group = "ray.io" , version = "v1" , namespace = namespace , plural = "rayjobs" , name = actual_job_cr_name
845+ group = "ray.io" ,
846+ version = "v1" ,
847+ namespace = namespace ,
848+ plural = "rayjobs" ,
849+ name = actual_job_cr_name ,
806850 )
807851 status_field = ray_job_status_cr .get ("status" , {})
808852 final_job_status = status_field .get ("jobStatus" , "SUBMITTED" )
809- returned_job_submission_id = status_field .get ("jobId" , job_config .submission_id )
853+ returned_job_submission_id = status_field .get (
854+ "jobId" , job_config .submission_id
855+ )
810856 dashboard_url = status_field .get ("dashboardURL" , dashboard_url )
811- ray_cluster_name_actual = status_field .get ("rayClusterName" , ray_cluster_name_actual )
857+ ray_cluster_name_actual = status_field .get (
858+ "rayClusterName" , ray_cluster_name_actual
859+ )
812860 except ApiException as e :
813861 if e .status == 404 :
814862 final_job_status = "SUBMITTED_NOT_FOUND"
815863 else :
816- print (f"Warning: Could not fetch initial status for RayJob '{ actual_job_cr_name } ': { e } " )
864+ print (
865+ f"Warning: Could not fetch initial status for RayJob '{ actual_job_cr_name } ': { e } "
866+ )
817867 final_job_status = "UNKNOWN_API_ERROR"
818868
819869 return {
@@ -825,20 +875,30 @@ def run_job_with_managed_cluster(
825875 }
826876
827877 except ApiException as e :
828- print (f"Kubernetes API error during RayJob '{ actual_job_cr_name } ' management: { e .reason } (status: { e .status } )" )
878+ print (
879+ f"Kubernetes API error during RayJob '{ actual_job_cr_name } ' management: { e .reason } (status: { e .status } )"
880+ )
829881 final_status_on_error = "ERROR_BEFORE_SUBMISSION"
830882 if actual_job_cr_name :
831883 try :
832884 ray_job_status_cr = k8s_co_api .get_namespaced_custom_object_status (
833- group = "ray.io" , version = "v1" , namespace = namespace , plural = "rayjobs" , name = actual_job_cr_name
885+ group = "ray.io" ,
886+ version = "v1" ,
887+ namespace = namespace ,
888+ plural = "rayjobs" ,
889+ name = actual_job_cr_name ,
834890 )
835891 status_field = ray_job_status_cr .get ("status" , {})
836- final_status_on_error = status_field .get ("jobStatus" , "UNKNOWN_AFTER_K8S_ERROR" )
892+ final_status_on_error = status_field .get (
893+ "jobStatus" , "UNKNOWN_AFTER_K8S_ERROR"
894+ )
837895 except Exception :
838896 final_status_on_error = "UNKNOWN_FINAL_STATUS_FETCH_FAILED"
839897 raise
840898 except Exception as e :
841- print (f"An unexpected error occurred during managed RayJob execution for '{ actual_job_cr_name } ': { e } " )
899+ print (
900+ f"An unexpected error occurred during managed RayJob execution for '{ actual_job_cr_name } ': { e } "
901+ )
842902 raise
843903
844904
@@ -999,8 +1059,10 @@ def get_cluster(
9991059 )
10001060 # 1. Prepare RayClusterSpec from ClusterConfiguration
10011061 # Create a temporary config with appwrapper=False to ensure build_ray_cluster returns RayCluster YAML
1002- temp_cluster_config_dict = cluster_config .dict (exclude_none = True ) # Assuming Pydantic V1 or similar .dict() method
1003- temp_cluster_config_dict ['appwrapper' ] = False
1062+ temp_cluster_config_dict = cluster_config .dict (
1063+ exclude_none = True
1064+ ) # Assuming Pydantic V1 or similar .dict() method
1065+ temp_cluster_config_dict ["appwrapper" ] = False
10041066 temp_cluster_config_for_spec = ClusterConfiguration (** temp_cluster_config_dict )
10051067 # Ignore the warning here for the lack of a ClusterConfiguration
10061068 with warnings .catch_warnings ():
0 commit comments