@@ -975,6 +975,374 @@ def test_run_job_with_managed_cluster_validation_error(mocker):
975975 assert "entrypoint must be specified" in str (e )
976976
977977
978+ def test_run_job_with_managed_cluster_failed_job (mocker ):
979+ """Test RayJob execution when job fails."""
980+ from codeflare_sdk .ray .job .job import RayJobSpec
981+
982+ # Mock dependencies
983+ mocker .patch ("kubernetes.client.ApisApi.get_api_versions" )
984+ mocker .patch ("kubernetes.config.load_kube_config" , return_value = "ignore" )
985+ mocker .patch ("codeflare_sdk.ray.cluster.cluster.config_check" )
986+
987+ mock_api_client = mocker .Mock ()
988+ mocker .patch ("codeflare_sdk.common.kubernetes_cluster.auth.get_api_client" , return_value = mock_api_client )
989+
990+ mock_co_api = mocker .Mock ()
991+ mocker .patch ("kubernetes.client.CustomObjectsApi" , return_value = mock_co_api )
992+
993+ # Mock Cluster creation
994+ mock_cluster_resource = {"apiVersion" : "ray.io/v1" , "kind" : "RayCluster" , "spec" : {}}
995+ mocker .patch ("codeflare_sdk.ray.cluster.cluster.Cluster.__init__" , return_value = None )
996+ mock_cluster_instance = mocker .Mock ()
997+ mock_cluster_instance .resource_yaml = mock_cluster_resource
998+ mocker .patch ("codeflare_sdk.ray.cluster.cluster.Cluster" , return_value = mock_cluster_instance )
999+
1000+ # Mock RayJob creation
1001+ mock_co_api .create_namespaced_custom_object .return_value = {"metadata" : {"name" : "test-failed-job" }}
1002+
1003+ # Mock job status as FAILED
1004+ mock_status_response = {
1005+ "status" : {
1006+ "jobDeploymentStatus" : "Running" ,
1007+ "jobStatus" : "FAILED" ,
1008+ "jobId" : "failed-job-123" ,
1009+ "rayClusterName" : "test-cluster" ,
1010+ "message" : "Job failed due to error"
1011+ }
1012+ }
1013+ mock_co_api .get_namespaced_custom_object_status .return_value = mock_status_response
1014+ mocker .patch ("time.sleep" )
1015+
1016+ cluster_config = ClusterConfiguration (name = "test-cluster" , namespace = "test-ns" )
1017+ job_config = RayJobSpec (entrypoint = "python failing_script.py" )
1018+
1019+ result = Cluster .run_job_with_managed_cluster (
1020+ cluster_config = cluster_config ,
1021+ job_config = job_config ,
1022+ wait_for_completion = True
1023+ )
1024+
1025+ assert result ["job_status" ] == "FAILED"
1026+ assert result ["job_submission_id" ] == "failed-job-123"
1027+
1028+
1029+ def test_run_job_with_managed_cluster_stopped_job (mocker ):
1030+ """Test RayJob execution when job is stopped."""
1031+ from codeflare_sdk .ray .job .job import RayJobSpec
1032+
1033+ # Mock dependencies
1034+ mocker .patch ("kubernetes.client.ApisApi.get_api_versions" )
1035+ mocker .patch ("kubernetes.config.load_kube_config" , return_value = "ignore" )
1036+ mocker .patch ("codeflare_sdk.ray.cluster.cluster.config_check" )
1037+
1038+ mock_api_client = mocker .Mock ()
1039+ mocker .patch ("codeflare_sdk.common.kubernetes_cluster.auth.get_api_client" , return_value = mock_api_client )
1040+
1041+ mock_co_api = mocker .Mock ()
1042+ mocker .patch ("kubernetes.client.CustomObjectsApi" , return_value = mock_co_api )
1043+
1044+ # Mock Cluster creation
1045+ mock_cluster_resource = {"apiVersion" : "ray.io/v1" , "kind" : "RayCluster" , "spec" : {}}
1046+ mocker .patch ("codeflare_sdk.ray.cluster.cluster.Cluster.__init__" , return_value = None )
1047+ mock_cluster_instance = mocker .Mock ()
1048+ mock_cluster_instance .resource_yaml = mock_cluster_resource
1049+ mocker .patch ("codeflare_sdk.ray.cluster.cluster.Cluster" , return_value = mock_cluster_instance )
1050+
1051+ # Mock RayJob creation
1052+ mock_co_api .create_namespaced_custom_object .return_value = {"metadata" : {"name" : "test-stopped-job" }}
1053+
1054+ # Mock job status as STOPPED
1055+ mock_status_response = {
1056+ "status" : {
1057+ "jobDeploymentStatus" : "Running" ,
1058+ "jobStatus" : "STOPPED" ,
1059+ "jobId" : "stopped-job-123"
1060+ }
1061+ }
1062+ mock_co_api .get_namespaced_custom_object_status .return_value = mock_status_response
1063+ mocker .patch ("time.sleep" )
1064+
1065+ cluster_config = ClusterConfiguration (name = "test-cluster" , namespace = "test-ns" )
1066+ job_config = RayJobSpec (entrypoint = "python script.py" )
1067+
1068+ result = Cluster .run_job_with_managed_cluster (
1069+ cluster_config = cluster_config ,
1070+ job_config = job_config ,
1071+ wait_for_completion = True
1072+ )
1073+
1074+ assert result ["job_status" ] == "STOPPED"
1075+ assert result ["job_submission_id" ] == "stopped-job-123"
1076+
1077+
1078+ def test_run_job_with_managed_cluster_pending_job (mocker ):
1079+ """Test RayJob execution when job is pending then succeeds."""
1080+ from codeflare_sdk .ray .job .job import RayJobSpec
1081+
1082+ # Mock dependencies
1083+ mocker .patch ("kubernetes.client.ApisApi.get_api_versions" )
1084+ mocker .patch ("kubernetes.config.load_kube_config" , return_value = "ignore" )
1085+ mocker .patch ("codeflare_sdk.ray.cluster.cluster.config_check" )
1086+
1087+ mock_api_client = mocker .Mock ()
1088+ mocker .patch ("codeflare_sdk.common.kubernetes_cluster.auth.get_api_client" , return_value = mock_api_client )
1089+
1090+ mock_co_api = mocker .Mock ()
1091+ mocker .patch ("kubernetes.client.CustomObjectsApi" , return_value = mock_co_api )
1092+
1093+ # Mock Cluster creation
1094+ mock_cluster_resource = {"apiVersion" : "ray.io/v1" , "kind" : "RayCluster" , "spec" : {}}
1095+ mocker .patch ("codeflare_sdk.ray.cluster.cluster.Cluster.__init__" , return_value = None )
1096+ mock_cluster_instance = mocker .Mock ()
1097+ mock_cluster_instance .resource_yaml = mock_cluster_resource
1098+ mocker .patch ("codeflare_sdk.ray.cluster.cluster.Cluster" , return_value = mock_cluster_instance )
1099+
1100+ # Mock RayJob creation
1101+ mock_co_api .create_namespaced_custom_object .return_value = {"metadata" : {"name" : "test-pending-job" }}
1102+
1103+ # Mock job status progression: PENDING -> RUNNING -> SUCCEEDED
1104+ pending_response = {
1105+ "status" : {
1106+ "jobDeploymentStatus" : "Initializing" ,
1107+ "jobStatus" : "PENDING" ,
1108+ "jobId" : "pending-job-123"
1109+ }
1110+ }
1111+ running_response = {
1112+ "status" : {
1113+ "jobDeploymentStatus" : "Running" ,
1114+ "jobStatus" : "RUNNING" ,
1115+ "jobId" : "pending-job-123"
1116+ }
1117+ }
1118+ succeeded_response = {
1119+ "status" : {
1120+ "jobDeploymentStatus" : "Running" ,
1121+ "jobStatus" : "SUCCEEDED" ,
1122+ "jobId" : "pending-job-123"
1123+ }
1124+ }
1125+
1126+ mock_co_api .get_namespaced_custom_object_status .side_effect = [
1127+ pending_response , running_response , succeeded_response
1128+ ]
1129+ mocker .patch ("time.sleep" )
1130+
1131+ cluster_config = ClusterConfiguration (name = "test-cluster" , namespace = "test-ns" )
1132+ job_config = RayJobSpec (entrypoint = "python script.py" )
1133+
1134+ result = Cluster .run_job_with_managed_cluster (
1135+ cluster_config = cluster_config ,
1136+ job_config = job_config ,
1137+ wait_for_completion = True
1138+ )
1139+
1140+ assert result ["job_status" ] == "SUCCEEDED"
1141+ assert result ["job_submission_id" ] == "pending-job-123"
1142+
1143+
1144+ def test_run_job_with_managed_cluster_api_exception (mocker ):
1145+ """Test RayJob creation with API exception."""
1146+ from codeflare_sdk .ray .job .job import RayJobSpec
1147+ from kubernetes .client .rest import ApiException
1148+
1149+ # Mock dependencies
1150+ mocker .patch ("kubernetes.client.ApisApi.get_api_versions" )
1151+ mocker .patch ("kubernetes.config.load_kube_config" , return_value = "ignore" )
1152+ mocker .patch ("codeflare_sdk.ray.cluster.cluster.config_check" )
1153+
1154+ mock_api_client = mocker .Mock ()
1155+ mocker .patch ("codeflare_sdk.common.kubernetes_cluster.auth.get_api_client" , return_value = mock_api_client )
1156+
1157+ mock_co_api = mocker .Mock ()
1158+ mocker .patch ("kubernetes.client.CustomObjectsApi" , return_value = mock_co_api )
1159+
1160+ # Mock Cluster creation
1161+ mock_cluster_resource = {"apiVersion" : "ray.io/v1" , "kind" : "RayCluster" , "spec" : {}}
1162+ mocker .patch ("codeflare_sdk.ray.cluster.cluster.Cluster.__init__" , return_value = None )
1163+ mock_cluster_instance = mocker .Mock ()
1164+ mock_cluster_instance .resource_yaml = mock_cluster_resource
1165+ mocker .patch ("codeflare_sdk.ray.cluster.cluster.Cluster" , return_value = mock_cluster_instance )
1166+
1167+ # Mock API exception during job creation
1168+ mock_co_api .create_namespaced_custom_object .side_effect = ApiException (
1169+ status = 400 , reason = "Bad Request" , body = '{"message": "Invalid RayJob spec"}'
1170+ )
1171+
1172+ cluster_config = ClusterConfiguration (name = "test-cluster" , namespace = "test-ns" )
1173+ job_config = RayJobSpec (entrypoint = "python script.py" )
1174+
1175+ try :
1176+ Cluster .run_job_with_managed_cluster (
1177+ cluster_config = cluster_config ,
1178+ job_config = job_config
1179+ )
1180+ assert False , "Expected ApiException"
1181+ except ApiException as e :
1182+ assert e .status == 400
1183+ assert "Bad Request" in str (e )
1184+
1185+
1186+ def test_run_job_with_managed_cluster_missing_status_fields (mocker ):
1187+ """Test RayJob with missing status fields."""
1188+ from codeflare_sdk .ray .job .job import RayJobSpec
1189+
1190+ # Mock dependencies
1191+ mocker .patch ("kubernetes.client.ApisApi.get_api_versions" )
1192+ mocker .patch ("kubernetes.config.load_kube_config" , return_value = "ignore" )
1193+ mocker .patch ("codeflare_sdk.ray.cluster.cluster.config_check" )
1194+
1195+ mock_api_client = mocker .Mock ()
1196+ mocker .patch ("codeflare_sdk.common.kubernetes_cluster.auth.get_api_client" , return_value = mock_api_client )
1197+
1198+ mock_co_api = mocker .Mock ()
1199+ mocker .patch ("kubernetes.client.CustomObjectsApi" , return_value = mock_co_api )
1200+
1201+ # Mock Cluster creation
1202+ mock_cluster_resource = {"apiVersion" : "ray.io/v1" , "kind" : "RayCluster" , "spec" : {}}
1203+ mocker .patch ("codeflare_sdk.ray.cluster.cluster.Cluster.__init__" , return_value = None )
1204+ mock_cluster_instance = mocker .Mock ()
1205+ mock_cluster_instance .resource_yaml = mock_cluster_resource
1206+ mocker .patch ("codeflare_sdk.ray.cluster.cluster.Cluster" , return_value = mock_cluster_instance )
1207+
1208+ # Mock RayJob creation
1209+ mock_co_api .create_namespaced_custom_object .return_value = {"metadata" : {"name" : "test-missing-fields" }}
1210+
1211+ # Mock job status with missing fields
1212+ mock_status_response = {
1213+ "status" : {
1214+ "jobDeploymentStatus" : "Running"
1215+ # Missing jobStatus, jobId, etc.
1216+ }
1217+ }
1218+ mock_co_api .get_namespaced_custom_object_status .return_value = mock_status_response
1219+ mocker .patch ("time.sleep" )
1220+
1221+ cluster_config = ClusterConfiguration (name = "test-cluster" , namespace = "test-ns" )
1222+ job_config = RayJobSpec (entrypoint = "python script.py" )
1223+
1224+ result = Cluster .run_job_with_managed_cluster (
1225+ cluster_config = cluster_config ,
1226+ job_config = job_config ,
1227+ wait_for_completion = True ,
1228+ job_timeout_seconds = 5 ,
1229+ job_polling_interval_seconds = 1
1230+ )
1231+
1232+ # Should handle missing fields gracefully
1233+ assert "job_cr_name" in result
1234+ assert result .get ("job_status" ) is None or result .get ("job_status" ) == "UNKNOWN"
1235+
1236+
1237+ def test_run_job_with_managed_cluster_custom_job_name (mocker ):
1238+ """Test RayJob with custom job CR name."""
1239+ from codeflare_sdk .ray .job .job import RayJobSpec
1240+
1241+ # Mock dependencies
1242+ mocker .patch ("kubernetes.client.ApisApi.get_api_versions" )
1243+ mocker .patch ("kubernetes.config.load_kube_config" , return_value = "ignore" )
1244+ mocker .patch ("codeflare_sdk.ray.cluster.cluster.config_check" )
1245+
1246+ mock_api_client = mocker .Mock ()
1247+ mocker .patch ("codeflare_sdk.common.kubernetes_cluster.auth.get_api_client" , return_value = mock_api_client )
1248+
1249+ mock_co_api = mocker .Mock ()
1250+ mocker .patch ("kubernetes.client.CustomObjectsApi" , return_value = mock_co_api )
1251+
1252+ # Mock Cluster creation
1253+ mock_cluster_resource = {"apiVersion" : "ray.io/v1" , "kind" : "RayCluster" , "spec" : {}}
1254+ mocker .patch ("codeflare_sdk.ray.cluster.cluster.Cluster.__init__" , return_value = None )
1255+ mock_cluster_instance = mocker .Mock ()
1256+ mock_cluster_instance .resource_yaml = mock_cluster_resource
1257+ mocker .patch ("codeflare_sdk.ray.cluster.cluster.Cluster" , return_value = mock_cluster_instance )
1258+
1259+ # Mock successful RayJob creation with custom name
1260+ custom_job_name = "my-custom-rayjob-name"
1261+ mock_co_api .create_namespaced_custom_object .return_value = {"metadata" : {"name" : custom_job_name }}
1262+
1263+ # Mock job completion
1264+ mock_status_response = {
1265+ "status" : {
1266+ "jobDeploymentStatus" : "Running" ,
1267+ "jobStatus" : "SUCCEEDED" ,
1268+ "jobId" : "custom-job-123"
1269+ }
1270+ }
1271+ mock_co_api .get_namespaced_custom_object_status .return_value = mock_status_response
1272+ mocker .patch ("time.sleep" )
1273+
1274+ cluster_config = ClusterConfiguration (name = "test-cluster" , namespace = "test-ns" )
1275+ job_config = RayJobSpec (entrypoint = "python script.py" )
1276+
1277+ result = Cluster .run_job_with_managed_cluster (
1278+ cluster_config = cluster_config ,
1279+ job_config = job_config ,
1280+ job_cr_name = custom_job_name ,
1281+ wait_for_completion = True
1282+ )
1283+
1284+ assert result ["job_cr_name" ] == custom_job_name
1285+ assert result ["job_submission_id" ] == "custom-job-123"
1286+ assert result ["job_status" ] == "SUCCEEDED"
1287+
1288+
1289+ def test_run_job_with_managed_cluster_all_status_fields (mocker ):
1290+ """Test RayJob with all possible status fields populated."""
1291+ from codeflare_sdk .ray .job .job import RayJobSpec
1292+
1293+ # Mock dependencies
1294+ mocker .patch ("kubernetes.client.ApisApi.get_api_versions" )
1295+ mocker .patch ("kubernetes.config.load_kube_config" , return_value = "ignore" )
1296+ mocker .patch ("codeflare_sdk.ray.cluster.cluster.config_check" )
1297+
1298+ mock_api_client = mocker .Mock ()
1299+ mocker .patch ("codeflare_sdk.common.kubernetes_cluster.auth.get_api_client" , return_value = mock_api_client )
1300+
1301+ mock_co_api = mocker .Mock ()
1302+ mocker .patch ("kubernetes.client.CustomObjectsApi" , return_value = mock_co_api )
1303+
1304+ # Mock Cluster creation
1305+ mock_cluster_resource = {"apiVersion" : "ray.io/v1" , "kind" : "RayCluster" , "spec" : {}}
1306+ mocker .patch ("codeflare_sdk.ray.cluster.cluster.Cluster.__init__" , return_value = None )
1307+ mock_cluster_instance = mocker .Mock ()
1308+ mock_cluster_instance .resource_yaml = mock_cluster_resource
1309+ mocker .patch ("codeflare_sdk.ray.cluster.cluster.Cluster" , return_value = mock_cluster_instance )
1310+
1311+ # Mock RayJob creation
1312+ mock_co_api .create_namespaced_custom_object .return_value = {"metadata" : {"name" : "test-all-fields" }}
1313+
1314+ # Mock job status with all fields
1315+ mock_status_response = {
1316+ "status" : {
1317+ "jobDeploymentStatus" : "Complete" ,
1318+ "jobStatus" : "SUCCEEDED" ,
1319+ "dashboardURL" : "http://ray-dashboard:8265" ,
1320+ "rayClusterName" : "test-cluster-name" ,
1321+ "jobId" : "all-fields-job-456" ,
1322+ "message" : "Job completed successfully" ,
1323+ "startTime" : "2023-01-01T10:00:00Z" ,
1324+ "endTime" : "2023-01-01T10:05:00Z"
1325+ }
1326+ }
1327+ mock_co_api .get_namespaced_custom_object_status .return_value = mock_status_response
1328+ mocker .patch ("time.sleep" )
1329+
1330+ cluster_config = ClusterConfiguration (name = "test-cluster" , namespace = "test-ns" )
1331+ job_config = RayJobSpec (entrypoint = "python comprehensive_script.py" )
1332+
1333+ result = Cluster .run_job_with_managed_cluster (
1334+ cluster_config = cluster_config ,
1335+ job_config = job_config ,
1336+ wait_for_completion = True
1337+ )
1338+
1339+ # Verify all fields are captured
1340+ assert result ["job_status" ] == "SUCCEEDED"
1341+ assert result ["dashboard_url" ] == "http://ray-dashboard:8265"
1342+ assert result ["ray_cluster_name" ] == "test-cluster-name"
1343+ assert result ["job_submission_id" ] == "all-fields-job-456"
1344+
1345+
9781346# Make sure to always keep this function last
9791347def test_cleanup ():
9801348 os .remove (f"{ aw_dir } test-all-params.yaml" )
0 commit comments