@@ -1462,6 +1462,322 @@ def test_run_job_with_managed_cluster_all_status_fields(mocker):
14621462 assert result ["job_submission_id" ] == "all-fields-job-456"
14631463
14641464
1465+ def test_run_job_with_managed_cluster_status_polling_exception (mocker ):
1466+ """Test RayJob with exception during status polling."""
1467+ from codeflare_sdk .ray .job .job import RayJobSpec
1468+ from kubernetes .client .rest import ApiException
1469+
1470+ # Mock dependencies
1471+ mocker .patch ("kubernetes.client.ApisApi.get_api_versions" )
1472+ mocker .patch ("kubernetes.config.load_kube_config" , return_value = "ignore" )
1473+ mocker .patch ("codeflare_sdk.ray.cluster.cluster.config_check" )
1474+
1475+ mock_api_client = mocker .Mock ()
1476+ mocker .patch (
1477+ "codeflare_sdk.common.kubernetes_cluster.auth.get_api_client" ,
1478+ return_value = mock_api_client ,
1479+ )
1480+
1481+ mock_co_api = mocker .Mock ()
1482+ mocker .patch ("kubernetes.client.CustomObjectsApi" , return_value = mock_co_api )
1483+
1484+ # Mock Cluster creation
1485+ mock_cluster_resource = {
1486+ "apiVersion" : "ray.io/v1" ,
1487+ "kind" : "RayCluster" ,
1488+ "spec" : {},
1489+ }
1490+ mocker .patch (
1491+ "codeflare_sdk.ray.cluster.cluster.Cluster.__init__" , return_value = None
1492+ )
1493+ mock_cluster_instance = mocker .Mock ()
1494+ mock_cluster_instance .resource_yaml = mock_cluster_resource
1495+ mocker .patch (
1496+ "codeflare_sdk.ray.cluster.cluster.Cluster" , return_value = mock_cluster_instance
1497+ )
1498+
1499+ # Mock RayJob creation
1500+ mock_co_api .create_namespaced_custom_object .return_value = {
1501+ "metadata" : {"name" : "test-status-exception" }
1502+ }
1503+
1504+ # Mock status polling to raise exception after first success
1505+ success_response = {
1506+ "status" : {
1507+ "jobDeploymentStatus" : "Running" ,
1508+ "jobStatus" : "RUNNING" ,
1509+ "jobId" : "exception-job-123" ,
1510+ }
1511+ }
1512+ exception = ApiException (status = 500 , reason = "Internal Server Error" )
1513+
1514+ mock_co_api .get_namespaced_custom_object_status .side_effect = [
1515+ success_response ,
1516+ exception ,
1517+ ]
1518+ mocker .patch ("time.sleep" )
1519+
1520+ cluster_config = ClusterConfiguration (name = "test-cluster" , namespace = "test-ns" )
1521+ job_config = RayJobSpec (entrypoint = "python script.py" )
1522+
1523+ try :
1524+ Cluster .run_job_with_managed_cluster (
1525+ cluster_config = cluster_config ,
1526+ job_config = job_config ,
1527+ wait_for_completion = True ,
1528+ job_timeout_seconds = 10 ,
1529+ )
1530+ assert False , "Expected ApiException"
1531+ except ApiException as e :
1532+ assert e .status == 500
1533+
1534+
1535+ def test_run_job_with_managed_cluster_empty_status (mocker ):
1536+ """Test RayJob with completely empty status."""
1537+ from codeflare_sdk .ray .job .job import RayJobSpec
1538+
1539+ # Mock dependencies
1540+ mocker .patch ("kubernetes.client.ApisApi.get_api_versions" )
1541+ mocker .patch ("kubernetes.config.load_kube_config" , return_value = "ignore" )
1542+ mocker .patch ("codeflare_sdk.ray.cluster.cluster.config_check" )
1543+
1544+ mock_api_client = mocker .Mock ()
1545+ mocker .patch (
1546+ "codeflare_sdk.common.kubernetes_cluster.auth.get_api_client" ,
1547+ return_value = mock_api_client ,
1548+ )
1549+
1550+ mock_co_api = mocker .Mock ()
1551+ mocker .patch ("kubernetes.client.CustomObjectsApi" , return_value = mock_co_api )
1552+
1553+ # Mock Cluster creation
1554+ mock_cluster_resource = {
1555+ "apiVersion" : "ray.io/v1" ,
1556+ "kind" : "RayCluster" ,
1557+ "spec" : {},
1558+ }
1559+ mocker .patch (
1560+ "codeflare_sdk.ray.cluster.cluster.Cluster.__init__" , return_value = None
1561+ )
1562+ mock_cluster_instance = mocker .Mock ()
1563+ mock_cluster_instance .resource_yaml = mock_cluster_resource
1564+ mocker .patch (
1565+ "codeflare_sdk.ray.cluster.cluster.Cluster" , return_value = mock_cluster_instance
1566+ )
1567+
1568+ # Mock RayJob creation
1569+ mock_co_api .create_namespaced_custom_object .return_value = {
1570+ "metadata" : {"name" : "test-empty-status" }
1571+ }
1572+
1573+ # Mock completely empty status response
1574+ empty_status_response = {}
1575+ mock_co_api .get_namespaced_custom_object_status .return_value = empty_status_response
1576+ mocker .patch ("time.sleep" )
1577+
1578+ cluster_config = ClusterConfiguration (name = "test-cluster" , namespace = "test-ns" )
1579+ job_config = RayJobSpec (entrypoint = "python script.py" )
1580+
1581+ # Should handle empty status gracefully with timeout
1582+ try :
1583+ Cluster .run_job_with_managed_cluster (
1584+ cluster_config = cluster_config ,
1585+ job_config = job_config ,
1586+ wait_for_completion = True ,
1587+ job_timeout_seconds = 2 ,
1588+ job_polling_interval_seconds = 1 ,
1589+ )
1590+ assert False , "Expected TimeoutError"
1591+ except TimeoutError as e :
1592+ assert "timed out after 2 seconds" in str (e )
1593+
1594+
1595+ def test_run_job_with_managed_cluster_no_metadata_name (mocker ):
1596+ """Test RayJob creation with missing metadata name."""
1597+ from codeflare_sdk .ray .job .job import RayJobSpec
1598+
1599+ # Mock dependencies
1600+ mocker .patch ("kubernetes.client.ApisApi.get_api_versions" )
1601+ mocker .patch ("kubernetes.config.load_kube_config" , return_value = "ignore" )
1602+ mocker .patch ("codeflare_sdk.ray.cluster.cluster.config_check" )
1603+
1604+ mock_api_client = mocker .Mock ()
1605+ mocker .patch (
1606+ "codeflare_sdk.common.kubernetes_cluster.auth.get_api_client" ,
1607+ return_value = mock_api_client ,
1608+ )
1609+
1610+ mock_co_api = mocker .Mock ()
1611+ mocker .patch ("kubernetes.client.CustomObjectsApi" , return_value = mock_co_api )
1612+
1613+ # Mock Cluster creation
1614+ mock_cluster_resource = {
1615+ "apiVersion" : "ray.io/v1" ,
1616+ "kind" : "RayCluster" ,
1617+ "spec" : {},
1618+ }
1619+ mocker .patch (
1620+ "codeflare_sdk.ray.cluster.cluster.Cluster.__init__" , return_value = None
1621+ )
1622+ mock_cluster_instance = mocker .Mock ()
1623+ mock_cluster_instance .resource_yaml = mock_cluster_resource
1624+ mocker .patch (
1625+ "codeflare_sdk.ray.cluster.cluster.Cluster" , return_value = mock_cluster_instance
1626+ )
1627+
1628+ # Mock RayJob creation with missing name in metadata - method generates its own name
1629+ mock_co_api .create_namespaced_custom_object .return_value = {"metadata" : {}}
1630+
1631+ # Mock status API response even for no-wait case
1632+ from kubernetes .client .rest import ApiException
1633+ mock_co_api .get_namespaced_custom_object_status .side_effect = ApiException (
1634+ status = 404 , reason = "Not Found"
1635+ )
1636+
1637+ cluster_config = ClusterConfiguration (name = "test-cluster" , namespace = "test-ns" )
1638+ job_config = RayJobSpec (entrypoint = "python script.py" )
1639+
1640+ # Should still work and generate a job name
1641+ result = Cluster .run_job_with_managed_cluster (
1642+ cluster_config = cluster_config ,
1643+ job_config = job_config ,
1644+ wait_for_completion = False ,
1645+ )
1646+
1647+ # Should have a generated job name (starts with 'rayjob-')
1648+ assert "job_cr_name" in result
1649+ assert result ["job_cr_name" ].startswith ("rayjob-" )
1650+ assert result ["job_status" ] == "SUBMITTED_NOT_FOUND"
1651+
1652+
1653+ def test_run_job_with_managed_cluster_default_namespace (mocker ):
1654+ """Test RayJob with default namespace."""
1655+ from codeflare_sdk .ray .job .job import RayJobSpec
1656+
1657+ # Mock dependencies
1658+ mocker .patch ("kubernetes.client.ApisApi.get_api_versions" )
1659+ mocker .patch ("kubernetes.config.load_kube_config" , return_value = "ignore" )
1660+ mocker .patch ("codeflare_sdk.ray.cluster.cluster.config_check" )
1661+
1662+ mock_api_client = mocker .Mock ()
1663+ mocker .patch (
1664+ "codeflare_sdk.common.kubernetes_cluster.auth.get_api_client" ,
1665+ return_value = mock_api_client ,
1666+ )
1667+
1668+ mock_co_api = mocker .Mock ()
1669+ mocker .patch ("kubernetes.client.CustomObjectsApi" , return_value = mock_co_api )
1670+
1671+ # Mock Cluster creation
1672+ mock_cluster_resource = {
1673+ "apiVersion" : "ray.io/v1" ,
1674+ "kind" : "RayCluster" ,
1675+ "spec" : {},
1676+ }
1677+ mocker .patch (
1678+ "codeflare_sdk.ray.cluster.cluster.Cluster.__init__" , return_value = None
1679+ )
1680+ mock_cluster_instance = mocker .Mock ()
1681+ mock_cluster_instance .resource_yaml = mock_cluster_resource
1682+ mocker .patch (
1683+ "codeflare_sdk.ray.cluster.cluster.Cluster" , return_value = mock_cluster_instance
1684+ )
1685+
1686+ # Mock RayJob creation
1687+ mock_co_api .create_namespaced_custom_object .return_value = {
1688+ "metadata" : {"name" : "test-default-ns" }
1689+ }
1690+
1691+ # Mock status API response even for no-wait case
1692+ from kubernetes .client .rest import ApiException
1693+ mock_co_api .get_namespaced_custom_object_status .side_effect = ApiException (
1694+ status = 404 , reason = "Not Found"
1695+ )
1696+
1697+ # Test with ClusterConfiguration that has namespace=None (should use default)
1698+ cluster_config = ClusterConfiguration (name = "test-cluster" , namespace = None )
1699+ job_config = RayJobSpec (entrypoint = "python script.py" )
1700+
1701+ result = Cluster .run_job_with_managed_cluster (
1702+ cluster_config = cluster_config ,
1703+ job_config = job_config ,
1704+ wait_for_completion = False ,
1705+ )
1706+
1707+ # Method generates its own job name regardless of API response
1708+ assert "job_cr_name" in result
1709+ assert result ["job_cr_name" ].startswith ("rayjob-" )
1710+ assert result ["job_status" ] == "SUBMITTED_NOT_FOUND"
1711+
1712+
1713+ def test_run_job_with_managed_cluster_job_spec_with_runtime_env (mocker ):
1714+ """Test RayJob with runtime environment in job spec."""
1715+ from codeflare_sdk .ray .job .job import RayJobSpec
1716+
1717+ # Mock dependencies
1718+ mocker .patch ("kubernetes.client.ApisApi.get_api_versions" )
1719+ mocker .patch ("kubernetes.config.load_kube_config" , return_value = "ignore" )
1720+ mocker .patch ("codeflare_sdk.ray.cluster.cluster.config_check" )
1721+
1722+ mock_api_client = mocker .Mock ()
1723+ mocker .patch (
1724+ "codeflare_sdk.common.kubernetes_cluster.auth.get_api_client" ,
1725+ return_value = mock_api_client ,
1726+ )
1727+
1728+ mock_co_api = mocker .Mock ()
1729+ mocker .patch ("kubernetes.client.CustomObjectsApi" , return_value = mock_co_api )
1730+
1731+ # Mock Cluster creation
1732+ mock_cluster_resource = {
1733+ "apiVersion" : "ray.io/v1" ,
1734+ "kind" : "RayCluster" ,
1735+ "spec" : {},
1736+ }
1737+ mocker .patch (
1738+ "codeflare_sdk.ray.cluster.cluster.Cluster.__init__" , return_value = None
1739+ )
1740+ mock_cluster_instance = mocker .Mock ()
1741+ mock_cluster_instance .resource_yaml = mock_cluster_resource
1742+ mocker .patch (
1743+ "codeflare_sdk.ray.cluster.cluster.Cluster" , return_value = mock_cluster_instance
1744+ )
1745+
1746+ # Mock RayJob creation
1747+ mock_co_api .create_namespaced_custom_object .return_value = {
1748+ "metadata" : {"name" : "test-runtime-env" }
1749+ }
1750+
1751+ # Mock status API response even for no-wait case
1752+ from kubernetes .client .rest import ApiException
1753+ mock_co_api .get_namespaced_custom_object_status .side_effect = ApiException (
1754+ status = 404 , reason = "Not Found"
1755+ )
1756+
1757+ cluster_config = ClusterConfiguration (name = "test-cluster" , namespace = "test-ns" )
1758+
1759+ # Create job spec with runtime environment
1760+ job_config = RayJobSpec (
1761+ entrypoint = "python script.py" ,
1762+ runtime_env = {"pip" : ["numpy" , "pandas" ]},
1763+ metadata = {"job_timeout_s" : 3600 },
1764+ )
1765+
1766+ result = Cluster .run_job_with_managed_cluster (
1767+ cluster_config = cluster_config ,
1768+ job_config = job_config ,
1769+ wait_for_completion = False ,
1770+ )
1771+
1772+ # Method generates its own job name regardless of API response
1773+ assert "job_cr_name" in result
1774+ assert result ["job_cr_name" ].startswith ("rayjob-" )
1775+ assert result ["job_status" ] == "SUBMITTED_NOT_FOUND"
1776+
1777+ # Verify the CustomObjectsApi was called with proper parameters
1778+ mock_co_api .create_namespaced_custom_object .assert_called_once ()
1779+
1780+
14651781# Make sure to always keep this function last
14661782def test_cleanup ():
14671783 os .remove (f"{ aw_dir } test-all-params.yaml" )
0 commit comments