Skip to content

Commit 85a2b9c

Browse files
authored
fix: Search for incluster config if no kubeconfig is given (#411)
* fix: Search for incluster config if no kubeconfig is given Signed-off-by: oliver könig <okoenig@nvidia.com> * fix Signed-off-by: oliver könig <okoenig@nvidia.com> * fix Signed-off-by: oliver könig <okoenig@nvidia.com> * fix Signed-off-by: oliver könig <okoenig@nvidia.com> * fix Signed-off-by: oliver könig <okoenig@nvidia.com> * format Signed-off-by: oliver könig <okoenig@nvidia.com> --------- Signed-off-by: oliver könig <okoenig@nvidia.com>
1 parent 4d4f041 commit 85a2b9c

File tree

3 files changed

+163
-3
lines changed

3 files changed

+163
-3
lines changed

nemo_run/run/ray/job.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,10 @@ def start(
9494
)
9595

9696
def stop(self, wait: bool = False) -> None:
97-
self.backend.stop(wait=wait) # type: ignore[attr-defined]
97+
if isinstance(self.backend, KubeRayJob):
98+
self.backend.stop() # type: ignore[attr-defined]
99+
else:
100+
self.backend.stop(wait=wait) # type: ignore[attr-defined]
98101

99102
def status(self, display: bool = True):
100103
return self.backend.status(display=display) # type: ignore[attr-defined]

nemo_run/run/ray/kuberay.py

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,20 @@ class KubeRayCluster:
6060
def __post_init__(self) -> None: # noqa: D401 – simple verb is fine
6161
"""Initialise Kubernetes API clients once the instance is created."""
6262
# Load local kube-config once; the function returns *None* so we don't store it.
63-
config.load_kube_config()
63+
try:
64+
config.load_kube_config()
65+
except Exception as kube_config_error:
66+
logger.error(
67+
"Error loading kube-config: %s, trying with incluster config", kube_config_error
68+
)
69+
try:
70+
config.load_incluster_config()
71+
except Exception as incluster_config_error:
72+
logger.error(
73+
"Error loading incluster config: %s, raising original error",
74+
incluster_config_error,
75+
)
76+
raise kube_config_error from incluster_config_error
6477

6578
# The dedicated clients are what we interact with throughout the class
6679
# – separating CoreV1 for pods/services from CustomObjects for CRDs.
@@ -732,7 +745,20 @@ class KubeRayJob:
732745
executor: KubeRayExecutor
733746

734747
def __post_init__(self):
735-
config.load_kube_config()
748+
try:
749+
config.load_kube_config()
750+
except Exception as kube_config_error:
751+
logger.error(
752+
"Error loading kube-config: %s, trying with incluster config", kube_config_error
753+
)
754+
try:
755+
config.load_incluster_config()
756+
except Exception as incluster_config_error:
757+
logger.error(
758+
"Error loading incluster config: %s, raising original error",
759+
incluster_config_error,
760+
)
761+
raise kube_config_error from incluster_config_error
736762

737763
# Lazily create K8s API clients if not supplied
738764
self.api = client.CustomObjectsApi()

test/run/ray/test_kuberay.py

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2074,3 +2074,134 @@ def test_cluster_create_without_lifecycle_kwargs(self, mock_k8s_clients):
20742074
# Should create lifecycle_kwargs and succeed
20752075
assert hasattr(executor, "lifecycle_kwargs")
20762076
assert mock_api.create_namespaced_custom_object.called
2077+
2078+
2079+
class TestKubeConfigLoadingFallback:
2080+
"""Test kube config loading with fallback to incluster config."""
2081+
2082+
def test_kuberay_cluster_kube_config_success(self):
2083+
"""Test KubeRayCluster when kube config loads successfully."""
2084+
with patch("nemo_run.run.ray.kuberay.config.load_kube_config") as mock_load_kube:
2085+
with patch("nemo_run.run.ray.kuberay.config.load_incluster_config") as mock_incluster:
2086+
with patch("nemo_run.run.ray.kuberay.client.CustomObjectsApi"):
2087+
with patch("nemo_run.run.ray.kuberay.client.CoreV1Api"):
2088+
with patch("nemo_run.run.ray.kuberay.get_user", return_value="testuser"):
2089+
executor = KubeRayExecutor(namespace="test-namespace")
2090+
# Create cluster to trigger __post_init__ which loads config
2091+
_ = KubeRayCluster(name="test-cluster", executor=executor)
2092+
2093+
# Verify kube config was loaded and incluster was NOT called
2094+
assert mock_load_kube.call_count >= 1
2095+
# incluster should not be called when kube config succeeds
2096+
mock_incluster.assert_not_called()
2097+
2098+
def test_kuberay_cluster_fallback_to_incluster(self):
2099+
"""Test KubeRayCluster falls back to incluster config when kube config fails."""
2100+
kube_error = Exception("Kube config file not found")
2101+
2102+
with patch(
2103+
"nemo_run.run.ray.kuberay.config.load_kube_config", side_effect=kube_error
2104+
) as mock_load_kube:
2105+
with patch("nemo_run.run.ray.kuberay.config.load_incluster_config") as mock_incluster:
2106+
with patch("nemo_run.run.ray.kuberay.client.CustomObjectsApi"):
2107+
with patch("nemo_run.run.ray.kuberay.client.CoreV1Api"):
2108+
with patch("nemo_run.run.ray.kuberay.get_user", return_value="testuser"):
2109+
executor = KubeRayExecutor(namespace="test-namespace")
2110+
# Create cluster to trigger __post_init__ which loads config
2111+
_ = KubeRayCluster(name="test-cluster", executor=executor)
2112+
2113+
# Verify both were called
2114+
assert mock_load_kube.call_count >= 1
2115+
assert mock_incluster.call_count >= 1
2116+
2117+
def test_kuberay_cluster_both_configs_fail(self):
2118+
"""Test KubeRayCluster raises original error when both configs fail."""
2119+
kube_error = Exception("Kube config file not found")
2120+
incluster_error = Exception("Not running inside a cluster")
2121+
2122+
with patch("nemo_run.run.ray.kuberay.config.load_kube_config", side_effect=kube_error):
2123+
with patch(
2124+
"nemo_run.run.ray.kuberay.config.load_incluster_config",
2125+
side_effect=incluster_error,
2126+
):
2127+
with pytest.raises(Exception) as exc_info:
2128+
with patch("nemo_run.run.ray.kuberay.get_user", return_value="testuser"):
2129+
executor = KubeRayExecutor(namespace="test-namespace")
2130+
KubeRayCluster(name="test-cluster", executor=executor)
2131+
2132+
# Should raise the original kube config error (not the incluster error)
2133+
assert exc_info.value == kube_error
2134+
assert "Kube config file not found" in str(exc_info.value)
2135+
2136+
def test_kuberay_job_kube_config_success(self):
2137+
"""Test KubeRayJob when kube config loads successfully."""
2138+
with patch("nemo_run.run.ray.kuberay.config.load_kube_config") as mock_load_kube:
2139+
with patch("nemo_run.run.ray.kuberay.config.load_incluster_config") as mock_incluster:
2140+
with patch("nemo_run.run.ray.kuberay.client.CustomObjectsApi"):
2141+
with patch("nemo_run.run.ray.kuberay.client.CoreV1Api"):
2142+
with patch("nemo_run.run.ray.kuberay.get_user", return_value="testuser"):
2143+
executor = KubeRayExecutor(namespace="test-namespace")
2144+
# Create job to trigger __post_init__ which loads config
2145+
_ = KubeRayJob(name="test-job", executor=executor)
2146+
2147+
# Verify kube config was loaded
2148+
assert mock_load_kube.call_count >= 1
2149+
# incluster should not be called when kube config succeeds
2150+
mock_incluster.assert_not_called()
2151+
2152+
def test_kuberay_job_fallback_to_incluster(self):
2153+
"""Test KubeRayJob falls back to incluster config when kube config fails."""
2154+
kube_error = Exception("Kube config file not found")
2155+
2156+
with patch(
2157+
"nemo_run.run.ray.kuberay.config.load_kube_config", side_effect=kube_error
2158+
) as mock_load_kube:
2159+
with patch("nemo_run.run.ray.kuberay.config.load_incluster_config") as mock_incluster:
2160+
with patch("nemo_run.run.ray.kuberay.client.CustomObjectsApi"):
2161+
with patch("nemo_run.run.ray.kuberay.client.CoreV1Api"):
2162+
with patch("nemo_run.run.ray.kuberay.get_user", return_value="testuser"):
2163+
executor = KubeRayExecutor(namespace="test-namespace")
2164+
# Create job to trigger __post_init__ which loads config
2165+
_ = KubeRayJob(name="test-job", executor=executor)
2166+
2167+
# Verify both were called
2168+
assert mock_load_kube.call_count >= 1
2169+
assert mock_incluster.call_count >= 1
2170+
2171+
def test_kuberay_job_both_configs_fail(self):
2172+
"""Test KubeRayJob raises original error when both configs fail."""
2173+
kube_error = Exception("Kube config file not found")
2174+
incluster_error = Exception("Not running inside a cluster")
2175+
2176+
with patch("nemo_run.run.ray.kuberay.config.load_kube_config", side_effect=kube_error):
2177+
with patch(
2178+
"nemo_run.run.ray.kuberay.config.load_incluster_config",
2179+
side_effect=incluster_error,
2180+
):
2181+
with pytest.raises(Exception) as exc_info:
2182+
with patch("nemo_run.run.ray.kuberay.get_user", return_value="testuser"):
2183+
executor = KubeRayExecutor(namespace="test-namespace")
2184+
KubeRayJob(name="test-job", executor=executor)
2185+
2186+
# Should raise the original kube config error (not the incluster error)
2187+
assert exc_info.value == kube_error
2188+
assert "Kube config file not found" in str(exc_info.value)
2189+
2190+
def test_error_chaining_preserved(self):
2191+
"""Test that error chaining is preserved (raise X from Y)."""
2192+
kube_error = Exception("Kube config file not found")
2193+
incluster_error = Exception("Not running inside a cluster")
2194+
2195+
with patch("nemo_run.run.ray.kuberay.config.load_kube_config", side_effect=kube_error):
2196+
with patch(
2197+
"nemo_run.run.ray.kuberay.config.load_incluster_config",
2198+
side_effect=incluster_error,
2199+
):
2200+
with pytest.raises(Exception) as exc_info:
2201+
with patch("nemo_run.run.ray.kuberay.get_user", return_value="testuser"):
2202+
executor = KubeRayExecutor(namespace="test-namespace")
2203+
KubeRayJob(name="test-job", executor=executor)
2204+
2205+
# Verify error chaining (raise kube_error from incluster_error)
2206+
assert exc_info.value == kube_error
2207+
assert exc_info.value.__cause__ == incluster_error

0 commit comments

Comments
 (0)