Skip to content

Commit 801fac0

Browse files
yonromai and claude authored
test(iris): local K8s e2e with auto-provisioned kind cluster (#3097)
## Summary - Add `k8s_cluster` (session-scoped) and `k8s_runtime` (per-test) fixtures to `conftest.py` — reusable by any e2e test - `k8s_cluster` auto-creates a kind cluster if no K8s cluster is reachable, tears it down after the session - `k8s_runtime` creates an ephemeral namespace per test for isolation - Add `test_kubernetes_runtime_lifecycle` — validates the full `KubernetesRuntime` lifecycle (create pod, run, succeed, read logs) against real K8s - Marked `slow` + `e2e` so default CI is unaffected - Update `TESTING.md` with the run command ### Motivation Relates to #2944, #3062. The CW canary ferry is unreliable for validating K8s runtime behavior (RBAC issues, Connection refused — see #3091). This test validates `KubernetesRuntime` against any local cluster in seconds. ### How to use ```bash # Just run it — kind cluster is auto-created if needed (requires: brew install kind) uv run pytest lib/iris/tests/e2e/test_coreweave_live_kubernetes_runtime.py \ -m slow -k kubernetes -v -o "addopts=" # Or bring your own cluster (kind, k3d, minikube, remote) kind create cluster --name iris-test uv run pytest lib/iris/tests/e2e/test_coreweave_live_kubernetes_runtime.py \ -m slow -k kubernetes -v -o "addopts=" ``` ## Test plan - [x] Test passes with auto-created kind cluster (~49s including cluster create/teardown) - [x] Test passes with pre-existing kind cluster (~14s) - [x] Default `uv run pytest` skips the test (marked `slow`) - [x] Fixture auto-skips when no cluster reachable and kind not installed - [x] Pre-commit passes 🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 80c97eb commit 801fac0

File tree

3 files changed

+111
-5
lines changed

3 files changed

+111
-5
lines changed

lib/iris/TESTING.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,4 +161,8 @@ IRIS_SCREENSHOT_DIR=/tmp/shots uv run pytest lib/iris/tests/e2e/test_dashboard.p
161161

162162
# When modifying the dashboard
163163
uv run pytest lib/iris/tests/e2e/test_dashboard.py -x -o "addopts="
164+
165+
# K8s runtime tests (requires a running cluster — kind, k3d, minikube, etc.)
166+
uv run pytest lib/iris/tests/e2e/test_coreweave_live_kubernetes_runtime.py \
167+
-m slow -k lifecycle -v
164168
```

lib/iris/tests/e2e/conftest.py

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,16 @@
1313
"""
1414

1515
import os
16+
import shutil
17+
import subprocess
1618
import time
19+
import uuid
1720
from dataclasses import dataclass
1821
from pathlib import Path
1922

2023
import pytest
2124
from iris.chaos import reset_chaos
25+
from iris.cluster.runtime.kubernetes import KubernetesRuntime
2226
from iris.client.client import IrisClient, Job
2327
from iris.cluster.config import load_config, make_local_config
2428
from iris.cluster.manager import connect_cluster
@@ -420,3 +424,83 @@ def capture(label: str) -> Path:
420424
return path
421425

422426
return capture
427+
428+
429+
# ---------------------------------------------------------------------------
430+
# Kubernetes fixtures (for tests against real K8s: kind, k3d, minikube, etc.)
431+
# ---------------------------------------------------------------------------
432+
433+
# Name of the kind cluster auto-created by the session-scoped `k8s_cluster`
# fixture when no K8s cluster is already reachable; deleted at session end.
KIND_CLUSTER_NAME = "iris-test"
434+
435+
436+
def _cluster_reachable() -> bool:
437+
try:
438+
result = subprocess.run(["kubectl", "cluster-info"], capture_output=True, timeout=10)
439+
return result.returncode == 0
440+
except (subprocess.TimeoutExpired, FileNotFoundError):
441+
return False
442+
443+
444+
@pytest.fixture(scope="session")
def k8s_cluster():
    """Session-scoped fixture guaranteeing a reachable K8s cluster.

    Prefers a cluster that is already reachable (kind, k3d, minikube,
    remote, ...). Otherwise provisions a throwaway kind cluster and deletes
    it when the session ends. Skips when kubectl is missing, or when no
    cluster is reachable and kind is not installed.
    """
    if shutil.which("kubectl") is None:
        pytest.skip("kubectl not in PATH (install: brew install kubectl)")

    if _cluster_reachable():
        # Existing cluster — use it as-is, nothing to tear down.
        yield
        return

    if shutil.which("kind") is None:
        pytest.skip("no reachable K8s cluster and kind not in PATH (install: brew install kind)")

    create_cmd = ["kind", "create", "cluster", "--name", KIND_CLUSTER_NAME]
    delete_cmd = ["kind", "delete", "cluster", "--name", KIND_CLUSTER_NAME]
    # Surface kind's own progress output to the operator; fail fast on error.
    subprocess.run(create_cmd, check=True, timeout=120)
    try:
        yield
    finally:
        # Best-effort teardown: output suppressed, non-zero exit ignored.
        subprocess.run(delete_cmd, capture_output=True, timeout=60)
474+
475+
476+
@pytest.fixture
def k8s_runtime(k8s_cluster):
    """Yield a KubernetesRuntime scoped to a fresh ephemeral namespace.

    Creates a uniquely named namespace for per-test isolation, waits up to
    30s for its default ServiceAccount to be provisioned, and deletes the
    namespace after the test — including on the skip path where the
    ServiceAccount never becomes ready (the previous implementation leaked
    the namespace there, because `pytest.skip` raised before the
    try/finally was entered).
    """
    namespace = f"iris-test-{uuid.uuid4().hex[:8]}"
    subprocess.run(
        ["kubectl", "create", "namespace", namespace],
        check=True,
        capture_output=True,
    )

    def _delete_namespace() -> None:
        # Best-effort teardown; --ignore-not-found makes this idempotent.
        subprocess.run(
            ["kubectl", "delete", "namespace", namespace, "--ignore-not-found"],
            capture_output=True,
        )

    # Wait for K8s to provision the default ServiceAccount in the new namespace.
    # Without this, pod creation fails with "serviceaccount default not found".
    deadline = time.monotonic() + 30
    while time.monotonic() < deadline:
        result = subprocess.run(
            ["kubectl", "-n", namespace, "get", "serviceaccount", "default"],
            capture_output=True,
        )
        if result.returncode == 0:
            break
        time.sleep(0.5)
    else:
        # Bailing out before the try/finally below — clean up the namespace
        # we just created so a skipped test doesn't leak it.
        _delete_namespace()
        pytest.skip(f"default ServiceAccount not ready in namespace {namespace} after 30s")

    runtime = KubernetesRuntime(namespace=namespace)
    try:
        yield runtime
    finally:
        runtime.cleanup()
        _delete_namespace()

lib/iris/tests/e2e/test_coreweave_live_kubernetes_runtime.py

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
import io
99
import os
1010
import posixpath
11-
import shutil
1211
import time
1312
import uuid
1413
from contextlib import contextmanager
@@ -162,7 +161,29 @@ def _coreweave_upload_env(config) -> object:
162161
fsspec.config.conf.pop("s3", None)
163162

164163

165-
@pytest.mark.skipif(shutil.which("kubectl") is None, reason="kubectl is not available")
164+
@pytest.mark.timeout(120)
def test_kubernetes_runtime_lifecycle(k8s_runtime: KubernetesRuntime):
    """End-to-end KubernetesRuntime check: create pod, run, succeed, read logs."""
    suffix = uuid.uuid4().hex[:8]
    resources = cluster_pb2.ResourceSpecProto(cpu_millicores=100, memory_bytes=64 * 1024**2)
    container = ContainerConfig(
        image="python:3.11-slim",
        entrypoint=_entrypoint(["bash", "-c", "echo lifecycle-test-ok && sleep 2"]),
        env={},
        workdir="/app",
        task_id=f"lifecycle-{suffix}",
        resources=resources,
    )

    pod = k8s_runtime.create_container(container)
    pod.run()

    # The pod should reach SUCCEEDED well within the 60s polling budget.
    final_state = _wait_finished(pod, timeout_seconds=60)
    assert final_state == cluster_pb2.TASK_STATE_SUCCEEDED

    # The marker echoed by the entrypoint must appear in the captured logs.
    log_lines = pod.log_reader().read_all()
    assert any("lifecycle-test-ok" in entry.data for entry in log_lines)
185+
186+
166187
@pytest.mark.timeout(1800)
167188
def test_coreweave_kubernetes_runtime_cpu_job_live(coreweave_runtime: KubernetesRuntime):
168189
"""CPU pod should extract bundle and complete successfully via KubernetesRuntime."""
@@ -225,7 +246,6 @@ def test_coreweave_kubernetes_runtime_cpu_job_live(coreweave_runtime: Kubernetes
225246
pass
226247

227248

228-
@pytest.mark.skipif(shutil.which("kubectl") is None, reason="kubectl is not available")
229249
@pytest.mark.timeout(1800)
230250
def test_incremental_log_reader_no_duplicates(coreweave_runtime: KubernetesRuntime):
231251
"""Incremental log reads via byte-offset cursor must not produce duplicate lines.
@@ -286,7 +306,6 @@ def test_incremental_log_reader_no_duplicates(coreweave_runtime: KubernetesRunti
286306
assert numbered == expected
287307

288308

289-
@pytest.mark.skipif(shutil.which("kubectl") is None, reason="kubectl is not available")
290309
@pytest.mark.timeout(3600)
291310
def test_coreweave_kubernetes_runtime_gpu_job_live(coreweave_runtime: KubernetesRuntime):
292311
"""GPU pod should request GPU and prove device access via nvidia-smi."""
@@ -327,7 +346,6 @@ def test_coreweave_kubernetes_runtime_gpu_job_live(coreweave_runtime: Kubernetes
327346
assert gpu_state == cluster_pb2.TASK_STATE_SUCCEEDED, f"gpu pod failed logs={gpu_logs}"
328347

329348

330-
@pytest.mark.skipif(shutil.which("kubectl") is None, reason="kubectl is not available")
331349
@pytest.mark.timeout(600)
332350
def test_tensorstore_s3_roundtrip():
333351
"""Verify tensorstore can write and read zarr3 data via S3-compatible storage.

0 commit comments

Comments
 (0)