Skip to content

Commit 80c97eb

Browse files
yonromai and claude authored
fix(iris): auto-capture controller pod diagnostics on connection failure (#3107)
## Summary

- When `iris job run` fails with a connection error, automatically log controller pod status (phase, termination reason, restart count) and previous container logs
- Helps diagnose controller OOM/crash-loop failures without manual `kubectl` triage

<details>
<summary>Context</summary>

Extracted from #3090. During K8s pod-creation storms the controller would OOM and crash, but the user only saw "connection refused." This surfaces the actual crash reason inline.

Related: #3102 (controller RBAC/scheduling), #3103 (worker kubectl saturation)

</details>

## Test plan

- [x] Pre-commit clean
- [ ] CI

🤖 Generated with [Claude Code](https://claude.com/claude-code)

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent d318908 commit 80c97eb

File tree

4 files changed

+68
-20
lines changed

4 files changed

+68
-20
lines changed

lib/iris/src/iris/cli/job.py

Lines changed: 29 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -459,26 +459,35 @@ def run(
459459

460460
env_vars_dict = load_env_vars(env_vars)
461461

462-
exit_code = run_iris_job(
463-
command=command,
464-
env_vars=env_vars_dict,
465-
controller_url=controller_url,
466-
tpu=tpu,
467-
gpu=gpu,
468-
cpu=cpu,
469-
memory=memory,
470-
disk=disk,
471-
wait=not no_wait,
472-
job_name=job_name,
473-
replicas=replicas,
474-
max_retries=max_retries,
475-
timeout=timeout,
476-
extras=list(extra),
477-
include_children_logs=include_children_logs,
478-
terminate_on_exit=terminate_on_exit,
479-
regions=region or None,
480-
zone=zone,
481-
)
462+
try:
463+
exit_code = run_iris_job(
464+
command=command,
465+
env_vars=env_vars_dict,
466+
controller_url=controller_url,
467+
tpu=tpu,
468+
gpu=gpu,
469+
cpu=cpu,
470+
memory=memory,
471+
disk=disk,
472+
wait=not no_wait,
473+
job_name=job_name,
474+
replicas=replicas,
475+
max_retries=max_retries,
476+
timeout=timeout,
477+
extras=list(extra),
478+
include_children_logs=include_children_logs,
479+
terminate_on_exit=terminate_on_exit,
480+
regions=region or None,
481+
zone=zone,
482+
)
483+
except Exception:
484+
platform = ctx.obj.get("platform")
485+
if platform is not None:
486+
try:
487+
platform.debug_report()
488+
except Exception:
489+
logger.debug("Controller post-mortem failed", exc_info=True)
490+
raise
482491
sys.exit(exit_code)
483492

484493

lib/iris/src/iris/cli/main.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ def require_controller_url(ctx: click.Context) -> str:
6565

6666
iris_config = IrisConfig(config)
6767
platform = iris_config.platform()
68+
ctx.obj["platform"] = platform
6869

6970
if iris_config.proto.controller.WhichOneof("controller") == "local":
7071
from iris.cluster.controller.local import LocalController

lib/iris/src/iris/cluster/platform/base.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -381,6 +381,14 @@ def tunnel(
381381
"""
382382
...
383383

384+
def debug_report(self) -> None:
    """Emit post-failure diagnostics about the controller.

    The base implementation does nothing. Platforms that can inspect
    their own state (for example a pod's termination reason or the
    previous container's logs) override this hook to log it.
    """
    return None
391+
384392
def shutdown(self) -> None:
385393
"""Release platform-owned resources (threads, connections, caches).
386394

lib/iris/src/iris/cluster/platform/coreweave.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1270,6 +1270,36 @@ def _wait_for_deployment_ready(self) -> None:
12701270
# a couple of attempts in case the first crash was transient.
12711271
_CRASH_LOOP_MIN_RESTARTS = 2
12721272

1273+
def debug_report(self) -> None:
    """Log controller pod termination reason and previous container logs.

    Lists pods labelled ``app=iris-controller`` and, for each pod, emits a
    WARNING line per container status with the pod phase and restart count,
    plus the termination reason and exit code when the container has a
    recorded last termination. Pods that report no container statuses get a
    single WARNING line with their phase so they are not silently skipped.
    Finally the last 50 lines of the previous container's logs are logged
    when any are returned.

    The per-pod log fetch is best-effort: if it raises, the error is logged
    at DEBUG and the remaining pods are still reported. Errors from the
    initial pod listing propagate to the caller.
    """
    pods = self._kubectl.list_json("pods", labels={"app": "iris-controller"})
    if not pods:
        logger.warning("Post-mortem: no controller pods found")
        return

    for pod in pods:
        # ``or {}`` / ``or []`` also cover keys that are present but null,
        # which a plain ``.get(key, default)`` would pass through as None.
        name = (pod.get("metadata") or {}).get("name", "unknown")
        status = pod.get("status") or {}
        phase = status.get("phase", "Unknown")
        container_statuses = status.get("containerStatuses") or []

        if not container_statuses:
            # Without this the pod would produce no status line at all.
            logger.warning("Post-mortem %s: phase=%s (no container statuses reported)", name, phase)

        for cs in container_statuses:
            restarts = cs.get("restartCount") or 0
            terminated = (cs.get("lastState") or {}).get("terminated")
            if terminated:
                logger.warning(
                    "Post-mortem %s: phase=%s reason=%s exitCode=%s restarts=%d",
                    name,
                    phase,
                    terminated.get("reason"),
                    terminated.get("exitCode"),
                    restarts,
                )
            else:
                logger.warning("Post-mortem %s: phase=%s restarts=%d", name, phase, restarts)

        try:
            prev_logs = self._kubectl.logs(name, tail=50, previous=True)
        except Exception:
            # NOTE(review): whether logs() raises or returns an empty value
            # when the pod has no previous container depends on the kubectl
            # wrapper - confirm. Either way, one pod must not abort the rest.
            logger.debug("Post-mortem %s: fetching previous logs failed", name, exc_info=True)
            continue

        if prev_logs:
            logger.warning("Post-mortem %s previous logs:\n%s", name, prev_logs)
1302+
12731303
def _check_controller_pods_health(self) -> None:
12741304
"""Check controller Pods for fatal conditions and fail fast.
12751305

0 commit comments

Comments
 (0)