fix(iris): auto-capture controller pod diagnostics on connection failure (#3107)

yonromai · claude · web-flow · commit 80c97ebfc577 · 2026-02-27T12:28:30.000-08:00
## Summary

- When `iris job run` fails with a connection error, automatically log controller pod status (phase, termination reason, restart count) and previous container logs
- Helps diagnose controller OOM/crash-loop failures without manual `kubectl` triage

<details>
<summary>Context</summary>

Extracted from #3090. During K8s pod-creation storms the controller would OOM and crash, but the user only saw "connection refused." This surfaces the actual crash reason inline.

Related: #3102 (controller RBAC/scheduling), #3103 (worker kubectl saturation)

</details>

## Test plan

- [x] Pre-commit clean
- [ ] CI

🤖 Generated with [Claude Code](https://claude.com/claude-code)

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/lib/iris/src/iris/cli/job.py b/lib/iris/src/iris/cli/job.py
@@ -459,26 +459,35 @@ def run(
 
     env_vars_dict = load_env_vars(env_vars)
 
-    exit_code = run_iris_job(
-        command=command,
-        env_vars=env_vars_dict,
-        controller_url=controller_url,
-        tpu=tpu,
-        gpu=gpu,
-        cpu=cpu,
-        memory=memory,
-        disk=disk,
-        wait=not no_wait,
-        job_name=job_name,
-        replicas=replicas,
-        max_retries=max_retries,
-        timeout=timeout,
-        extras=list(extra),
-        include_children_logs=include_children_logs,
-        terminate_on_exit=terminate_on_exit,
-        regions=region or None,
-        zone=zone,
-    )
+    try:
+        exit_code = run_iris_job(
+            command=command,
+            env_vars=env_vars_dict,
+            controller_url=controller_url,
+            tpu=tpu,
+            gpu=gpu,
+            cpu=cpu,
+            memory=memory,
+            disk=disk,
+            wait=not no_wait,
+            job_name=job_name,
+            replicas=replicas,
+            max_retries=max_retries,
+            timeout=timeout,
+            extras=list(extra),
+            include_children_logs=include_children_logs,
+            terminate_on_exit=terminate_on_exit,
+            regions=region or None,
+            zone=zone,
+        )
+    except Exception:
+        platform = ctx.obj.get("platform")
+        if platform is not None:
+            try:
+                platform.debug_report()
+            except Exception:
+                logger.debug("Controller post-mortem failed", exc_info=True)
+        raise
     sys.exit(exit_code)
 
 
diff --git a/lib/iris/src/iris/cli/main.py b/lib/iris/src/iris/cli/main.py
@@ -65,6 +65,7 @@ def require_controller_url(ctx: click.Context) -> str:
 
         iris_config = IrisConfig(config)
         platform = iris_config.platform()
+        ctx.obj["platform"] = platform
 
         if iris_config.proto.controller.WhichOneof("controller") == "local":
             from iris.cluster.controller.local import LocalController
diff --git a/lib/iris/src/iris/cluster/platform/base.py b/lib/iris/src/iris/cluster/platform/base.py
@@ -381,6 +381,14 @@ def tunnel(
         """
         ...
 
+    def debug_report(self) -> None:
+        """Log diagnostic info about the controller after a failure.
+
+        Post-mortem hook: implementations report what they find through
+        logging and return nothing to the caller.
+
+        Override to inspect platform-specific state (e.g. pod termination
+        reason, previous container logs). Default is a no-op, so platforms
+        with nothing to report do not need to override it.
+        """
+        ...
+
     def shutdown(self) -> None:
         """Release platform-owned resources (threads, connections, caches).
 
diff --git a/lib/iris/src/iris/cluster/platform/coreweave.py b/lib/iris/src/iris/cluster/platform/coreweave.py
@@ -1270,6 +1270,36 @@ def _wait_for_deployment_ready(self) -> None:
     # a couple of attempts in case the first crash was transient.
     _CRASH_LOOP_MIN_RESTARTS = 2
 
+    def debug_report(self) -> None:
+        """Log controller pod status and previous container logs for post-mortem triage.
+
+        Lists pods labelled ``app=iris-controller`` and, for each pod, logs at
+        WARNING level:
+
+        - one line per container status with the pod phase and restart count,
+          plus the ``reason`` / ``exitCode`` of the last terminated instance
+          when one is recorded;
+        - a single phase-only line when the pod reports no container statuses,
+          so that pod still shows up in the report;
+        - the last 50 lines of the previous container's logs, when non-empty.
+
+        Fetching previous logs is best-effort per pod: an error for one pod is
+        logged at DEBUG level and the remaining pods are still reported. An
+        error while listing pods is not handled here and propagates to the
+        caller.
+        """
+        pods = self._kubectl.list_json("pods", labels={"app": "iris-controller"})
+        if not pods:
+            logger.warning("Post-mortem: no controller pods found")
+            return
+
+        for pod in pods:
+            # `or {}` / `or []` also cover keys that are present but null in the JSON.
+            name = (pod.get("metadata") or {}).get("name", "unknown")
+            status = pod.get("status") or {}
+            phase = status.get("phase", "Unknown")
+            container_statuses = status.get("containerStatuses") or []
+
+            if not container_statuses:
+                # With nothing to iterate below, this is the only place the
+                # pod's phase would be reported.
+                logger.warning("Post-mortem %s: phase=%s (no container statuses)", name, phase)
+
+            for cs in container_statuses:
+                restarts = cs.get("restartCount", 0)
+                terminated = (cs.get("lastState") or {}).get("terminated") or {}
+                if terminated:
+                    logger.warning(
+                        "Post-mortem %s: phase=%s reason=%s exitCode=%s restarts=%d",
+                        name,
+                        phase,
+                        terminated.get("reason"),
+                        terminated.get("exitCode"),
+                        restarts,
+                    )
+                else:
+                    logger.warning("Post-mortem %s: phase=%s restarts=%d", name, phase, restarts)
+
+            # Keep going if one pod's log fetch fails; the status lines above
+            # and the other pods' reports are still useful on their own.
+            try:
+                prev_logs = self._kubectl.logs(name, tail=50, previous=True)
+            except Exception:
+                logger.debug("Post-mortem %s: could not fetch previous logs", name, exc_info=True)
+                continue
+            if prev_logs:
+                logger.warning("Post-mortem %s previous logs:\n%s", name, prev_logs)
+
     def _check_controller_pods_health(self) -> None:
         """Check controller Pods for fatal conditions and fail fast.