32 changes: 28 additions & 4 deletions agentlightning/runner/agent.py
@@ -33,6 +33,7 @@
from agentlightning.litagent import LitAgent
from agentlightning.reward import emit_reward, find_final_reward
from agentlightning.store.base import LightningStore
from agentlightning.store.client_server import ServerShutdownError
from agentlightning.tracer.agentops import AgentOpsTracer
from agentlightning.tracer.base import Tracer
from agentlightning.types import (
@@ -289,7 +290,14 @@ async def _post_process_rollout_result(
# This will NOT emit another span to the tracer
reward_span = emit_reward(raw_result, propagate=False)
# We add it to the store manually
await store.add_otel_span(rollout.rollout_id, rollout.attempt.attempt_id, reward_span)
try:
await store.add_otel_span(rollout.rollout_id, rollout.attempt.attempt_id, reward_span)
except ServerShutdownError:
# Server is shutting down - handle gracefully without traceback
logger.debug(
Contributor comment: I think this isn't the only place with such an issue. We'd better handle it in the store client. (A sketch of this idea follows this file's diff.)
f"{self._log_prefix(rollout.rollout_id)} Server is shutting down. "
"Skipping add_otel_span for reward span."
)
trace_spans.append(reward_span)

if isinstance(raw_result, list):
@@ -304,9 +312,16 @@
self._tracer, AgentOpsTracer
): # TODO: this should be replaced with general OpenTelemetry tracer in next version
for span in raw_result:
await store.add_otel_span(
rollout.rollout_id, rollout.attempt.attempt_id, cast(ReadableSpan, span)
)
try:
await store.add_otel_span(
rollout.rollout_id, rollout.attempt.attempt_id, cast(ReadableSpan, span)
)
except ServerShutdownError:
# Server is shutting down - handle gracefully without traceback
logger.debug(
f"{self._log_prefix(rollout.rollout_id)} Server is shutting down. "
f"Skipping add_otel_span for span: {span.name}"
)
else:
logger.warning(
f"{self._log_prefix(rollout.rollout_id)} Tracer is already an OpenTelemetry tracer. "
@@ -528,6 +543,9 @@ async def _step_impl(self, next_rollout: AttemptedRollout, raise_on_exception: b
await store.update_attempt(rollout_id, next_rollout.attempt.attempt_id, status="failed")
else:
await store.update_attempt(rollout_id, next_rollout.attempt.attempt_id, status="succeeded")
except ServerShutdownError:
# Server is shutting down - handle gracefully without traceback
logger.debug(f"{self._log_prefix(rollout_id)} Server is shutting down. " "Skipping update_attempt.")
except Exception:
logger.exception(
f"{self._log_prefix(rollout_id)} Exception during update_attempt. Giving up the update."
@@ -582,6 +600,12 @@ async def iter(self, *, event: Optional[ExecutionEvent] = None) -> None:
await store.update_attempt(
next_rollout.rollout_id, next_rollout.attempt.attempt_id, worker_id=self.get_worker_id()
)
except ServerShutdownError:
# Server is shutting down - handle gracefully without traceback
logger.debug(
f"{self._log_prefix()} Server is shutting down. " "Skipping update_attempt for rollout claim."
)
continue
except Exception:
# This exception could happen if the rollout is dequeued and the other end died for some reason
logger.exception(f"{self._log_prefix()} Exception during update_attempt, giving up the rollout.")
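To illustrate the reviewer's suggestion above (handling the shutdown case once in the store client instead of wrapping every add_otel_span and update_attempt call site in the runner), here is a minimal sketch. The decorator name suppress_on_shutdown and the simplified client stub are hypothetical and not part of this PR.

# Hypothetical sketch: centralize ServerShutdownError handling in the store client,
# so call sites in the runner would not need their own try/except blocks.
import functools
import logging
from typing import Any, Awaitable, Callable, Optional, TypeVar

logger = logging.getLogger(__name__)

T = TypeVar("T")


class ServerShutdownError(Exception):
    """Raised when the server is shutting down and requests cannot be completed."""


def suppress_on_shutdown(func: Callable[..., Awaitable[T]]) -> Callable[..., Awaitable[Optional[T]]]:
    """Swallow ServerShutdownError, log at debug level, and return None instead of raising."""

    @functools.wraps(func)
    async def wrapper(*args: Any, **kwargs: Any) -> Optional[T]:
        try:
            return await func(*args, **kwargs)
        except ServerShutdownError:
            logger.debug("Server is shutting down. Skipping %s.", func.__name__)
            return None

    return wrapper


class StoreClientSketch:
    """Illustrative stand-in for the HTTP store client; not the real class."""

    @suppress_on_shutdown
    async def add_otel_span(self, rollout_id: str, attempt_id: str, span: Any) -> None:
        # Would issue the HTTP request (e.g. via _request_json), which may raise
        # ServerShutdownError once the server fails its health checks.
        ...

With this approach the runner could call store.add_otel_span(...) unconditionally, and the shutdown case would be absorbed inside the client.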
30 changes: 28 additions & 2 deletions agentlightning/store/client_server.py
@@ -75,6 +75,15 @@
T_model = TypeVar("T_model", bound=BaseModel)


class ServerShutdownError(Exception):
"""Raised when the server is shutting down and requests cannot be completed.

This exception is raised instead of ServerDisconnectedError when we detect
that the server is permanently unavailable (e.g., during graceful shutdown).
Callers should handle this gracefully without dumping full tracebacks.
"""


class RolloutRequest(BaseModel):
input: TaskInput
mode: Optional[Literal["train", "val", "test"]] = None
@@ -1238,6 +1247,9 @@ def __init__(
self._dequeue_was_successful: bool = False
self._dequeue_first_unsuccessful: bool = True

# Track server shutdown state to handle errors gracefully
self._server_shutting_down: bool = False
Contributor comment: How about having a _server_online shared flag so we can sunset hacky flags like _dequeue_first_unsuccessful? (A sketch of this idea follows this file's diff.)

@property
def capabilities(self) -> LightningStoreCapabilities:
"""Return the capabilities of the store."""
@@ -1287,6 +1299,7 @@ def __setstate__(self, state: Dict[str, Any]):
self._connection_timeout = state["_connection_timeout"]
self._dequeue_was_successful = False
self._dequeue_first_unsuccessful = True
self._server_shutting_down = False

async def _get_session(self) -> aiohttp.ClientSession:
# In the proxy process, FastAPI middleware calls
@@ -1324,6 +1337,7 @@ async def _wait_until_healthy(self, session: aiohttp.ClientSession) -> bool:
"""
Probe the server's /health until it responds 200 or retries are exhausted.
Returns True if healthy, False otherwise.
When this returns False, it indicates the server is shutting down or permanently unavailable.
"""
if not self._health_retry_delays:
client_logger.info("No health retry delays configured; skipping health checks.")
@@ -1342,9 +1356,12 @@
client_logger.warning(f"Server is not healthy yet. Retrying in {delay} seconds.")
if delay > 0.0:
await asyncio.sleep(delay)
client_logger.error(
f"Server is not healthy at {self.server_address}/health after {len(self._health_retry_delays)} retry attempts"
client_logger.warning(
f"Server is not healthy at {self.server_address}/health after {len(self._health_retry_delays)} retry attempts. "
"Server appears to be shutting down."
)
# Mark server as shutting down to handle subsequent errors gracefully
self._server_shutting_down = True
return False

async def _request_json(
Expand Down Expand Up @@ -1405,6 +1422,15 @@ async def _request_json(
last_exc = net_exc
client_logger.info(f"Network/session issue will be retried. Retrying the request {method}: {path}")
if not await self._wait_until_healthy(session):
# Server is shutting down - handle ServerDisconnectedError gracefully
if isinstance(net_exc, aiohttp.ServerDisconnectedError) and self._server_shutting_down:
client_logger.debug(
f"Server is shutting down. Suppressing ServerDisconnectedError for {method}: {path}"
)
# Raise a specific exception that callers can catch and handle gracefully
raise ServerShutdownError(
f"Server is shutting down. Request {method}: {path} cannot be completed."
) from net_exc
break # server is not healthy, do not retry

# exhausted retries
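The second reviewer comment suggests replacing the ad-hoc flags (_server_shutting_down, _dequeue_first_unsuccessful) with a single shared _server_online state. A minimal sketch of that idea follows; the class and method names (ServerStatus, mark_online, mark_offline) are illustrative only and not taken from the PR.

# Hypothetical sketch: one _server_online flag owned by the client, flipped by the
# health-check path and consulted wherever the code currently checks
# _server_shutting_down or _dequeue_first_unsuccessful.
import logging

client_logger = logging.getLogger(__name__)


class ServerStatus:
    """Tracks whether the store server is believed to be reachable."""

    def __init__(self) -> None:
        self._server_online: bool = True

    @property
    def online(self) -> bool:
        return self._server_online

    def mark_online(self) -> None:
        # Called after any successful request or a 200 response from /health.
        if not self._server_online:
            client_logger.info("Server is reachable again.")
        self._server_online = True

    def mark_offline(self) -> None:
        # Called when the health retry delays are exhausted; subsequent errors can
        # then be logged at debug level instead of raised with a full traceback.
        if self._server_online:
            client_logger.warning("Server appears to be shutting down or offline.")
        self._server_online = False

Under this scheme, _wait_until_healthy would call mark_offline() when it exhausts its retries, and _request_json could consult status.online to decide between raising ServerShutdownError and retrying.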