Skip to content

Commit a9aa3f2

Browse files
feat(langchain_v1): Add retry_model_request middleware hook, add ModelFallbackMiddleware (#33275)
- retry_model_request hook lets a middleware decide to retry a failed model request, with full ability to modify as much or as little of the request before doing so - ModelFallbackMiddleware tries each fallback model in order, until one is successful, or fallback list is exhausted Co-authored-by: Sydney Runkle <[email protected]>
1 parent 20514f5 commit a9aa3f2

File tree

6 files changed

+498
-25
lines changed

6 files changed

+498
-25
lines changed

.claude/settings.local.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
"permissions": {
33
"allow": [
44
"Bash(uv run:*)",
5-
"Bash(make:*)"
5+
"Bash(make:*)",
66
"WebSearch",
77
"WebFetch(domain:ai.pydantic.dev)",
88
"WebFetch(domain:openai.github.io)",
@@ -12,4 +12,4 @@
1212
"deny": [],
1313
"ask": []
1414
}
15-
}
15+
}

libs/langchain_v1/langchain/agents/middleware/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Middleware plugins for agents."""
22

33
from .human_in_the_loop import HumanInTheLoopMiddleware
4+
from .model_fallback import ModelFallbackMiddleware
45
from .pii import PIIDetectionError, PIIMiddleware
56
from .planning import PlanningMiddleware
67
from .prompt_caching import AnthropicPromptCachingMiddleware
@@ -25,6 +26,7 @@
2526
"AnthropicPromptCachingMiddleware",
2627
"HumanInTheLoopMiddleware",
2728
"LLMToolSelectorMiddleware",
29+
"ModelFallbackMiddleware",
2830
"ModelRequest",
2931
"PIIDetectionError",
3032
"PIIMiddleware",
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
"""Model fallback middleware for agents."""

from __future__ import annotations

from typing import TYPE_CHECKING

from langchain.agents.middleware.types import AgentMiddleware, AgentState, ModelRequest
from langchain.chat_models import init_chat_model

if TYPE_CHECKING:
    from langchain_core.language_models.chat_models import BaseChatModel
    from langgraph.runtime import Runtime


class ModelFallbackMiddleware(AgentMiddleware):
    """Middleware that provides automatic model fallback on errors.

    This middleware attempts to retry failed model calls with alternative models
    in sequence. When a model call fails, it tries the next model in the fallback
    list until either a call succeeds or all models have been exhausted.

    Example:
        ```python
        from langchain.agents.middleware.model_fallback import ModelFallbackMiddleware
        from langchain.agents import create_agent

        # Create middleware with fallback models (not including primary)
        fallback = ModelFallbackMiddleware(
            "openai:gpt-4o-mini",  # First fallback
            "anthropic:claude-3-5-sonnet-20241022",  # Second fallback
        )

        agent = create_agent(
            model="openai:gpt-4o",  # Primary model
            middleware=[fallback],
        )

        # If gpt-4o fails, automatically tries gpt-4o-mini, then claude
        result = await agent.ainvoke({"messages": [HumanMessage("Hello")]})
        ```
    """

    def __init__(
        self,
        first_model: str | BaseChatModel,
        *additional_models: str | BaseChatModel,
    ) -> None:
        """Initialize the model fallback middleware.

        Args:
            first_model: The first fallback model to try when the primary model fails.
                Can be a model name string or BaseChatModel instance.
            *additional_models: Additional fallback models to try, in order.
                Can be model name strings or BaseChatModel instances.
        """
        super().__init__()

        # Coerce model-name strings into chat model instances eagerly, so a bad
        # model name fails at construction time rather than mid-retry.
        self.models: list[BaseChatModel] = [
            init_chat_model(model) if isinstance(model, str) else model
            for model in (first_model, *additional_models)
        ]

    def retry_model_request(
        self,
        error: Exception,  # noqa: ARG002
        request: ModelRequest,
        state: AgentState,  # noqa: ARG002
        runtime: Runtime,  # noqa: ARG002
        attempt: int,
    ) -> ModelRequest | None:
        """Retry with the next fallback model.

        Args:
            error: The exception that occurred during model invocation.
            request: The original model request that failed.
            state: The current agent state.
            runtime: The langgraph runtime.
            attempt: The current attempt number (1-indexed).

        Returns:
            ModelRequest with the next fallback model, or None if all models exhausted.
        """
        # attempt 1 = primary model failed, so models[0] is the first fallback.
        fallback_index = attempt - 1
        if fallback_index >= len(self.models):
            # All fallback models exhausted: returning None re-raises the error.
            return None
        # Rebind the failed request to the next fallback model and retry.
        request.model = self.models[fallback_index]
        return request

libs/langchain_v1/langchain/agents/middleware/types.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,54 @@ async def aafter_model(
167167
) -> dict[str, Any] | None:
168168
"""Async logic to run after the model is called."""
169169

170+
def retry_model_request(
    self,
    error: Exception,  # noqa: ARG002
    request: ModelRequest,  # noqa: ARG002
    state: StateT,  # noqa: ARG002
    runtime: Runtime[ContextT],  # noqa: ARG002
    attempt: int,  # noqa: ARG002
) -> ModelRequest | None:
    """Hook for handling model invocation errors, with an optional retry.

    The default implementation never retries: it returns ``None``, which tells
    the agent to propagate the error. Subclasses override this hook to inspect
    the failure and return a (possibly modified) request to try again with.

    Args:
        error: The exception that occurred during model invocation.
        request: The original model request that failed.
        state: The current agent state.
        runtime: The langgraph runtime.
        attempt: The current attempt number (1-indexed).

    Returns:
        ModelRequest: Modified request to retry with.
        None: Propagate the error (re-raise).
    """
    return None
192+
193+
async def aretry_model_request(
    self,
    error: Exception,
    request: ModelRequest,
    state: StateT,
    runtime: Runtime[ContextT],
    attempt: int,
) -> ModelRequest | None:
    """Async counterpart of ``retry_model_request``.

    By default this delegates to the synchronous ``retry_model_request`` hook
    on an executor thread, so a middleware only needs to override one of the
    two variants to customize retry behavior.

    Args:
        error: The exception that occurred during model invocation.
        request: The original model request that failed.
        state: The current agent state.
        runtime: The langgraph runtime.
        attempt: The current attempt number (1-indexed).

    Returns:
        ModelRequest: Modified request to retry with.
        None: Propagate the error (re-raise).
    """
    # run_in_executor keeps a blocking sync hook off the event loop.
    return await run_in_executor(
        None, self.retry_model_request, error, request, state, runtime, attempt
    )
217+
170218

171219
class _CallableWithStateAndRuntime(Protocol[StateT_contra, ContextT]):
172220
"""Callable with AgentState and Runtime as arguments."""

libs/langchain_v1/langchain/agents/middleware_agent.py

Lines changed: 79 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,12 @@ def create_agent( # noqa: PLR0915
278278
if m.__class__.after_model is not AgentMiddleware.after_model
279279
or m.__class__.aafter_model is not AgentMiddleware.aafter_model
280280
]
281+
middleware_w_retry = [
282+
m
283+
for m in middleware
284+
if m.__class__.retry_model_request is not AgentMiddleware.retry_model_request
285+
or m.__class__.aretry_model_request is not AgentMiddleware.aretry_model_request
286+
]
281287

282288
state_schemas = {m.state_schema for m in middleware}
283289
state_schemas.add(AgentState)
@@ -526,18 +532,47 @@ def model_request(state: AgentState, runtime: Runtime[ContextT]) -> dict[str, An
526532
)
527533
raise TypeError(msg)
528534

529-
# Get the bound model (with auto-detection if needed)
530-
model_, effective_response_format = _get_bound_model(request)
531-
messages = request.messages
532-
if request.system_prompt:
533-
messages = [SystemMessage(request.system_prompt), *messages]
535+
# Retry loop for model invocation with error handling
536+
# Hard limit of 100 attempts to prevent infinite loops from buggy middleware
537+
max_attempts = 100
538+
for attempt in range(1, max_attempts + 1):
539+
try:
540+
# Get the bound model (with auto-detection if needed)
541+
model_, effective_response_format = _get_bound_model(request)
542+
messages = request.messages
543+
if request.system_prompt:
544+
messages = [SystemMessage(request.system_prompt), *messages]
545+
546+
output = model_.invoke(messages)
547+
return {
548+
"thread_model_call_count": state.get("thread_model_call_count", 0) + 1,
549+
"run_model_call_count": state.get("run_model_call_count", 0) + 1,
550+
**_handle_model_output(output, effective_response_format),
551+
}
552+
except Exception as error:
553+
# Try retry_model_request on each middleware
554+
for m in middleware_w_retry:
555+
if m.__class__.retry_model_request is not AgentMiddleware.retry_model_request:
556+
if retry_request := m.retry_model_request(
557+
error, request, state, runtime, attempt
558+
):
559+
# Break on first middleware that wants to retry
560+
request = retry_request
561+
break
562+
else:
563+
msg = (
564+
f"No synchronous function provided for "
565+
f'{m.__class__.__name__}.aretry_model_request".'
566+
"\nEither initialize with a synchronous function or invoke"
567+
" via the async API (ainvoke, astream, etc.)"
568+
)
569+
raise TypeError(msg)
570+
else:
571+
raise
534572

535-
output = model_.invoke(messages)
536-
return {
537-
"thread_model_call_count": state.get("thread_model_call_count", 0) + 1,
538-
"run_model_call_count": state.get("run_model_call_count", 0) + 1,
539-
**_handle_model_output(output, effective_response_format),
540-
}
573+
# If we exit the loop, max attempts exceeded
574+
msg = f"Maximum retry attempts ({max_attempts}) exceeded"
575+
raise RuntimeError(msg)
541576

542577
async def amodel_request(state: AgentState, runtime: Runtime[ContextT]) -> dict[str, Any]:
543578
"""Async model request handler with sequential middleware processing."""
@@ -554,18 +589,39 @@ async def amodel_request(state: AgentState, runtime: Runtime[ContextT]) -> dict[
554589
for m in middleware_w_modify_model_request:
555590
await m.amodify_model_request(request, state, runtime)
556591

557-
# Get the bound model (with auto-detection if needed)
558-
model_, effective_response_format = _get_bound_model(request)
559-
messages = request.messages
560-
if request.system_prompt:
561-
messages = [SystemMessage(request.system_prompt), *messages]
562-
563-
output = await model_.ainvoke(messages)
564-
return {
565-
"thread_model_call_count": state.get("thread_model_call_count", 0) + 1,
566-
"run_model_call_count": state.get("run_model_call_count", 0) + 1,
567-
**_handle_model_output(output, effective_response_format),
568-
}
592+
# Retry loop for model invocation with error handling
593+
# Hard limit of 100 attempts to prevent infinite loops from buggy middleware
594+
max_attempts = 100
595+
for attempt in range(1, max_attempts + 1):
596+
try:
597+
# Get the bound model (with auto-detection if needed)
598+
model_, effective_response_format = _get_bound_model(request)
599+
messages = request.messages
600+
if request.system_prompt:
601+
messages = [SystemMessage(request.system_prompt), *messages]
602+
603+
output = await model_.ainvoke(messages)
604+
return {
605+
"thread_model_call_count": state.get("thread_model_call_count", 0) + 1,
606+
"run_model_call_count": state.get("run_model_call_count", 0) + 1,
607+
**_handle_model_output(output, effective_response_format),
608+
}
609+
except Exception as error:
610+
# Try retry_model_request on each middleware
611+
for m in middleware_w_retry:
612+
if retry_request := await m.aretry_model_request(
613+
error, request, state, runtime, attempt
614+
):
615+
# Break on first middleware that wants to retry
616+
request = retry_request
617+
break
618+
else:
619+
# If no middleware wants to retry, re-raise the error
620+
raise
621+
622+
# If we exit the loop, max attempts exceeded
623+
msg = f"Maximum retry attempts ({max_attempts}) exceeded"
624+
raise RuntimeError(msg)
569625

570626
# Use sync or async based on model capabilities
571627
from langgraph._internal._runnable import RunnableCallable

0 commit comments

Comments
 (0)