# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0

from __future__ import annotations

import asyncio
from typing import Any, Union

from haystack import component, default_from_dict, default_to_dict, logging
from haystack.components.generators.chat.types import ChatGenerator
from haystack.dataclasses import ChatMessage, StreamingCallbackT
from haystack.tools import Tool, Toolset
from haystack.utils.deserialization import deserialize_component_inplace

logger = logging.getLogger(__name__)


@component
class FallbackChatGenerator:
    """
    A chat generator wrapper that tries multiple chat generators sequentially.

    It forwards all parameters transparently to the wrapped chat generators, calling them in order until one
    succeeds, and returns the first successful result. Any exception raised by a generator triggers failover to
    the next one. If all chat generators fail, a RuntimeError is raised with details.

    Timeout enforcement is fully delegated to the underlying chat generators. The fallback mechanism only works
    correctly if the underlying chat generators implement proper timeout handling and raise exceptions when
    timeouts occur. For predictable latency guarantees, ensure your chat generators:
    - Support a `timeout` parameter in their initialization
    - Implement timeout as total wall-clock time (a shared deadline for both streaming and non-streaming)
    - Raise timeout exceptions (e.g., TimeoutError, asyncio.TimeoutError, httpx.TimeoutException) when exceeded

    Note: Most well-implemented chat generators (OpenAI, Anthropic, Cohere, etc.) support timeout parameters
    with consistent semantics. For HTTP-based LLM providers, a single timeout value (e.g., `timeout=30`)
    typically applies to all connection phases: connection setup, read, write, and pool. For streaming
    responses, the read timeout is the maximum gap between chunks; for non-streaming responses, it is the time
    limit for receiving the complete response.

    Failover is automatically triggered when a generator raises any exception, including:
    - Timeout errors (if the generator implements and raises them)
    - Rate limit errors (429)
    - Authentication errors (401)
    - Context length errors (400)
    - Server errors (500+)
    - Any other exception
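
    ### Usage example

    A minimal sketch (models and timeouts are illustrative; assumes `OPENAI_API_KEY` is set and that the
    wrapped generators accept a `timeout` init parameter):

    ```python
    from haystack.components.generators.chat import OpenAIChatGenerator
    from haystack.dataclasses import ChatMessage

    # A short timeout on the primary makes a slow provider fail fast so the backup can take over.
    primary = OpenAIChatGenerator(model="gpt-4o-mini", timeout=10)
    backup = OpenAIChatGenerator(model="gpt-4o", timeout=30)

    generator = FallbackChatGenerator(chat_generators=[primary, backup])
    result = generator.run(messages=[ChatMessage.from_user("What is the capital of France?")])
    print(result["replies"][0].text)
    print(result["meta"]["successful_chat_generator_class"])
    ```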
    """

    def __init__(self, chat_generators: list[ChatGenerator]):
        """
        Creates an instance of FallbackChatGenerator.

        :param chat_generators: A non-empty list of chat generator components to try in order.
        """
        if not chat_generators:
            msg = "'chat_generators' must be a non-empty list"
            raise ValueError(msg)

        self.chat_generators = list(chat_generators)

    def to_dict(self) -> dict[str, Any]:
        """Serialize the component, including nested chat generators when they support serialization."""
        # Generators that don't implement to_dict() are silently skipped and will be missing after a round trip.
        return default_to_dict(
            self, chat_generators=[gen.to_dict() for gen in self.chat_generators if hasattr(gen, "to_dict")]
        )

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> FallbackChatGenerator:
        """Rebuild the component from a serialized representation, restoring nested chat generators."""
        # Reconstruct nested chat generators from their serialized dicts
        init_params = data.get("init_parameters", {})
        serialized = init_params.get("chat_generators") or []
        deserialized: list[Any] = []
        for g in serialized:
            # Use the generic component deserializer available in Haystack
            holder = {"component": g}
            deserialize_component_inplace(holder, key="component")
            deserialized.append(holder["component"])
        init_params["chat_generators"] = deserialized
        data["init_parameters"] = init_params
        return default_from_dict(cls, data)
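
    # Serialization round-trip, as a hedged sketch (assumes every nested generator implements
    # to_dict/from_dict; generators without to_dict are dropped by to_dict above):
    #
    #     data = generator.to_dict()
    #     restored = FallbackChatGenerator.from_dict(data)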

    def _run_single_sync(  # pylint: disable=too-many-positional-arguments
        self,
        gen: Any,
        messages: list[ChatMessage],
        generation_kwargs: Union[dict[str, Any], None],
        tools: Union[list[Tool], Toolset, None],
        streaming_callback: Union[StreamingCallbackT, None],
    ) -> dict[str, Any]:
        return gen.run(
            messages=messages, generation_kwargs=generation_kwargs, tools=tools, streaming_callback=streaming_callback
        )

    async def _run_single_async(  # pylint: disable=too-many-positional-arguments
        self,
        gen: Any,
        messages: list[ChatMessage],
        generation_kwargs: Union[dict[str, Any], None],
        tools: Union[list[Tool], Toolset, None],
        streaming_callback: Union[StreamingCallbackT, None],
    ) -> dict[str, Any]:
        # Prefer the generator's native async API when it exposes one.
        if hasattr(gen, "run_async") and callable(gen.run_async):
            return await gen.run_async(
                messages=messages,
                generation_kwargs=generation_kwargs,
                tools=tools,
                streaming_callback=streaming_callback,
            )
        # Otherwise run the sync method in a worker thread so the event loop is not blocked.
        return await asyncio.to_thread(
            gen.run,
            messages=messages,
            generation_kwargs=generation_kwargs,
            tools=tools,
            streaming_callback=streaming_callback,
        )

    @component.output_types(replies=list[ChatMessage], meta=dict[str, Any])
    def run(
        self,
        messages: list[ChatMessage],
        generation_kwargs: Union[dict[str, Any], None] = None,
        tools: Union[list[Tool], Toolset, None] = None,
        streaming_callback: Union[StreamingCallbackT, None] = None,
    ) -> dict[str, Any]:
        """
        Execute chat generators sequentially until one succeeds.

        :param messages: The conversation history as a list of ChatMessage instances.
        :param generation_kwargs: Optional parameters for the chat generator (e.g., temperature, max_tokens).
        :param tools: Optional Tool instances or Toolset for function calling capabilities.
        :param streaming_callback: Optional callable for handling streaming responses.
        :returns: A dictionary with:
            - "replies": Generated ChatMessage instances from the first successful generator.
            - "meta": Execution metadata including successful_chat_generator_index, successful_chat_generator_class,
              total_attempts, failed_chat_generators, plus any metadata from the successful generator.
        :raises RuntimeError: If all chat generators fail.
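
        A hedged sketch of reading the extra metadata (assumes `generator` is a configured
        FallbackChatGenerator):

        ```python
        result = generator.run(messages=[ChatMessage.from_user("Hello!")])
        meta = result["meta"]
        print(meta["successful_chat_generator_class"], "answered after", meta["total_attempts"], "attempt(s)")
        if meta["failed_chat_generators"]:
            print("Fell back past:", meta["failed_chat_generators"])
        ```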
        """
        failed: list[str] = []
        last_error: Union[BaseException, None] = None

        for idx, gen in enumerate(self.chat_generators):
            gen_name = gen.__class__.__name__
            try:
                result = self._run_single_sync(gen, messages, generation_kwargs, tools, streaming_callback)
                replies = result.get("replies", [])
                meta = dict(result.get("meta", {}))
                meta.update(
                    {
                        "successful_chat_generator_index": idx,
                        "successful_chat_generator_class": gen_name,
                        "total_attempts": idx + 1,
                        "failed_chat_generators": failed,
                    }
                )
                return {"replies": replies, "meta": meta}
            except Exception as e:  # noqa: BLE001 - fallback logic should handle any exception
                logger.warning(
                    "ChatGenerator {chat_generator} failed with error: {error}", chat_generator=gen_name, error=e
                )
                failed.append(gen_name)
                last_error = e

        failed_names = ", ".join(failed)
        msg = (
            f"All {len(self.chat_generators)} chat generators failed. "
            f"Last error: {last_error}. Failed chat generators: [{failed_names}]"
        )
        # Chain the last underlying error so its traceback is preserved.
        raise RuntimeError(msg) from last_error

    @component.output_types(replies=list[ChatMessage], meta=dict[str, Any])
    async def run_async(
        self,
        messages: list[ChatMessage],
        generation_kwargs: Union[dict[str, Any], None] = None,
        tools: Union[list[Tool], Toolset, None] = None,
        streaming_callback: Union[StreamingCallbackT, None] = None,
    ) -> dict[str, Any]:
        """
        Asynchronously execute chat generators sequentially until one succeeds.

        :param messages: The conversation history as a list of ChatMessage instances.
        :param generation_kwargs: Optional parameters for the chat generator (e.g., temperature, max_tokens).
        :param tools: Optional Tool instances or Toolset for function calling capabilities.
        :param streaming_callback: Optional callable for handling streaming responses.
        :returns: A dictionary with:
            - "replies": Generated ChatMessage instances from the first successful generator.
            - "meta": Execution metadata including successful_chat_generator_index, successful_chat_generator_class,
              total_attempts, failed_chat_generators, plus any metadata from the successful generator.
        :raises RuntimeError: If all chat generators fail.
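
        A hedged sketch of the async call (assumes `generator` is a configured FallbackChatGenerator and an
        event loop is available, e.g. via `asyncio.run`):

        ```python
        import asyncio

        async def main():
            result = await generator.run_async(messages=[ChatMessage.from_user("Hello!")])
            print(result["replies"][0].text)

        asyncio.run(main())
        ```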
        """
        failed: list[str] = []
        last_error: Union[BaseException, None] = None

        for idx, gen in enumerate(self.chat_generators):
            gen_name = gen.__class__.__name__
            try:
                result = await self._run_single_async(gen, messages, generation_kwargs, tools, streaming_callback)
                replies = result.get("replies", [])
                meta = dict(result.get("meta", {}))
                meta.update(
                    {
                        "successful_chat_generator_index": idx,
                        "successful_chat_generator_class": gen_name,
                        "total_attempts": idx + 1,
                        "failed_chat_generators": failed,
                    }
                )
                return {"replies": replies, "meta": meta}
            except Exception as e:  # noqa: BLE001 - fallback logic should handle any exception
                logger.warning(
                    "ChatGenerator {chat_generator} failed with error: {error}", chat_generator=gen_name, error=e
                )
                failed.append(gen_name)
                last_error = e

        failed_names = ", ".join(failed)
        msg = (
            f"All {len(self.chat_generators)} chat generators failed. "
            f"Last error: {last_error}. Failed chat generators: [{failed_names}]"
        )
        # Chain the last underlying error so its traceback is preserved.
        raise RuntimeError(msg) from last_error