Skip to content

Commit 5f9a80b

Browse files
authored
feat(genai): add timeout and max_retries handling in chat methods (#1180)
Fixes #731. Ensures instance-level (model-defined) `timeout` and `max_retries` params are used if none are provided in the invocation.
1 parent 8add7fd commit 5f9a80b

File tree

2 files changed

+222
-2
lines changed

2 files changed

+222
-2
lines changed

libs/genai/langchain_google_genai/chat_models.py

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -220,10 +220,10 @@ def _chat_with_retry(**kwargs: Any) -> Any:
220220
raise ChatGoogleGenerativeAIError(msg) from e
221221
except ResourceExhausted as e:
222222
# Handle quota-exceeded error with recommended retry delay
223-
if hasattr(e, "retry_after") and e.retry_after < kwargs.get(
223+
if hasattr(e, "retry_after") and getattr(e, "retry_after", 0) < kwargs.get(
224224
"wait_exponential_max", 60.0
225225
):
226-
time.sleep(e.retry_after)
226+
time.sleep(getattr(e, "retry_after"))
227227
raise
228228
except Exception:
229229
raise
@@ -267,6 +267,13 @@ async def _achat_with_retry(**kwargs: Any) -> Any:
267267
# Do not retry for these errors.
268268
msg = f"Invalid argument provided to Gemini: {e}"
269269
raise ChatGoogleGenerativeAIError(msg) from e
270+
except ResourceExhausted as e:
271+
# Handle quota-exceeded error with recommended retry delay
272+
if hasattr(e, "retry_after") and getattr(e, "retry_after", 0) < kwargs.get(
273+
"wait_exponential_max", 60.0
274+
):
275+
time.sleep(getattr(e, "retry_after"))
276+
raise
270277
except Exception:
271278
raise
272279

@@ -1776,6 +1783,10 @@ def _generate(
17761783
tool_choice=tool_choice,
17771784
**kwargs,
17781785
)
1786+
if self.timeout is not None and "timeout" not in kwargs:
1787+
kwargs["timeout"] = self.timeout
1788+
if "max_retries" not in kwargs:
1789+
kwargs["max_retries"] = self.max_retries
17791790
response: GenerateContentResponse = _chat_with_retry(
17801791
request=request,
17811792
**kwargs,
@@ -1824,6 +1835,10 @@ async def _agenerate(
18241835
tool_choice=tool_choice,
18251836
**kwargs,
18261837
)
1838+
if self.timeout is not None and "timeout" not in kwargs:
1839+
kwargs["timeout"] = self.timeout
1840+
if "max_retries" not in kwargs:
1841+
kwargs["max_retries"] = self.max_retries
18271842
response: GenerateContentResponse = await _achat_with_retry(
18281843
request=request,
18291844
**kwargs,
@@ -1859,6 +1874,10 @@ def _stream(
18591874
tool_choice=tool_choice,
18601875
**kwargs,
18611876
)
1877+
if self.timeout is not None and "timeout" not in kwargs:
1878+
kwargs["timeout"] = self.timeout
1879+
if "max_retries" not in kwargs:
1880+
kwargs["max_retries"] = self.max_retries
18621881
response: GenerateContentResponse = _chat_with_retry(
18631882
request=request,
18641883
generation_method=self.client.stream_generate_content,
@@ -1925,6 +1944,10 @@ async def _astream(
19251944
tool_choice=tool_choice,
19261945
**kwargs,
19271946
)
1947+
if self.timeout is not None and "timeout" not in kwargs:
1948+
kwargs["timeout"] = self.timeout
1949+
if "max_retries" not in kwargs:
1950+
kwargs["max_retries"] = self.max_retries
19281951
prev_usage_metadata: UsageMetadata | None = None # cumulative usage
19291952
async for chunk in await _achat_with_retry(
19301953
request=request,

libs/genai/tests/unit_tests/test_chat_models.py

Lines changed: 197 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import base64
55
import json
66
import warnings
7+
from collections.abc import Iterator
78
from concurrent.futures import ThreadPoolExecutor
89
from typing import Optional, Union
910
from unittest.mock import ANY, Mock, patch
@@ -20,6 +21,7 @@
2021
from langchain_core.load import dumps, loads
2122
from langchain_core.messages import (
2223
AIMessage,
24+
BaseMessage,
2325
FunctionMessage,
2426
HumanMessage,
2527
SystemMessage,
@@ -917,3 +919,198 @@ def test_response_to_result_grounding_metadata(
917919
else {}
918920
)
919921
assert grounding_metadata == expected_grounding_metadata
922+
923+
924+
@pytest.mark.parametrize(
    "is_async,mock_target,method_name",
    [
        (False, "_chat_with_retry", "_generate"),  # Sync
        (True, "_achat_with_retry", "_agenerate"),  # Async
    ],
)
@pytest.mark.parametrize(
    "instance_timeout,call_timeout,expected_timeout,should_have_timeout",
    [
        (5.0, None, 5.0, True),  # Instance-level timeout
        (5.0, 10.0, 10.0, True),  # Call-level overrides instance
        (None, None, None, False),  # No timeout anywhere
    ],
)
async def test_timeout_parameter_handling(
    is_async: bool,
    mock_target: str,
    method_name: str,
    instance_timeout: Optional[float],
    call_timeout: Optional[float],
    expected_timeout: Optional[float],
    should_have_timeout: bool,
) -> None:
    """Verify `timeout` propagation from model/call into the retry helper.

    Covers both `_generate` and `_agenerate`: the call-level `timeout`
    must win over the instance-level one, and no `timeout` kwarg is
    forwarded when neither is set.
    """
    with patch(f"langchain_google_genai.chat_models.{mock_target}") as retry_mock:
        retry_mock.return_value = GenerateContentResponse(
            {
                "candidates": [
                    {
                        "content": {"parts": [{"text": "Test response"}]},
                        "finish_reason": "STOP",
                    }
                ]
            }
        )

        # Build the model, attaching an instance-level timeout only when given.
        init_kwargs = {
            "model": "gemini-2.5-flash",
            "google_api_key": SecretStr("test-key"),
        }
        if instance_timeout is not None:
            init_kwargs["timeout"] = instance_timeout
        model = ChatGoogleGenerativeAI(**init_kwargs)

        prompt: list[BaseMessage] = [HumanMessage(content="Hello")]

        # Invoke the generation method, forwarding a call-level timeout only
        # when one is supplied for this parametrization.
        invoke_kwargs = {} if call_timeout is None else {"timeout": call_timeout}
        target = getattr(model, method_name)
        if is_async:
            await target(prompt, **invoke_kwargs)
        else:
            target(prompt, **invoke_kwargs)

        # The retry helper must receive exactly the expected timeout (or none).
        retry_mock.assert_called_once()
        forwarded = retry_mock.call_args[1]
        if should_have_timeout:
            assert "timeout" in forwarded
            assert forwarded["timeout"] == expected_timeout
        else:
            assert "timeout" not in forwarded
992+
993+
994+
@pytest.mark.parametrize(
    "instance_timeout,expected_timeout,should_have_timeout",
    [
        (5.0, 5.0, True),  # Instance-level timeout
        (None, None, False),  # No timeout
    ],
)
@patch("langchain_google_genai.chat_models._chat_with_retry")
def test_timeout_streaming_parameter_handling(
    mock_retry: Mock,
    instance_timeout: Optional[float],
    expected_timeout: Optional[float],
    should_have_timeout: bool,
) -> None:
    """Verify `_stream` forwards the instance-level `timeout` to `_chat_with_retry`."""

    def fake_stream() -> Iterator[GenerateContentResponse]:
        # One single-chunk response is enough to drive `_stream` to completion.
        yield GenerateContentResponse(
            {
                "candidates": [
                    {
                        "content": {"parts": [{"text": "chunk1"}]},
                        "finish_reason": "STOP",
                    }
                ]
            }
        )

    mock_retry.return_value = fake_stream()

    # Build the model, attaching an instance-level timeout only when given.
    init_kwargs = {
        "model": "gemini-2.5-flash",
        "google_api_key": SecretStr("test-key"),
    }
    if instance_timeout is not None:
        init_kwargs["timeout"] = instance_timeout
    model = ChatGoogleGenerativeAI(**init_kwargs)

    # Drain the generator so the patched retry helper is actually invoked.
    prompt: list[BaseMessage] = [HumanMessage(content="Hello")]
    list(model._stream(prompt))

    # The retry helper must receive exactly the expected timeout (or none).
    mock_retry.assert_called_once()
    forwarded = mock_retry.call_args[1]
    if should_have_timeout:
        assert "timeout" in forwarded
        assert forwarded["timeout"] == expected_timeout
    else:
        assert "timeout" not in forwarded
1048+
1049+
1050+
@pytest.mark.parametrize(
    "is_async,mock_target,method_name",
    [
        (False, "_chat_with_retry", "_generate"),  # Sync
        (True, "_achat_with_retry", "_agenerate"),  # Async
    ],
)
@pytest.mark.parametrize(
    "instance_max_retries,call_max_retries,expected_max_retries,should_have_max_retries",
    [
        (1, None, 1, True),  # Instance-level max_retries
        (3, 5, 5, True),  # Call-level overrides instance
        (6, None, 6, True),  # Default instance value
    ],
)
async def test_max_retries_parameter_handling(
    is_async: bool,
    mock_target: str,
    method_name: str,
    instance_max_retries: int,
    call_max_retries: Optional[int],
    expected_max_retries: int,
    should_have_max_retries: bool,
) -> None:
    """Verify `max_retries` propagation from model/call into the retry helper.

    Covers both `_generate` and `_agenerate`: a call-level `max_retries`
    must win over the instance-level value, which is otherwise forwarded.
    """
    with patch(f"langchain_google_genai.chat_models.{mock_target}") as retry_mock:
        retry_mock.return_value = GenerateContentResponse(
            {
                "candidates": [
                    {
                        "content": {"parts": [{"text": "Test response"}]},
                        "finish_reason": "STOP",
                    }
                ]
            }
        )

        # Instance-level max_retries is always set for this test.
        model = ChatGoogleGenerativeAI(
            **{
                "model": "gemini-2.5-flash",
                "google_api_key": SecretStr("test-key"),
                "max_retries": instance_max_retries,
            }
        )

        prompt: list[BaseMessage] = [HumanMessage(content="Hello")]

        # Invoke the generation method, forwarding a call-level max_retries
        # only when one is supplied for this parametrization.
        invoke_kwargs = (
            {} if call_max_retries is None else {"max_retries": call_max_retries}
        )
        target = getattr(model, method_name)
        if is_async:
            await target(prompt, **invoke_kwargs)
        else:
            target(prompt, **invoke_kwargs)

        # The retry helper must receive exactly the expected max_retries.
        retry_mock.assert_called_once()
        forwarded = retry_mock.call_args[1]
        if should_have_max_retries:
            assert "max_retries" in forwarded
            assert forwarded["max_retries"] == expected_max_retries
        else:
            assert "max_retries" not in forwarded

0 commit comments

Comments
 (0)