
Commit 98d57b5

[Feat] Allow using x-litellm-stream-timeout header for stream timeout in requests (#14147)
* fix: allow passing stream_timeout header
* fix: _get_stream_timeout_from_request
* test_add_litellm_data_to_request_with_stream_timeout_header
* docs: LiteLLM Headers
1 parent 58ac3ec commit 98d57b5

File tree

4 files changed: +107 -0 lines changed

* docs/my-website/docs/proxy/request_headers.md
* litellm/proxy/_types.py
* litellm/proxy/litellm_pre_call_utils.py
* tests/test_litellm/proxy/test_common_request_processing.py

docs/my-website/docs/proxy/request_headers.md

Lines changed: 2 additions & 0 deletions
```diff
@@ -6,6 +6,8 @@ Special headers that are supported by LiteLLM.
 
 `x-litellm-timeout` Optional[float]: The timeout for the request in seconds.
 
+`x-litellm-stream-timeout` Optional[float]: The timeout for getting the first chunk of the response in seconds (only applies for streaming requests). [Demo Video](https://www.loom.com/share/8da67e4845ce431a98c901d4e45db0e5)
+
 `x-litellm-enable-message-redaction`: Optional[bool]: Don't log the message content to logging integrations. Just track spend. [Learn More](./logging#redact-messages-response-content)
 
 `x-litellm-tags`: Optional[str]: A comma separated list (e.g. `tag1,tag2,tag3`) of tags to use for [tag-based routing](./tag_routing) **OR** [spend-tracking](./enterprise.md#tracking-spend-for-custom-tags).
```
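For context, here is a minimal sketch of what sending this header from a client could look like. The `base_url` and `api_key` are placeholders (assuming a LiteLLM proxy listening at `http://localhost:4000`; neither is part of this commit), and `extra_headers` is the openai-python v1 mechanism for attaching custom HTTP headers to a single request:

```python
# Sketch only: base_url and api_key are placeholders for a local LiteLLM proxy.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:4000", api_key="sk-1234")

stream = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hello"}],
    stream=True,
    # Give up if the first streamed chunk takes longer than 10.5 seconds.
    extra_headers={"x-litellm-stream-timeout": "10.5"},
)
for chunk in stream:
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="")
```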

litellm/proxy/_types.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -2904,6 +2904,7 @@ class LitellmDataForBackendLLMCall(TypedDict, total=False):
     headers: dict
     organization: str
     timeout: Optional[float]
+    stream_timeout: Optional[float]
     user: Optional[str]
     num_retries: Optional[int]
```
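Because `LitellmDataForBackendLLMCall` is declared with `total=False`, every field is optional and callers populate only what a given request supplies. A small illustrative sketch (the values are made up; the import path is taken from this commit's file tree):

```python
from litellm.proxy._types import LitellmDataForBackendLLMCall

# Illustrative values only: a request that sent both timeout headers would
# now carry stream_timeout alongside the existing timeout field.
data: LitellmDataForBackendLLMCall = {
    "timeout": 600.0,        # from x-litellm-timeout
    "stream_timeout": 30.0,  # from x-litellm-stream-timeout (new in this commit)
}
```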

litellm/proxy/litellm_pre_call_utils.py

Lines changed: 14 additions & 0 deletions
```diff
@@ -271,6 +271,16 @@ def _get_timeout_from_request(headers: dict) -> Optional[float]:
         if timeout_header is not None:
             return float(timeout_header)
         return None
+
+    @staticmethod
+    def _get_stream_timeout_from_request(headers: dict) -> Optional[float]:
+        """
+        Get the `stream_timeout` from the request headers.
+        """
+        stream_timeout_header = headers.get("x-litellm-stream-timeout", None)
+        if stream_timeout_header is not None:
+            return float(stream_timeout_header)
+        return None
 
     @staticmethod
     def _get_num_retries_from_request(headers: dict) -> Optional[int]:
@@ -439,6 +449,10 @@ def add_litellm_data_for_backend_llm_call(
         timeout = LiteLLMProxyRequestSetup._get_timeout_from_request(headers)
         if timeout is not None:
             data["timeout"] = timeout
+
+        stream_timeout = LiteLLMProxyRequestSetup._get_stream_timeout_from_request(headers)
+        if stream_timeout is not None:
+            data["stream_timeout"] = stream_timeout
 
         num_retries = LiteLLMProxyRequestSetup._get_num_retries_from_request(headers)
         if num_retries is not None:
```
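The docs describe `stream_timeout` as a bound on time-to-first-chunk, but note that this commit only extracts the header and forwards the value on `data`; enforcement presumably happens downstream in LiteLLM's routing/streaming layer, which this diff does not show. As a conceptual illustration only (not LiteLLM's actual implementation), a first-chunk deadline can be enforced by racing the first read of an async stream against a timer:

```python
import asyncio
from typing import AsyncIterator, Optional

async def first_chunk_with_timeout(
    stream: AsyncIterator[str], stream_timeout: Optional[float]
) -> str:
    # asyncio.wait_for raises asyncio.TimeoutError if the first chunk
    # does not arrive in time; None disables the deadline entirely.
    return await asyncio.wait_for(stream.__anext__(), timeout=stream_timeout)

async def _demo() -> None:
    async def slow_stream() -> AsyncIterator[str]:
        await asyncio.sleep(5)  # simulate a slow time-to-first-token
        yield "hello"

    try:
        await first_chunk_with_timeout(slow_stream(), stream_timeout=0.1)
    except asyncio.TimeoutError:
        print("no chunk within stream_timeout")

asyncio.run(_demo())
```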

tests/test_litellm/proxy/test_common_request_processing.py

Lines changed: 90 additions & 0 deletions
```diff
@@ -74,6 +74,96 @@ async def mock_common_processing_pre_call_logic(
             pytest.fail("litellm_call_id is not a valid UUID")
         assert data_passed["litellm_call_id"] == returned_data["litellm_call_id"]
 
+    @pytest.mark.asyncio
+    async def test_stream_timeout_header_processing(self):
+        """
+        Test that x-litellm-stream-timeout header gets processed and added to request data as stream_timeout.
+        """
+        from litellm.proxy.litellm_pre_call_utils import LiteLLMProxyRequestSetup
+
+        # Test with stream timeout header
+        headers_with_timeout = {"x-litellm-stream-timeout": "30.5"}
+        result = LiteLLMProxyRequestSetup._get_stream_timeout_from_request(headers_with_timeout)
+        assert result == 30.5
+
+        # Test without stream timeout header
+        headers_without_timeout = {}
+        result = LiteLLMProxyRequestSetup._get_stream_timeout_from_request(headers_without_timeout)
+        assert result is None
+
+        # Test with invalid header value (should raise ValueError when converting to float)
+        headers_with_invalid = {"x-litellm-stream-timeout": "invalid"}
+        with pytest.raises(ValueError):
+            LiteLLMProxyRequestSetup._get_stream_timeout_from_request(headers_with_invalid)
+
+    @pytest.mark.asyncio
+    async def test_add_litellm_data_to_request_with_stream_timeout_header(self):
+        """
+        Test that x-litellm-stream-timeout header gets processed and added to request data
+        when calling add_litellm_data_to_request.
+        """
+        from litellm.integrations.opentelemetry import UserAPIKeyAuth
+        from litellm.proxy.litellm_pre_call_utils import add_litellm_data_to_request
+
+        # Create test data with a basic completion request
+        test_data = {
+            "model": "gpt-3.5-turbo",
+            "messages": [{"role": "user", "content": "Hello"}]
+        }
+
+        # Mock request with stream timeout header
+        mock_request = MagicMock(spec=Request)
+        mock_request.headers = {"x-litellm-stream-timeout": "45.0"}
+        mock_request.url.path = "/v1/chat/completions"
+        mock_request.method = "POST"
+        mock_request.query_params = {}
+        mock_request.client = None
+
+        # Create a minimal mock with just the required attributes
+        mock_user_api_key_dict = MagicMock()
+        mock_user_api_key_dict.api_key = "test_api_key_hash"
+        mock_user_api_key_dict.tpm_limit = None
+        mock_user_api_key_dict.rpm_limit = None
+        mock_user_api_key_dict.max_budget = None
+        mock_user_api_key_dict.spend = 0
+        mock_user_api_key_dict.allowed_model_region = None
+        mock_user_api_key_dict.key_alias = None
+        mock_user_api_key_dict.user_id = None
+        mock_user_api_key_dict.team_id = None
+        mock_user_api_key_dict.metadata = {}  # Prevent enterprise feature check
+        mock_user_api_key_dict.team_metadata = None
+        mock_user_api_key_dict.org_id = None
+        mock_user_api_key_dict.team_alias = None
+        mock_user_api_key_dict.end_user_id = None
+        mock_user_api_key_dict.user_email = None
+        mock_user_api_key_dict.request_route = None
+        mock_user_api_key_dict.team_max_budget = None
+        mock_user_api_key_dict.team_spend = None
+        mock_user_api_key_dict.model_max_budget = None
+        mock_user_api_key_dict.parent_otel_span = None
+        mock_user_api_key_dict.team_model_aliases = None
+
+        general_settings = {}
+        mock_proxy_config = MagicMock()
+
+        # Call the actual function that processes headers and adds data
+        result_data = await add_litellm_data_to_request(
+            data=test_data,
+            request=mock_request,
+            general_settings=general_settings,
+            user_api_key_dict=mock_user_api_key_dict,
+            version=None,
+            proxy_config=mock_proxy_config,
+        )
+
+        # Verify that stream_timeout was extracted from header and added to request data
+        assert "stream_timeout" in result_data
+        assert result_data["stream_timeout"] == 45.0
+
+        # Verify that the original test data is preserved
+        assert result_data["model"] == "gpt-3.5-turbo"
+        assert result_data["messages"] == [{"role": "user", "content": "Hello"}]
+
 
 @pytest.mark.asyncio
 class TestCommonRequestProcessingHelpers:
```
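One behavior the tests pin down: a non-numeric header value makes `float()` raise `ValueError`, so the committed helper fails loudly rather than silently ignoring a malformed header. A hypothetical alternative (not part of this commit, shown only to highlight the design choice) would treat malformed values like missing ones:

```python
from typing import Optional

def _get_stream_timeout_lenient(headers: dict) -> Optional[float]:
    # Hypothetical variant: returns None for missing *or* malformed values
    # instead of raising ValueError the way the committed helper does.
    raw = headers.get("x-litellm-stream-timeout")
    if raw is None:
        return None
    try:
        return float(raw)
    except ValueError:
        return None
```

Failing loudly is arguably the better default here, since a silently dropped timeout would be hard to debug.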
