
Commit acf59b6

[VoiceLive] Release 1.2.0b2 with MCP fix (#44101)

Authored by xitzhang (Xiting Zhang)

* [VoiceLive] Release 1.2.0b2 with MCP fix
* update codeowner

Co-authored-by: Xiting Zhang <[email protected]>

1 parent: 046b973

File tree

8 files changed: +345 additions, -28 deletions

.github/CODEOWNERS

Lines changed: 1 addition & 1 deletion

@@ -261,7 +261,7 @@

 # PRLabel: %Voice Live
 # ServiceLabel: %Voice Live %Service Attention
-/sdk/ai/azure-ai-voicelive/ @rhurey @xitzhang
+/sdk/ai/azure-ai-voicelive/ @rhurey @xitzhang @amber-yujueWang


 # PRLabel: %HDInsight

.vscode/cspell.json

Lines changed: 1 addition & 1 deletion

@@ -2187,7 +2187,7 @@
     },
     {
       "filename": "sdk/ai/azure-ai-voicelive/**",
-      "words": ["viseme","VISEME","ulaw","ULAW","logprobs","pyaudio","PyAudio","libasound"]
+      "words": ["viseme","VISEME","ulaw","ULAW","logprobs","pyaudio","PyAudio","libasound","webrtc","WEBRTC"]
     }
   ],
   "allowCompoundWords": true

sdk/ai/azure-ai-voicelive/CHANGELOG.md

Lines changed: 22 additions & 6 deletions

@@ -1,14 +1,30 @@
 # Release History

-## 1.2.0b2 (Unreleased)
+## 1.2.0b2 (2025-11-20)

 ### Features Added

-### Breaking Changes
-
-### Bugs Fixed
-
-### Other Changes
+- **Enhanced Avatar Configuration**: Expanded avatar functionality with new configuration options:
+  - Added `AvatarConfigTypes` enum with support for `video-avatar` and `photo-avatar` types
+  - Added `PhotoAvatarBaseModes` enum for photo avatar base models (e.g., `vasa-1`)
+  - Added `AvatarOutputProtocol` enum for avatar streaming protocols (`webrtc`, `websocket`)
+  - Enhanced `AvatarConfig` model with new properties: `type`, `model`, and `output_protocol`
+- **Image Content Support**: Added support for image inputs in conversations:
+  - New `RequestImageContentPart` model for including images in requests
+  - New `RequestImageContentPartDetail` enum for controlling image detail levels (`auto`, `low`, `high`)
+  - Added `INPUT_IMAGE` to `ContentPartType` enum
+  - Enhanced token details models (`InputTokenDetails`, `CachedTokenDetails`) with `image_tokens` tracking
+- **Enhanced OpenAI Voices**: Added new OpenAI voice options:
+  - Added `marin` and `cedar` voices to `OpenAIVoiceName` enum
+- **Extended Azure Personal Voice Configuration**: Enhanced `AzurePersonalVoice` with additional customization options:
+  - Added support for custom lexicon via `custom_lexicon_url`
+  - Added `prefer_locales` for locale preferences
+  - Added `locale`, `style`, `pitch`, `rate`, and `volume` properties for fine-tuned voice control
+- **Enhanced MCP Server Events**: Added completion status events for MCP tool calls:
+  - `ServerEventResponseMcpCallInProgress` for tracking in-progress MCP calls
+  - `ServerEventResponseMcpCallCompleted` for successful MCP call completion
+  - `ServerEventResponseMcpCallFailed` for failed MCP calls
+- **Pre-generated Assistant Messages**: Added support for pre-generated assistant messages in `ResponseCreateParams` via the `pre_generated_assistant_message` property

 ## 1.2.0b1 (2025-11-14)
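The new avatar enums in this release map to plain string values on the wire. Below is a minimal, self-contained sketch using stand-in enum definitions that mirror the values listed in the changelog; the real classes live in `azure.ai.voicelive.models`, and the exact payload shape built from `AvatarConfig`'s new `type`, `model`, and `output_protocol` properties is an assumption here, not the SDK's documented serialization.

```python
import json
from enum import Enum

# Stand-in definitions mirroring the values named in the changelog; the real
# enums live in azure.ai.voicelive.models and use CaseInsensitiveEnumMeta.
class AvatarConfigTypes(str, Enum):
    VIDEO_AVATAR = "video-avatar"
    PHOTO_AVATAR = "photo-avatar"

class AvatarOutputProtocol(str, Enum):
    WEBRTC = "webrtc"
    WEBSOCKET = "websocket"

class PhotoAvatarBaseModes(str, Enum):
    VASA1 = "vasa-1"

# Hypothetical avatar configuration payload; the three property names come
# from the changelog, the dict shape itself is an illustrative assumption.
avatar_config = {
    "type": AvatarConfigTypes.PHOTO_AVATAR.value,
    "model": PhotoAvatarBaseModes.VASA1.value,
    "output_protocol": AvatarOutputProtocol.WEBRTC.value,
}

print(json.dumps(avatar_config, indent=2))
```

Because the enums subclass `str`, their `.value` strings serialize directly with `json.dumps`.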

sdk/ai/azure-ai-voicelive/apiview-properties.json

Lines changed: 8 additions & 0 deletions

@@ -61,6 +61,7 @@
     "azure.ai.voicelive.models.OutputTextContentPart": "VoiceLive.OutputTextContentPart",
     "azure.ai.voicelive.models.OutputTokenDetails": "VoiceLive.OutputTokenDetails",
     "azure.ai.voicelive.models.RequestAudioContentPart": "VoiceLive.RequestAudioContentPart",
+    "azure.ai.voicelive.models.RequestImageContentPart": "VoiceLive.RequestImageContentPart",
     "azure.ai.voicelive.models.RequestSession": "VoiceLive.RequestSession",
     "azure.ai.voicelive.models.RequestTextContentPart": "VoiceLive.RequestTextContentPart",
     "azure.ai.voicelive.models.Response": "VoiceLive.Response",
@@ -115,6 +116,9 @@
     "azure.ai.voicelive.models.ServerEventResponseFunctionCallArgumentsDone": "VoiceLive.ServerEventResponseFunctionCallArgumentsDone",
     "azure.ai.voicelive.models.ServerEventResponseMcpCallArgumentsDelta": "VoiceLive.ServerEventResponseMcpCallArgumentsDelta",
     "azure.ai.voicelive.models.ServerEventResponseMcpCallArgumentsDone": "VoiceLive.ServerEventResponseMcpCallArgumentsDone",
+    "azure.ai.voicelive.models.ServerEventResponseMcpCallCompleted": "VoiceLive.ServerEventResponseMcpCallCompleted",
+    "azure.ai.voicelive.models.ServerEventResponseMcpCallFailed": "VoiceLive.ServerEventResponseMcpCallFailed",
+    "azure.ai.voicelive.models.ServerEventResponseMcpCallInProgress": "VoiceLive.ServerEventResponseMcpCallInProgress",
     "azure.ai.voicelive.models.ServerEventResponseOutputItemAdded": "VoiceLive.ServerEventResponseOutputItemAdded",
     "azure.ai.voicelive.models.ServerEventResponseOutputItemDone": "VoiceLive.ServerEventResponseOutputItemDone",
     "azure.ai.voicelive.models.ServerEventResponseTextDelta": "VoiceLive.ServerEventResponseTextDelta",
@@ -149,10 +153,14 @@
     "azure.ai.voicelive.models.InputAudioFormat": "VoiceLive.InputAudioFormat",
     "azure.ai.voicelive.models.TurnDetectionType": "VoiceLive.TurnDetectionType",
     "azure.ai.voicelive.models.EouThresholdLevel": "VoiceLive.EouThresholdLevel",
+    "azure.ai.voicelive.models.AvatarConfigTypes": "VoiceLive.AvatarConfigTypes",
+    "azure.ai.voicelive.models.PhotoAvatarBaseModes": "VoiceLive.PhotoAvatarBaseModes",
+    "azure.ai.voicelive.models.AvatarOutputProtocol": "VoiceLive.AvatarOutputProtocol",
     "azure.ai.voicelive.models.AudioTimestampType": "VoiceLive.AudioTimestampType",
     "azure.ai.voicelive.models.ToolChoiceLiteral": "VoiceLive.ToolChoiceLiteral",
     "azure.ai.voicelive.models.ResponseStatus": "VoiceLive.ResponseStatus",
     "azure.ai.voicelive.models.ResponseItemStatus": "VoiceLive.ResponseItemStatus",
+    "azure.ai.voicelive.models.RequestImageContentPartDetail": "VoiceLive.RequestImageContentPartDetail",
     "azure.ai.voicelive.models.ServerEventType": "VoiceLive.ServerEventType"
   }
 }

sdk/ai/azure-ai-voicelive/azure/ai/voicelive/models/__init__.py

Lines changed: 16 additions & 0 deletions

@@ -72,6 +72,7 @@
     OutputTextContentPart,
     OutputTokenDetails,
     RequestAudioContentPart,
+    RequestImageContentPart,
     RequestSession,
     RequestTextContentPart,
     Response,
@@ -126,6 +127,9 @@
     ServerEventResponseFunctionCallArgumentsDone,
     ServerEventResponseMcpCallArgumentsDelta,
     ServerEventResponseMcpCallArgumentsDone,
+    ServerEventResponseMcpCallCompleted,
+    ServerEventResponseMcpCallFailed,
+    ServerEventResponseMcpCallInProgress,
     ServerEventResponseOutputItemAdded,
     ServerEventResponseOutputItemDone,
     ServerEventResponseTextDelta,
@@ -151,6 +155,8 @@
 from ._enums import (  # type: ignore
     AnimationOutputType,
     AudioTimestampType,
+    AvatarConfigTypes,
+    AvatarOutputProtocol,
     AzureVoiceType,
     ClientEventType,
     ContentPartType,
@@ -164,6 +170,8 @@
     OpenAIVoiceName,
     OutputAudioFormat,
     PersonalVoiceModels,
+    PhotoAvatarBaseModes,
+    RequestImageContentPartDetail,
     ResponseItemStatus,
     ResponseStatus,
     ServerEventType,
@@ -234,6 +242,7 @@
     "OutputTextContentPart",
     "OutputTokenDetails",
     "RequestAudioContentPart",
+    "RequestImageContentPart",
     "RequestSession",
     "RequestTextContentPart",
     "Response",
@@ -288,6 +297,9 @@
     "ServerEventResponseFunctionCallArgumentsDone",
     "ServerEventResponseMcpCallArgumentsDelta",
     "ServerEventResponseMcpCallArgumentsDone",
+    "ServerEventResponseMcpCallCompleted",
+    "ServerEventResponseMcpCallFailed",
+    "ServerEventResponseMcpCallInProgress",
     "ServerEventResponseOutputItemAdded",
     "ServerEventResponseOutputItemDone",
     "ServerEventResponseTextDelta",
@@ -310,6 +322,8 @@
     "VoiceLiveErrorDetails",
     "AnimationOutputType",
     "AudioTimestampType",
+    "AvatarConfigTypes",
+    "AvatarOutputProtocol",
     "AzureVoiceType",
     "ClientEventType",
     "ContentPartType",
@@ -323,6 +337,8 @@
     "OpenAIVoiceName",
     "OutputAudioFormat",
     "PersonalVoiceModels",
+    "PhotoAvatarBaseModes",
+    "RequestImageContentPartDetail",
     "ResponseItemStatus",
     "ResponseStatus",
     "ServerEventType",
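The three newly exported MCP call events are what a client message loop would branch on to track a tool call's lifecycle. A hedged sketch with stand-in dataclasses: only the class names come from this diff, while the `item_id` and `error` fields are hypothetical illustrations, not the SDK's actual attributes.

```python
from dataclasses import dataclass

# Stand-ins for the newly exported server events. Only the class names are
# taken from the diff; the fields below are hypothetical.
@dataclass
class ServerEventResponseMcpCallInProgress:
    item_id: str

@dataclass
class ServerEventResponseMcpCallCompleted:
    item_id: str

@dataclass
class ServerEventResponseMcpCallFailed:
    item_id: str
    error: str

def describe_mcp_event(event) -> str:
    """Branch on the event class, as a client message loop might."""
    if isinstance(event, ServerEventResponseMcpCallInProgress):
        return f"MCP call {event.item_id}: in progress"
    if isinstance(event, ServerEventResponseMcpCallCompleted):
        return f"MCP call {event.item_id}: completed"
    if isinstance(event, ServerEventResponseMcpCallFailed):
        return f"MCP call {event.item_id}: failed ({event.error})"
    return "unhandled event"

print(describe_mcp_event(ServerEventResponseMcpCallCompleted(item_id="call_1")))
```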

sdk/ai/azure-ai-voicelive/azure/ai/voicelive/models/_enums.py

Lines changed: 41 additions & 0 deletions

@@ -26,6 +26,24 @@ class AudioTimestampType(str, Enum, metaclass=CaseInsensitiveEnumMeta):
     """Timestamps per word in the output audio."""


+class AvatarConfigTypes(str, Enum, metaclass=CaseInsensitiveEnumMeta):
+    """Avatar config types."""
+
+    VIDEO_AVATAR = "video-avatar"
+    """Video avatar"""
+    PHOTO_AVATAR = "photo-avatar"
+    """Photo avatar"""
+
+
+class AvatarOutputProtocol(str, Enum, metaclass=CaseInsensitiveEnumMeta):
+    """Avatar config output protocols."""
+
+    WEBRTC = "webrtc"
+    """WebRTC protocol, output the audio/video streams via WebRTC"""
+    WEBSOCKET = "websocket"
+    """WebSocket protocol, output the video frames over WebSocket"""
+
+
 class AzureVoiceType(str, Enum, metaclass=CaseInsensitiveEnumMeta):
     """Union of all supported Azure voice types."""

@@ -64,6 +82,7 @@ class ContentPartType(str, Enum, metaclass=CaseInsensitiveEnumMeta):

     INPUT_TEXT = "input_text"
     INPUT_AUDIO = "input_audio"
+    INPUT_IMAGE = "input_image"
     TEXT = "text"
     AUDIO = "audio"

@@ -162,6 +181,10 @@ class OpenAIVoiceName(str, Enum, metaclass=CaseInsensitiveEnumMeta):
     """Shimmer voice."""
     VERSE = "verse"
     """Verse voice."""
+    MARIN = "marin"
+    """Marin voice."""
+    CEDAR = "cedar"
+    """Cedar voice."""


 class OutputAudioFormat(str, Enum, metaclass=CaseInsensitiveEnumMeta):
@@ -190,6 +213,24 @@ class PersonalVoiceModels(str, Enum, metaclass=CaseInsensitiveEnumMeta):
     """Use the Phoenix V2 model."""


+class PhotoAvatarBaseModes(str, Enum, metaclass=CaseInsensitiveEnumMeta):
+    """Photo avatar base modes."""
+
+    VASA1 = "vasa-1"
+    """VASA-1 model"""
+
+
+class RequestImageContentPartDetail(str, Enum, metaclass=CaseInsensitiveEnumMeta):
+    """Specifies an image's detail level. Can be 'auto', 'low', 'high', or an unknown future value."""
+
+    AUTO = "auto"
+    """Automatically select an appropriate detail level."""
+    LOW = "low"
+    """Use a lower detail level to reduce bandwidth or cost."""
+    HIGH = "high"
+    """Use a higher detail level—potentially more resource-intensive."""
+
+
 class ResponseItemStatus(str, Enum, metaclass=CaseInsensitiveEnumMeta):
     """Indicates the processing status of a response item."""
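All of the new enums use the `CaseInsensitiveEnumMeta` metaclass (from `azure.core`), which lets strings like "WebRTC" resolve to the `webrtc` member regardless of casing. The sketch below is a rough, self-contained illustration of how such a metaclass can work; it is the pattern, not azure-core's actual implementation.

```python
from enum import Enum, EnumMeta

class CaseInsensitiveEnumMeta(EnumMeta):
    """Sketch: fall back to a case-insensitive value match on lookup.

    Illustrative only; azure.core's real metaclass differs in detail.
    """

    def __call__(cls, value, *args, **kwargs):
        try:
            # Normal exact-value lookup first.
            return super().__call__(value, *args, **kwargs)
        except ValueError:
            # Fall back: compare member values case-insensitively.
            for member in cls:
                if isinstance(value, str) and member.value.lower() == value.lower():
                    return member
            raise

class AvatarOutputProtocol(str, Enum, metaclass=CaseInsensitiveEnumMeta):
    WEBRTC = "webrtc"
    WEBSOCKET = "websocket"

# "WebRTC" and "webrtc" resolve to the same member.
print(AvatarOutputProtocol("WebRTC") is AvatarOutputProtocol.WEBRTC)  # True
```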
