Skip to content

Commit 154dbe5

Browse files
xitzhang and Xiting Zhang authored
[VoiceLive] Fix MCP error (Azure#47325)
* [VoiceLive] Fix MCP error
* update code owner
* update spell words
* update format

---------

Co-authored-by: Xiting Zhang <[email protected]>
1 parent ba9b59c commit 154dbe5

31 files changed

+1882
-44
lines changed

.github/CODEOWNERS

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@
9292
/sdk/ai/azure-ai-inference/ @dargilco @jhakulin @glharper @Azure/azure-java-sdk
9393

9494
# PRLabel: %Voice Live
95-
/sdk/ai/azure-ai-voicelive/ @rhurey @xitzhang
95+
/sdk/ai/azure-ai-voicelive/ @rhurey @xitzhang @amber-yujueWang
9696

9797
# ServiceLabel: %AKS
9898
# ServiceOwners: @Azure/aks-pm

.vscode/cspell.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -873,7 +873,9 @@
873873
"words": [
874874
"Dexec",
875875
"viseme",
876-
"VISEME"
876+
"VISEME",
877+
"webrtc",
878+
"WEBRTC"
877879
]
878880
},
879881
{

sdk/ai/azure-ai-voicelive/CHANGELOG.md

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,27 @@
44

55
### Features Added
66

7-
### Breaking Changes
8-
9-
### Bugs Fixed
10-
11-
### Other Changes
7+
- Added image input support for multimodal conversations:
8+
- `RequestImageContentPart` for including images in conversation messages with URL references
9+
- `RequestImageContentPartDetail` enum for controlling image detail level (auto, low, high)
10+
- `ContentPartType.INPUT_IMAGE` discriminator for image content parts
11+
- Added avatar configuration enhancements:
12+
- `AvatarConfiguration` class for configuring avatar streaming and behavior with ICE servers, character selection, style, and video parameters
13+
- `AvatarConfigTypes` enum for video and photo avatar types
14+
- `AvatarOutputProtocol` enum supporting WebRTC and WebSocket protocols
15+
- `PhotoAvatarBaseModes` enum with VASA-1 model support
16+
- Added token usage tracking improvements:
17+
- `CachedTokenDetails` for tracking cached text, audio, and image tokens
18+
- Enhanced `InputTokenDetails` with image token tracking and cached token details
19+
- Added MCP call lifecycle events:
20+
- `ServerEventResponseMcpCallInProgress` for tracking ongoing MCP calls
21+
- `ServerEventResponseMcpCallCompleted` for successful MCP call completion
22+
- `ServerEventResponseMcpCallFailed` for failed MCP calls
23+
- Added two new OpenAI voices: `OpenAIVoiceName.MARIN` and `OpenAIVoiceName.CEDAR`
24+
- Enhanced `AzurePersonalVoice` with additional customization options:
25+
- Custom lexicon URL support for pronunciation customization
26+
- Locale preferences with `preferLocales` for multilingual scenarios
27+
- Voice style, pitch, rate, and volume controls for fine-tuned voice characteristics
1228

1329
## 1.0.0-beta.2 (2025-11-14)
1430

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
// Copyright (c) Microsoft Corporation. All rights reserved.
2+
// Licensed under the MIT License.
3+
// Code generated by Microsoft (R) TypeSpec Code Generator.
4+
package com.azure.ai.voicelive.models;
5+
6+
import com.azure.core.annotation.Generated;
7+
import com.azure.core.util.ExpandableStringEnum;
8+
import java.util.Collection;
9+
10+
/**
11+
* Avatar config types.
12+
*/
13+
public final class AvatarConfigTypes extends ExpandableStringEnum<AvatarConfigTypes> {
14+
15+
/**
16+
* Video avatar.
17+
*/
18+
@Generated
19+
public static final AvatarConfigTypes VIDEO_AVATAR = fromString("video-avatar");
20+
21+
/**
22+
* Photo avatar.
23+
*/
24+
@Generated
25+
public static final AvatarConfigTypes PHOTO_AVATAR = fromString("photo-avatar");
26+
27+
/**
28+
* Creates a new instance of AvatarConfigTypes value.
29+
*
30+
* @deprecated Use the {@link #fromString(String)} factory method.
31+
*/
32+
@Generated
33+
@Deprecated
34+
public AvatarConfigTypes() {
35+
}
36+
37+
/**
38+
* Creates or finds a AvatarConfigTypes from its string representation.
39+
*
40+
* @param name a name to look for.
41+
* @return the corresponding AvatarConfigTypes.
42+
*/
43+
@Generated
44+
public static AvatarConfigTypes fromString(String name) {
45+
return fromString(name, AvatarConfigTypes.class);
46+
}
47+
48+
/**
49+
* Gets known AvatarConfigTypes values.
50+
*
51+
* @return known AvatarConfigTypes values.
52+
*/
53+
@Generated
54+
public static Collection<AvatarConfigTypes> values() {
55+
return values(AvatarConfigTypes.class);
56+
}
57+
}

sdk/ai/azure-ai-voicelive/src/main/java/com/azure/ai/voicelive/models/AvatarConfiguration.java

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,9 +155,13 @@ public JsonWriter toJson(JsonWriter jsonWriter) throws IOException {
155155
jsonWriter.writeStartObject();
156156
jsonWriter.writeStringField("character", this.character);
157157
jsonWriter.writeBooleanField("customized", this.customized);
158+
jsonWriter.writeStringField("type", this.type == null ? null : this.type.toString());
158159
jsonWriter.writeArrayField("ice_servers", this.iceServers, (writer, element) -> writer.writeJson(element));
159160
jsonWriter.writeStringField("style", this.style);
161+
jsonWriter.writeStringField("model", this.model == null ? null : this.model.toString());
160162
jsonWriter.writeJsonField("video", this.video);
163+
jsonWriter.writeStringField("output_protocol",
164+
this.outputProtocol == null ? null : this.outputProtocol.toString());
161165
return jsonWriter.writeEndObject();
162166
}
163167

@@ -175,31 +179,127 @@ public static AvatarConfiguration fromJson(JsonReader jsonReader) throws IOExcep
175179
return jsonReader.readObject(reader -> {
176180
String character = null;
177181
boolean customized = false;
182+
AvatarConfigTypes type = null;
178183
List<IceServer> iceServers = null;
179184
String style = null;
185+
PhotoAvatarBaseModes model = null;
180186
VideoParams video = null;
187+
AvatarOutputProtocol outputProtocol = null;
181188
while (reader.nextToken() != JsonToken.END_OBJECT) {
182189
String fieldName = reader.getFieldName();
183190
reader.nextToken();
184191
if ("character".equals(fieldName)) {
185192
character = reader.getString();
186193
} else if ("customized".equals(fieldName)) {
187194
customized = reader.getBoolean();
195+
} else if ("type".equals(fieldName)) {
196+
type = AvatarConfigTypes.fromString(reader.getString());
188197
} else if ("ice_servers".equals(fieldName)) {
189198
iceServers = reader.readArray(reader1 -> IceServer.fromJson(reader1));
190199
} else if ("style".equals(fieldName)) {
191200
style = reader.getString();
201+
} else if ("model".equals(fieldName)) {
202+
model = PhotoAvatarBaseModes.fromString(reader.getString());
192203
} else if ("video".equals(fieldName)) {
193204
video = VideoParams.fromJson(reader);
205+
} else if ("output_protocol".equals(fieldName)) {
206+
outputProtocol = AvatarOutputProtocol.fromString(reader.getString());
194207
} else {
195208
reader.skipChildren();
196209
}
197210
}
198211
AvatarConfiguration deserializedAvatarConfiguration = new AvatarConfiguration(character, customized);
212+
deserializedAvatarConfiguration.type = type;
199213
deserializedAvatarConfiguration.iceServers = iceServers;
200214
deserializedAvatarConfiguration.style = style;
215+
deserializedAvatarConfiguration.model = model;
201216
deserializedAvatarConfiguration.video = video;
217+
deserializedAvatarConfiguration.outputProtocol = outputProtocol;
202218
return deserializedAvatarConfiguration;
203219
});
204220
}
221+
222+
/*
223+
* Type of avatar to use.
224+
*/
225+
@Generated
226+
private AvatarConfigTypes type;
227+
228+
/*
229+
* Base model to use for the avatar. Required for photo avatar.
230+
*/
231+
@Generated
232+
private PhotoAvatarBaseModes model;
233+
234+
/*
235+
* Output protocol for avatar streaming. Default is 'webrtc'.
236+
*/
237+
@Generated
238+
private AvatarOutputProtocol outputProtocol;
239+
240+
/**
241+
* Get the type property: Type of avatar to use.
242+
*
243+
* @return the type value.
244+
*/
245+
@Generated
246+
public AvatarConfigTypes getType() {
247+
return this.type;
248+
}
249+
250+
/**
251+
* Set the type property: Type of avatar to use.
252+
*
253+
* @param type the type value to set.
254+
* @return the AvatarConfiguration object itself.
255+
*/
256+
@Generated
257+
public AvatarConfiguration setType(AvatarConfigTypes type) {
258+
this.type = type;
259+
return this;
260+
}
261+
262+
/**
263+
* Get the model property: Base model to use for the avatar. Required for photo avatar.
264+
*
265+
* @return the model value.
266+
*/
267+
@Generated
268+
public PhotoAvatarBaseModes getModel() {
269+
return this.model;
270+
}
271+
272+
/**
273+
* Set the model property: Base model to use for the avatar. Required for photo avatar.
274+
*
275+
* @param model the model value to set.
276+
* @return the AvatarConfiguration object itself.
277+
*/
278+
@Generated
279+
public AvatarConfiguration setModel(PhotoAvatarBaseModes model) {
280+
this.model = model;
281+
return this;
282+
}
283+
284+
/**
285+
* Get the outputProtocol property: Output protocol for avatar streaming. Default is 'webrtc'.
286+
*
287+
* @return the outputProtocol value.
288+
*/
289+
@Generated
290+
public AvatarOutputProtocol getOutputProtocol() {
291+
return this.outputProtocol;
292+
}
293+
294+
/**
295+
* Set the outputProtocol property: Output protocol for avatar streaming. Default is 'webrtc'.
296+
*
297+
* @param outputProtocol the outputProtocol value to set.
298+
* @return the AvatarConfiguration object itself.
299+
*/
300+
@Generated
301+
public AvatarConfiguration setOutputProtocol(AvatarOutputProtocol outputProtocol) {
302+
this.outputProtocol = outputProtocol;
303+
return this;
304+
}
205305
}
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
// Copyright (c) Microsoft Corporation. All rights reserved.
2+
// Licensed under the MIT License.
3+
// Code generated by Microsoft (R) TypeSpec Code Generator.
4+
package com.azure.ai.voicelive.models;
5+
6+
import com.azure.core.annotation.Generated;
7+
import com.azure.core.util.ExpandableStringEnum;
8+
import java.util.Collection;
9+
10+
/**
11+
* Avatar config output protocols.
12+
*/
13+
public final class AvatarOutputProtocol extends ExpandableStringEnum<AvatarOutputProtocol> {
14+
15+
/**
16+
* WebRTC protocol, output the audio/video streams via WebRTC.
17+
*/
18+
@Generated
19+
public static final AvatarOutputProtocol WEBRTC = fromString("webrtc");
20+
21+
/**
22+
* WebSocket protocol, output the video frames over WebSocket.
23+
*/
24+
@Generated
25+
public static final AvatarOutputProtocol WEBSOCKET = fromString("websocket");
26+
27+
/**
28+
* Creates a new instance of AvatarOutputProtocol value.
29+
*
30+
* @deprecated Use the {@link #fromString(String)} factory method.
31+
*/
32+
@Generated
33+
@Deprecated
34+
public AvatarOutputProtocol() {
35+
}
36+
37+
/**
38+
* Creates or finds a AvatarOutputProtocol from its string representation.
39+
*
40+
* @param name a name to look for.
41+
* @return the corresponding AvatarOutputProtocol.
42+
*/
43+
@Generated
44+
public static AvatarOutputProtocol fromString(String name) {
45+
return fromString(name, AvatarOutputProtocol.class);
46+
}
47+
48+
/**
49+
* Gets known AvatarOutputProtocol values.
50+
*
51+
* @return known AvatarOutputProtocol values.
52+
*/
53+
@Generated
54+
public static Collection<AvatarOutputProtocol> values() {
55+
return values(AvatarOutputProtocol.class);
56+
}
57+
}

0 commit comments

Comments
 (0)