Skip to content

Commit 4fe8321

Browse files
committed
feat: Enhance TTS and LLM configurations with new parameters and improved structure for better flexibility
1 parent bb3a706 commit 4fe8321

File tree

4 files changed

+267
-27
lines changed

4 files changed

+267
-27
lines changed

examples/convoai/service/service.go

Lines changed: 31 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,15 @@ func (s *Service) RunWithCustomTTS(ttsVendor req.TTSVendor, ttsParam req.TTSVend
126126
Asr: &req.JoinPropertiesAsrBody{
127127
Language: "zh-CN",
128128
},
129+
Parameters: &req.Parameters{
130+
FixedParams: &req.ParametersStructData{
131+
SilenceConfig: &req.SilenceConfig{
132+
TimeoutMs: agoraUtils.Ptr(1200),
133+
Action: agoraUtils.Ptr("speak"),
134+
Content: agoraUtils.Ptr("Hello, how can I help you?"),
135+
},
136+
},
137+
},
129138
})
130139
if err != nil {
131140
log.Fatalln(err)
@@ -216,6 +225,19 @@ func (s *Service) RunWithCustomTTS(ttsVendor req.TTSVendor, ttsParam req.TTSVend
216225

217226
updateResp, err := convoaiClient.Update(ctx, agentId, &req.UpdateReqBody{
218227
Token: updateToken,
228+
LLM: &req.UpdateLLMBody{
229+
SystemMessages: []map[string]any{
230+
{
231+
"role": "system",
232+
"content": "You are a helpful chatbot, and you are a new assistant.",
233+
},
234+
},
235+
Params: map[string]any{
236+
"model": llmModel,
237+
"max_tokens": 2048,
238+
"username": "Tom",
239+
},
240+
},
219241
})
220242
if err != nil {
221243
log.Fatalln(err)
@@ -350,14 +372,14 @@ func (s *Service) RunWithMinimaxTTS() {
350372
GroupId: ttsGroupId,
351373
Key: ttsGroupKey,
352374
Model: ttsGroupModel,
353-
VoiceSetting: req.TTSMinimaxVendorVoiceSettingParam{
375+
VoiceSetting: &req.TTSMinimaxVendorVoiceSettingParam{
354376
VoiceId: "female-shaonv",
355377
Speed: 1,
356378
Vol: 1,
357379
Pitch: 0,
358380
Emotion: "happy",
359381
},
360-
AudioSetting: req.TTSMinimaxVendorAudioSettingParam{
382+
AudioSetting: &req.TTSMinimaxVendorAudioSettingParam{
361383
SampleRate: 16000,
362384
},
363385
}
@@ -382,11 +404,12 @@ func (s *Service) RunWithMicrosoftTTS() {
382404
}
383405

384406
ttsParam := req.TTSMicrosoftVendorParams{
385-
Key: ttsKey,
386-
Region: ttsRegion,
387-
VoiceName: ttsVoiceName,
388-
Rate: 1.8,
389-
Volume: 70,
407+
Key: ttsKey,
408+
Region: ttsRegion,
409+
VoiceName: ttsVoiceName,
410+
Speed: 1.0,
411+
Volume: 70,
412+
SampleRate: 24000,
390413
}
391414

392415
s.RunWithCustomTTS(req.MicrosoftTTSVendor, ttsParam)
@@ -409,7 +432,7 @@ func (s *Service) RunWithElevenLabsTTS() {
409432
}
410433

411434
ttsParam := req.TTSElevenLabsVendorParams{
412-
APIKey: ttsApiKey,
435+
Key: ttsApiKey,
413436
ModelId: ttsModelId,
414437
VoiceId: ttsVoiceId,
415438
}

services/convoai/api/interrupt.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ func (i *Interrupt) buildPath(agentId string) string {
3636

3737
func (i *Interrupt) Do(ctx context.Context, agentId string) (*resp.InterruptResp, error) {
3838
path := i.buildPath(agentId)
39-
responseData, err := doRESTWithRetry(ctx, i.module, i.logger, i.retryCount, i.client, path, http.MethodPost, nil)
39+
responseData, err := doRESTWithRetry(ctx, i.module, i.logger, i.retryCount, i.client, path, http.MethodPost, map[string]any{})
4040
if err != nil {
4141
var internalErr *agora.InternalErr
4242
if !errors.As(err, &internalErr) {

services/convoai/req/join.go

Lines changed: 217 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
package req
22

3+
import (
4+
"encoding/json"
5+
)
6+
37
// @brief Defines advanced feature configurations for the agent to join the RTC channel
48
//
59
// @since v0.7.0
@@ -27,27 +31,40 @@ type TTSVendorParamsInterface interface {
2731
}
2832

2933
type TTSMinimaxVendorVoiceSettingParam struct {
30-
VoiceId string `json:"voice_id"`
31-
Speed float32 `json:"speed"`
32-
Vol float32 `json:"vol"`
33-
Pitch int `json:"pitch"`
34-
Emotion string `json:"emotion"`
34+
VoiceId string `json:"voice_id"`
35+
Speed float32 `json:"speed"`
36+
Vol float32 `json:"vol"`
37+
Pitch int `json:"pitch"`
38+
Emotion string `json:"emotion"`
39+
LatexRender bool `json:"latex_render"`
40+
EnglishNormalization bool `json:"english_normalization"`
3541
}
3642

3743
type TTSMinimaxVendorAudioSettingParam struct {
3844
SampleRate int `json:"sample_rate"`
3945
}
4046

47+
type PronunciationDictParam struct {
48+
Tone []string `json:"tone"`
49+
}
50+
51+
type TimberWeightsParam struct {
52+
VoiceId string `json:"voice_id"`
53+
Weight int `json:"weight"`
54+
}
55+
4156
// @brief Defines the Minimax vendor parameters for the Text-to-Speech (TTS) module when the agent joins the RTC channel, see
4257
// https://platform.minimaxi.com/document/T2A%20V2 for details
4358
//
4459
// @since v0.7.0
4560
type TTSMinimaxVendorParams struct {
46-
GroupId string `json:"group_id"`
47-
Key string `json:"key"`
48-
Model string `json:"model"`
49-
VoiceSetting TTSMinimaxVendorVoiceSettingParam `json:"voice_setting"`
50-
AudioSetting TTSMinimaxVendorAudioSettingParam `json:"audio_setting"`
61+
GroupId string `json:"group_id"`
62+
Key string `json:"key"`
63+
Model string `json:"model"`
64+
VoiceSetting *TTSMinimaxVendorVoiceSettingParam `json:"voice_setting,omitempty"`
65+
AudioSetting *TTSMinimaxVendorAudioSettingParam `json:"audio_setting,omitempty"`
66+
PronunciationDict *PronunciationDictParam `json:"pronunciation_dict,omitempty"`
67+
TimberWeights []TimberWeightsParam `json:"timber_weights,omitempty"`
5168
}
5269

5370
func (TTSMinimaxVendorParams) VendorParam() {}
@@ -87,19 +104,53 @@ type TTSBytedanceVendorParams struct {
87104
func (TTSBytedanceVendorParams) VendorParam() {}
88105

89106
type TTSMicrosoftVendorParams struct {
90-
Key string `json:"key"`
91-
Region string `json:"region"`
92-
VoiceName string `json:"voice_name"`
93-
Rate float32 `json:"rate"`
94-
Volume float32 `json:"volume"`
107+
// The API key used for authentication.(Required)
108+
Key string `json:"key"`
109+
// The Azure region where the speech service is hosted.(Required)
110+
Region string `json:"region"`
111+
// The identifier for the selected voice for speech synthesis.(Optional)
112+
VoiceName string `json:"voice_name"`
113+
// Indicates the speaking rate of the text.(Optional)
114+
//
115+
// The rate can be applied at the word or sentence level and should be between 0.5 and 2.0 times the original audio speed.
116+
Speed float32 `json:"speed"`
117+
// Specifies the audio volume as a number between 0.0 and 100.0, where 0.0 is the quietest and 100.0 is the loudest.
118+
//
119+
// For example, a value of 75 sets the volume to 75% of the maximum.
120+
//
121+
// The default value is 100.
122+
Volume float32 `json:"volume"`
123+
// Specifies the audio sampling rate in Hz.(Optional)
124+
//
125+
// The default value is 24000.
126+
SampleRate int `json:"sample_rate"`
95127
}
96128

97129
func (TTSMicrosoftVendorParams) VendorParam() {}
98130

99131
type TTSElevenLabsVendorParams struct {
100-
APIKey string `json:"api_key"`
132+
// The API key used for authentication.(Required)
133+
Key string `json:"key"`
134+
// Identifier of the model to be used.(Required)
101135
ModelId string `json:"model_id"`
136+
// The identifier for the selected voice for speech synthesis.(Required)
102137
VoiceId string `json:"voice_id"`
138+
// Specifies the audio sampling rate in Hz.(Optional)
139+
//
140+
// The default value is 24000.
141+
SampleRate int `json:"sample_rate"`
142+
// The stability for voice settings.(Optional)
143+
Stability float32 `json:"stability"`
144+
// Determines how closely the AI should adhere to the original voice when attempting to replicate it.
145+
SimilarityBoost float32 `json:"similarity_boost"`
146+
// Determines the style exaggeration of the voice. This setting attempts to amplify the style of the original speaker.
147+
//
148+
// It does consume additional computational resources and might increase latency if set to anything other than 0.
149+
Style float32 `json:"style"`
150+
// This setting boosts the similarity to the original speaker.
151+
//
152+
// Using this setting requires a slightly higher computational load, which in turn increases latency.
153+
UseSpeakerBoost bool `json:"use_speaker_boost"`
103154
}
104155

105156
func (TTSElevenLabsVendorParams) VendorParam() {}
@@ -208,7 +259,35 @@ type JoinPropertiesCustomLLMBody struct {
208259
// and then recalculate the silence time.
209260
//
210261
// When silence_timeout is set to 0, this parameter is ignored.
262+
//
263+
// Deprecated: Use [ParametersStructData.SilenceConfig] (set through [Parameters.FixedParams]) instead
264+
//
265+
// @deprecated This field is deprecated since v0.11.0
211266
SilenceMessage *string `json:"silence_message,omitempty"`
267+
// LLM provider(Optional), supports the following settings:
268+
//
269+
// - "custom": Custom LLM provider.
270+
// When you set this option, the agent includes the following fields, in addition to role and content when making requests to the custom LLM:
271+
// - turn_id: A unique identifier for each conversation turn. It starts from 0 and increments with each turn. One user-agent interaction corresponds to one turn_id.
272+
// - timestamp: The request timestamp, in milliseconds.
273+
// - "aliyun": Aliyun LLM provider.(Only available in China Mainland service region)
274+
//
275+
// - "bytedance": Bytedance LLM provider.(Only available in China Mainland service region)
276+
//
277+
// - "deepseek": DeepSeek LLM provider.(Only available in China Mainland service region)
278+
//
279+
// - "tencent": Tencent LLM provider.(Only available in China Mainland service region)
280+
//
281+
Vendor string `json:"vendor,omitempty"`
282+
283+
// The request style for chat completion.(Optional)(Only available in global service region)
284+
//
285+
// - "openai": OpenAI style.(Default)
286+
//
287+
// - "gemini": Gemini style.
288+
//
289+
// - "anthropic": Anthropic style.
290+
Style string `json:"style,omitempty"`
212291
}
213292

214293
// @brief Defines the Voice Activity Detection (VAD) configuration for the agent to join the RTC channel
@@ -289,11 +368,19 @@ type JoinPropertiesReqBody struct {
289368
// - 0 (default): Do not enable this feature.
290369
//
291370
// - (0,60]: Must also set llm.silence_message to enable the feature.
371+
//
372+
// Deprecated: Use [ParametersStructData.SilenceConfig] (set through [Parameters.FixedParams]) instead
373+
//
374+
// @deprecated This field is deprecated since v0.11.0
292375
SilenceTimeout *int `json:"silence_timeout,omitempty"`
293376

294377
// Agent user ID in the RTM channel
295378
//
296379
// Only valid when advanced_features.enable_rtm is true
380+
//
381+
// Deprecated: Use AgentRtcUId instead
382+
//
383+
// @deprecated This field is deprecated since v0.11.0
297384
AgentRtmUId *string `json:"agent_rtm_uid,omitempty"`
298385
// Advanced feature configurations (optional), see JoinPropertiesAdvancedFeaturesBody for details
299386
AdvancedFeatures *JoinPropertiesAdvancedFeaturesBody `json:"advanced_features,omitempty"`
@@ -304,6 +391,118 @@ type JoinPropertiesReqBody struct {
304391
// Voice Activity Detection (VAD) configuration (optional), see JoinPropertiesVadBody for details
305392
Vad *JoinPropertiesVadBody `json:"vad,omitempty"`
306393
// Automatic Speech Recognition (ASR) configuration (optional), see JoinPropertiesAsrBody for details
307-
Asr *JoinPropertiesAsrBody `json:"asr,omitempty"`
308-
Parameters map[string]any `json:"parameters,omitempty"`
394+
Asr *JoinPropertiesAsrBody `json:"asr,omitempty"`
395+
// Conversation turn detection settings
396+
TurnDetection *TurnDetectionBody `json:"turn_detection,omitempty"`
397+
// Agent parameters configuration (optional), see Parameters for details
398+
Parameters *Parameters `json:"parameters,omitempty"`
399+
}
400+
401+
// @brief Conversation turn detection settings
402+
//
403+
// @since v0.11.0
404+
type TurnDetectionBody struct {
405+
// When the agent is interacting (speaking or thinking), the mode of human voice interrupting the agent's behavior, support the following values:
406+
//
407+
// - "interrupt"(Default): Interrupt mode, human voice immediately interrupts the agent's interaction.
408+
// The agent will terminate the current interaction and directly process the human voice input.
409+
//
410+
// - "append": Append mode, human voice does not interrupt the agent.
411+
// The agent will process the human voice request after the current interaction ends.
412+
//
413+
// - "ignore": Ignore mode, the agent ignores the human voice request.
414+
// If the agent is speaking or thinking and receives human voice during the process,
415+
// the agent will directly ignore and discard the human voice request, not storing it in the context.
416+
InterruptMode string `json:"interrupt_mode,omitempty"`
417+
}
418+
// @brief Structured data for parameters
//
// @since v0.11.0
type ParametersStructData struct {
	// Silence configuration for the agent
	SilenceConfig *SilenceConfig `json:"silence_config,omitempty"`
}

// @brief Silence configuration for the agent
//
// @since v0.11.0
type SilenceConfig struct {
	// Agent maximum silence time (ms).(Optional)
	//
	// After the agent is created and a user joins the channel,
	// the duration of the agent's non-listening, thinking, or speaking state is called the agent's silence time.
	//
	// When the silence time reaches the set value, the agent will report the silence prompt message.
	//
	// This feature can be used to let the agent remind users when users are inactive.
	//
	// Set 0: Do not enable this feature.
	//
	// Set to (0,60000]: Must also set content to enable normal reporting of silence prompts, otherwise the setting is invalid.
	TimeoutMs *int `json:"timeout_ms,omitempty"`

	// When the silence time reaches the set value, the agent will take the following actions(Optional):
	//
	//  - "speak": Use TTS module to report the silence message (Default)
	//
	//  - "think": Append the silence message to the end of the context and pass it to LLM
	Action *string `json:"action,omitempty"`

	// Content of the silence message (Optional)
	//
	// The content will be used in different ways according to the settings in the action.
	Content *string `json:"content,omitempty"`
}

// @brief Agent parameters configuration
//
// @note Parameters carries both free-form extra data and type-safe fixed data.
// On encoding, both are flattened into one JSON object. When the same top-level
// key is present in both, the value from ExtraParams replaces the value from
// FixedParams (the override is per top-level key, nested objects are not merged).
//
// @since v0.11.0
type Parameters struct {
	// Extra parameters for flexible key-value pairs
	ExtraParams map[string]any `json:"-"`
	// Fixed parameters for type-safe parameters
	FixedParams *ParametersStructData `json:"-"`
}

// MarshalJSON implements json.Marshaler.
//
// It flattens FixedParams and ExtraParams into a single JSON object; keys from
// ExtraParams win over same-named keys from FixedParams.
//
// The receiver is a value on purpose: both fields are tagged `json:"-"`, so a
// pointer-receiver method would be skipped for non-addressable Parameters values
// and they would silently encode as "{}". A value receiver is part of the method
// set of both Parameters and *Parameters, so existing pointer usage is unaffected.
func (p Parameters) MarshalJSON() ([]byte, error) {
	merged := make(map[string]any, len(p.ExtraParams))

	// Round-trip the typed struct through JSON so that its field tags
	// (names, omitempty) decide which keys end up in the merged object.
	if p.FixedParams != nil {
		raw, err := json.Marshal(p.FixedParams)
		if err != nil {
			return nil, err
		}
		if err := json.Unmarshal(raw, &merged); err != nil {
			return nil, err
		}
	}

	// Extra parameters are applied last so they override fixed ones.
	for k, v := range p.ExtraParams {
		merged[k] = v
	}

	return json.Marshal(merged)
}

// UnmarshalJSON implements json.Unmarshaler.
//
// Every key of the incoming object is stored in ExtraParams. Keys known to
// ParametersStructData are additionally decoded into FixedParams, so typed data
// written by MarshalJSON can be read back in typed form. FixedParams is left nil
// when the input contains none of the known keys.
//
// Known keys are kept in ExtraParams as well so that no input data is lost;
// because ExtraParams wins in MarshalJSON, remove the mirrored key from
// ExtraParams before re-encoding if FixedParams was modified after decoding.
//
// An error is returned if data is not a JSON object (or null), or if a known
// key holds a value that does not match its typed definition. On error the
// receiver is left unchanged.
func (p *Parameters) UnmarshalJSON(data []byte) error {
	var extra map[string]any
	if err := json.Unmarshal(data, &extra); err != nil {
		return err
	}

	var fixed ParametersStructData
	if err := json.Unmarshal(data, &fixed); err != nil {
		return err
	}

	p.ExtraParams = extra
	p.FixedParams = nil
	if fixed.SilenceConfig != nil {
		p.FixedParams = &fixed
	}
	return nil
}

0 commit comments

Comments
 (0)