Skip to content

Commit e9ac896

Browse files
authored
fix (gemini streaming tts): change default audio encoding and voice (#4393)
1 parent c59b482 commit e9ac896

File tree

2 files changed

+17
-8
lines changed
  • livekit-plugins/livekit-plugins-google/livekit/plugins/google

2 files changed

+17
-8
lines changed

livekit-plugins/livekit-plugins-google/livekit/plugins/google/models.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,3 +207,7 @@
207207
"gemini-2.0-pro-exp-02-05",
208208
"gemini-1.5-pro",
209209
]
210+
211+
GeminiTTSModels = Literal[
212+
"gemini-2.5-flash-tts", "gemini-2.5-flash-lite-preview-tts", "gemini-2.5-pro-tts"
213+
]

livekit-plugins/livekit-plugins-google/livekit/plugins/google/tts.py

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,9 @@
3232
from livekit.agents.utils import is_given
3333

3434
from .log import logger
35-
from .models import Gender, SpeechLanguages
35+
from .models import GeminiTTSModels, Gender, SpeechLanguages
3636

3737
NUM_CHANNELS = 1
38-
DEFAULT_VOICE_NAME = "en-US-Chirp3-HD-Charon"
3938
DEFAULT_LANGUAGE = "en-US"
4039
DEFAULT_GENDER = "neutral"
4140

@@ -65,15 +64,15 @@ def __init__(
6564
gender: NotGivenOr[Gender | str] = NOT_GIVEN,
6665
voice_name: NotGivenOr[str] = NOT_GIVEN,
6766
voice_cloning_key: NotGivenOr[str] = NOT_GIVEN,
68-
model_name: NotGivenOr[str] = NOT_GIVEN,
67+
model_name: GeminiTTSModels | str = "gemini-2.5-flash-tts",
6968
prompt: NotGivenOr[str] = NOT_GIVEN,
7069
sample_rate: int = 24000,
7170
pitch: int = 0,
7271
effects_profile_id: str = "",
7372
speaking_rate: float = 1.0,
7473
volume_gain_db: float = 0.0,
7574
location: str = "global",
76-
audio_encoding: texttospeech.AudioEncoding = texttospeech.AudioEncoding.OGG_OPUS, # type: ignore
75+
audio_encoding: texttospeech.AudioEncoding = texttospeech.AudioEncoding.PCM, # type: ignore
7776
credentials_info: NotGivenOr[dict] = NOT_GIVEN,
7877
credentials_file: NotGivenOr[str] = NOT_GIVEN,
7978
tokenizer: NotGivenOr[tokenize.SentenceTokenizer] = NOT_GIVEN,
@@ -94,7 +93,7 @@ def __init__(
9493
gender (Gender | str, optional): Voice gender ("male", "female", "neutral"). Default is "neutral".
9594
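voice_name (str, optional): Specific voice name. If not given, defaults to "en-US-Chirp3-HD-Charon" when model_name is "chirp_3" and to "Charon" otherwise. Not used when voice_cloning_key is given. See https://docs.cloud.google.com/text-to-speech/docs/gemini-tts#voice_options for supported voices in Gemini TTS models.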
9695
voice_cloning_key (str, optional): Voice clone key. Created via https://cloud.google.com/text-to-speech/docs/chirp3-instant-custom-voice
97-
model_name (str, optional): Model name for TTS (e.g., "gemini-2.5-flash-tts"). Enables Gemini TTS models with streaming support.
96+
model_name (GeminiTTSModels | str, optional): Model name for TTS (e.g., "gemini-2.5-flash-tts", "chirp_3"). Default is "gemini-2.5-flash-tts".
9897
prompt (str, optional): Style prompt for Gemini TTS models. Controls tone, style, and speaking characteristics. Only applied to first input chunk in streaming mode.
9998
sample_rate (int, optional): Audio sample rate in Hz. Default is 24000.
10099
location (str, optional): Location for the TTS client. Default is "global".
@@ -134,14 +133,20 @@ def __init__(
134133
language_code=lang,
135134
ssml_gender=ssml_gender,
136135
)
137-
if is_given(model_name):
136+
if model_name != "chirp_3": # voice_params.model_name must not be set for Chirp 3
138137
voice_params.model_name = model_name
138+
139139
if is_given(voice_cloning_key):
140140
voice_params.voice_clone = texttospeech.VoiceCloneParams(
141141
voice_cloning_key=voice_cloning_key,
142142
)
143143
else:
144-
voice_params.name = voice_name if is_given(voice_name) else DEFAULT_VOICE_NAME
144+
if is_given(voice_name):
145+
voice_params.name = voice_name
146+
elif model_name == "chirp_3":
147+
voice_params.name = "en-US-Chirp3-HD-Charon"
148+
else:
149+
voice_params.name = "Charon"
145150

146151
if not is_given(tokenizer):
147152
tokenizer = tokenize.blingfire.SentenceTokenizer()
@@ -160,7 +165,7 @@ def __init__(
160165
custom_pronunciations=pronunciations,
161166
enable_ssml=enable_ssml,
162167
use_markup=use_markup,
163-
model_name=model_name if is_given(model_name) else None,
168+
model_name=model_name,
164169
prompt=prompt if is_given(prompt) else None,
165170
)
166171
self._streams = weakref.WeakSet[SynthesizeStream]()

0 commit comments

Comments
 (0)