fix (gemini streaming tts): change default audio encoding and voice (#4393)

tinalenguyen · web-flow · commit e9ac896edaeb · 2025-12-27T16:44:13.000-05:00
diff --git a/livekit-plugins/livekit-plugins-google/livekit/plugins/google/models.py b/livekit-plugins/livekit-plugins-google/livekit/plugins/google/models.py
@@ -207,3 +207,7 @@
     "gemini-2.0-pro-exp-02-05",
     "gemini-1.5-pro",
 ]
+
+GeminiTTSModels = Literal[
+    "gemini-2.5-flash-tts", "gemini-2.5-flash-lite-preview-tts", "gemini-2.5-pro-tts"
+]
diff --git a/livekit-plugins/livekit-plugins-google/livekit/plugins/google/tts.py b/livekit-plugins/livekit-plugins-google/livekit/plugins/google/tts.py
@@ -32,10 +32,9 @@
 from livekit.agents.utils import is_given
 
 from .log import logger
-from .models import Gender, SpeechLanguages
+from .models import GeminiTTSModels, Gender, SpeechLanguages
 
 NUM_CHANNELS = 1
-DEFAULT_VOICE_NAME = "en-US-Chirp3-HD-Charon"
 DEFAULT_LANGUAGE = "en-US"
 DEFAULT_GENDER = "neutral"
 
@@ -65,15 +64,15 @@ def __init__(
         gender: NotGivenOr[Gender | str] = NOT_GIVEN,
         voice_name: NotGivenOr[str] = NOT_GIVEN,
         voice_cloning_key: NotGivenOr[str] = NOT_GIVEN,
-        model_name: NotGivenOr[str] = NOT_GIVEN,
+        model_name: GeminiTTSModels | str = "gemini-2.5-flash-tts",
         prompt: NotGivenOr[str] = NOT_GIVEN,
         sample_rate: int = 24000,
         pitch: int = 0,
         effects_profile_id: str = "",
         speaking_rate: float = 1.0,
         volume_gain_db: float = 0.0,
         location: str = "global",
-        audio_encoding: texttospeech.AudioEncoding = texttospeech.AudioEncoding.OGG_OPUS,  # type: ignore
+        audio_encoding: texttospeech.AudioEncoding = texttospeech.AudioEncoding.PCM,  # type: ignore
         credentials_info: NotGivenOr[dict] = NOT_GIVEN,
         credentials_file: NotGivenOr[str] = NOT_GIVEN,
         tokenizer: NotGivenOr[tokenize.SentenceTokenizer] = NOT_GIVEN,
@@ -94,7 +93,7 @@ def __init__(
             gender (Gender | str, optional): Voice gender ("male", "female", "neutral"). Default is "neutral".
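-            voice_name (str, optional): Specific voice name. Default is an empty string. See https://docs.cloud.google.com/text-to-speech/docs/gemini-tts#voice_options for supported voice in Gemini TTS models.
+            voice_name (str, optional): Specific voice name. When not given (and no voice_cloning_key is set), defaults to "en-US-Chirp3-HD-Charon" if model_name is "chirp_3", otherwise "Charon". See https://docs.cloud.google.com/text-to-speech/docs/gemini-tts#voice_options for supported voices in Gemini TTS models.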
             voice_cloning_key (str, optional): Voice clone key. Created via https://cloud.google.com/text-to-speech/docs/chirp3-instant-custom-voice
-            model_name (str, optional): Model name for TTS (e.g., "gemini-2.5-flash-tts"). Enables Gemini TTS models with streaming support.
+            model_name (GeminiTTSModels | str, optional): Model name for TTS (e.g., "gemini-2.5-flash-tts", "chirp_3"). Default is "gemini-2.5-flash-tts".
             prompt (str, optional): Style prompt for Gemini TTS models. Controls tone, style, and speaking characteristics. Only applied to first input chunk in streaming mode.
             sample_rate (int, optional): Audio sample rate in Hz. Default is 24000.
             location (str, optional): Location for the TTS client. Default is "global".
@@ -134,14 +133,20 @@ def __init__(
             language_code=lang,
             ssml_gender=ssml_gender,
         )
-        if is_given(model_name):
+        if model_name != "chirp_3":  # voice_params.model_name must not be set for Chirp 3
             voice_params.model_name = model_name
+
         if is_given(voice_cloning_key):
             voice_params.voice_clone = texttospeech.VoiceCloneParams(
                 voice_cloning_key=voice_cloning_key,
             )
         else:
-            voice_params.name = voice_name if is_given(voice_name) else DEFAULT_VOICE_NAME
+            if is_given(voice_name):
+                voice_params.name = voice_name
+            elif model_name == "chirp_3":
+                voice_params.name = "en-US-Chirp3-HD-Charon"
+            else:
+                voice_params.name = "Charon"
 
         if not is_given(tokenizer):
             tokenizer = tokenize.blingfire.SentenceTokenizer()
@@ -160,7 +165,7 @@ def __init__(
             custom_pronunciations=pronunciations,
             enable_ssml=enable_ssml,
             use_markup=use_markup,
-            model_name=model_name if is_given(model_name) else None,
+            model_name=model_name,
             prompt=prompt if is_given(prompt) else None,
         )
         self._streams = weakref.WeakSet[SynthesizeStream]()

Original file line number	Diff line number	Diff line change
`@@ -207,3 +207,7 @@`
`207`	`207`	`    "gemini-2.0-pro-exp-02-05",`
`208`	`208`	`    "gemini-1.5-pro",`
`209`	`209`	`]`
	`210`	`+`
	`211`	`+GeminiTTSModels = Literal[`
	`212`	`+    "gemini-2.5-flash-tts", "gemini-2.5-flash-lite-preview-tts", "gemini-2.5-pro-tts"`
	`213`	`+]`