[Cartesia] Adding new model literals, updating default to sonic-2 (#1626)

chongzluong · web-flow · commit c372fa569bc3 · 2025-03-11T09:42:57.000+08:00
diff --git a/livekit-plugins/livekit-plugins-cartesia/livekit/plugins/cartesia/models.py b/livekit-plugins/livekit-plugins-cartesia/livekit/plugins/cartesia/models.py
@@ -8,7 +8,7 @@
     # "pcm_alaw",
 ]
 
-TTSModels = Literal["sonic-english", "sonic-multilingual"]
+TTSModels = Literal["sonic", "sonic-2", "sonic-lite", "sonic-preview", "sonic-turbo"]
 TTSLanguages = Literal["en", "es", "fr", "de", "pt", "zh", "ja"]
 TTSDefaultVoiceId = "794f9389-aac1-45b6-b726-9d9369183238"
 TTSVoiceSpeed = Literal["fastest", "fast", "normal", "slow", "slowest"]
diff --git a/livekit-plugins/livekit-plugins-cartesia/livekit/plugins/cartesia/tts.py b/livekit-plugins/livekit-plugins-cartesia/livekit/plugins/cartesia/tts.py
@@ -73,7 +73,7 @@ class TTS(tts.TTS):
     def __init__(
         self,
         *,
-        model: TTSModels | str = "sonic",
+        model: TTSModels | str = "sonic-2",
         language: str = "en",
         encoding: TTSEncoding = "pcm_s16le",
         voice: str | list[float] = TTSDefaultVoiceId,
@@ -90,7 +90,7 @@ def __init__(
         See https://docs.cartesia.ai/reference/web-socket/stream-speech/stream-speech for more details on the the Cartesia API.
 
         Args:
-            model (TTSModels, optional): The Cartesia TTS model to use. Defaults to "sonic-english".
+            model (TTSModels, optional): The Cartesia TTS model to use. Defaults to "sonic-2".
             language (str, optional): The language code for synthesis. Defaults to "en".
             encoding (TTSEncoding, optional): The audio encoding format. Defaults to "pcm_s16le".
             voice (str | list[float], optional): The voice ID or embedding array.
@@ -169,7 +169,7 @@ def update_options(
         and emotion. If any parameter is not provided, the existing value will be retained.
 
         Args:
-            model (TTSModels, optional): The Cartesia TTS model to use. Defaults to "sonic-english".
+            model (TTSModels, optional): The Cartesia TTS model to use. Defaults to "sonic-2".
             language (str, optional): The language code for synthesis. Defaults to "en".
             voice (str | list[float], optional): The voice ID or embedding array.
             speed (TTSVoiceSpeed | float, optional): Voice Control - Speed (https://docs.cartesia.ai/user-guides/voice-control)

Original file line number	Diff line number	Diff line change
`@@ -8,7 +8,7 @@`
`8`	`8`	`# "pcm_alaw",`
`9`	`9`	`]`
`10`	`10`
`11`		`-TTSModels = Literal["sonic-english", "sonic-multilingual"]`
	`11`	`+TTSModels = Literal["sonic", "sonic-2", "sonic-lite", "sonic-preview", "sonic-turbo"]`
`12`	`12`	`TTSLanguages = Literal["en", "es", "fr", "de", "pt", "zh", "ja"]`
`13`	`13`	`TTSDefaultVoiceId = "794f9389-aac1-45b6-b726-9d9369183238"`
`14`	`14`	`TTSVoiceSpeed = Literal["fastest", "fast", "normal", "slow", "slowest"]`