fix (mistral-ai): add flexibility for timestamps (#4404)

tinalenguyen · web-flow · commit 620ed3485aa1 · 2025-12-27T17:23:37.000-05:00
diff --git a/livekit-plugins/livekit-plugins-mistralai/livekit/plugins/mistralai/stt.py b/livekit-plugins/livekit-plugins-mistralai/livekit/plugins/mistralai/stt.py
@@ -40,14 +40,14 @@
 @dataclass
 class _STTOptions:
     model: STTModels | str
-    language: str
+    language: str | None
 
 
 class STT(stt.STT):
     def __init__(
         self,
         *,
-        language: str = "en",
+        language: str | None = "en",
         model: STTModels | str = "voxtral-mini-latest",
         api_key: NotGivenOr[str] = NOT_GIVEN,
         client: Mistral | None = None,
@@ -56,7 +56,7 @@ def __init__(
         Create a new instance of MistralAI STT.
 
         Args:
-            language: The language code to use for transcription (e.g., "en" for English).
+            language: The language code to use for transcription (e.g., "en" for English). Segment timestamps are only requested when no language is set (None or an empty string).
             model: The MistralAI model to use for transcription, default is voxtral-mini-latest.
             api_key: Your MistralAI API key. If not provided, will use the MISTRAL_API_KEY environment variable.
             client: Optional pre-configured MistralAI client instance.
@@ -66,7 +66,6 @@ def __init__(
             capabilities=stt.STTCapabilities(
                 streaming=False,
                 interim_results=False,
-                # timestamp granularity doesn't seem to work
                 aligned_transcript=False,
             )
         )
@@ -123,16 +122,15 @@ async def _recognize_impl(
                 model=self._opts.model,
                 file={"content": data, "file_name": "audio.wav"},
                 language=self._opts.language if self._opts.language else None,
-                # for some reason, it doesn't return any segments even if we ask for them
-                timestamp_granularities=["segment"],
+                timestamp_granularities=None if self._opts.language else ["segment"],
             )
 
             return stt.SpeechEvent(
                 type=stt.SpeechEventType.FINAL_TRANSCRIPT,
                 alternatives=[
                     stt.SpeechData(
                         text=resp.text,
-                        language=self._opts.language,
+                        language=self._opts.language if self._opts.language else "",
                         start_time=resp.segments[0].start if resp.segments else 0,
                         end_time=resp.segments[-1].end if resp.segments else 0,
                         words=[