Skip to content

Commit 476271b

Browse files
jeradf and davidzhao authored
multilingual turn detector (#1736)
Co-authored-by: David Zhao <[email protected]>
1 parent cdb85d5 commit 476271b

File tree

10 files changed

+150
-53
lines changed

10 files changed

+150
-53
lines changed

.changeset/many-emus-allow.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"livekit-plugins-turn-detector": patch
3+
---
4+
5+
added a multilingual turn detector option

.changeset/yellow-ways-dance.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"livekit-plugins-deepgram": patch
3+
---
4+
5+
support multilingual with Nova-3 model

examples/voice-pipeline-agent/turn_detector.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,8 @@
1111
metrics,
1212
)
1313
from livekit.agents.pipeline import VoicePipelineAgent
14-
from livekit.plugins import deepgram, openai, silero, turn_detector
14+
from livekit.plugins import deepgram, openai, silero
15+
from livekit.plugins.turn_detector.multilingual import MultilingualModel
1516

1617
load_dotenv()
1718
logger = logging.getLogger("voice-assistant")
@@ -42,11 +43,11 @@ async def entrypoint(ctx: JobContext):
4243

4344
agent = VoicePipelineAgent(
4445
vad=ctx.proc.userdata["vad"],
45-
stt=deepgram.STT(),
46+
stt=deepgram.STT(model="nova-3", language="multi"),
4647
llm=openai.LLM(model="gpt-4o-mini"),
4748
tts=openai.TTS(),
4849
chat_ctx=initial_ctx,
49-
turn_detector=turn_detector.EOUModel(),
50+
turn_detector=MultilingualModel(),
5051
)
5152

5253
agent.start(ctx.room, participant)

livekit-agents/livekit/agents/pipeline/pipeline_agent.py

Lines changed: 27 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,7 @@ class AgentTranscriptionOptions:
164164
class _TurnDetector(Protocol):
165165
# When endpoint probability is below this threshold we think the user is not finished speaking
166166
# so we will use a long delay
167-
def unlikely_threshold(self) -> float: ...
167+
def unlikely_threshold(self, language: str | None) -> float: ...
168168
def supports_language(self, language: str | None) -> bool: ...
169169
async def predict_end_of_turn(self, chat_ctx: ChatContext) -> float: ...
170170

@@ -1314,6 +1314,10 @@ def _compute_delay(self) -> float | None:
13141314

13151315
def on_human_final_transcript(self, transcript: str, language: str | None) -> None:
13161316
self._last_final_transcript += " " + transcript.strip() # type: ignore
1317+
logger.debug(
1318+
"last language updated",
1319+
extra={"from": self._last_language, "to": language},
1320+
)
13171321
self._last_language = language
13181322
self._last_recv_transcript_time = time.perf_counter()
13191323

@@ -1355,21 +1359,28 @@ def _run(self, delay: float) -> None:
13551359
@utils.log_exceptions(logger=logger)
13561360
async def _run_task(chat_ctx: ChatContext, delay: float) -> None:
13571361
use_turn_detector = self._last_final_transcript and not self._speaking
1358-
if (
1359-
use_turn_detector
1360-
and self._turn_detector is not None
1361-
and self._turn_detector.supports_language(self._last_language)
1362-
):
1363-
start_time = time.perf_counter()
1364-
try:
1365-
eot_prob = await self._turn_detector.predict_end_of_turn(chat_ctx)
1366-
unlikely_threshold = self._turn_detector.unlikely_threshold()
1367-
elasped = time.perf_counter() - start_time
1368-
if eot_prob < unlikely_threshold:
1369-
delay = self._max_endpointing_delay
1370-
delay = max(0, delay - elasped)
1371-
except (TimeoutError, AssertionError):
1372-
pass # inference process is unresponsive
1362+
1363+
if use_turn_detector and self._turn_detector is not None:
1364+
if not self._turn_detector.supports_language(self._last_language):
1365+
logger.debug(
1366+
"turn detector does not support language",
1367+
extra={"language": self._last_language},
1368+
)
1369+
else:
1370+
start_time = time.perf_counter()
1371+
try:
1372+
eot_prob = await self._turn_detector.predict_end_of_turn(
1373+
chat_ctx
1374+
)
1375+
unlikely_threshold = self._turn_detector.unlikely_threshold(
1376+
self._last_language
1377+
)
1378+
elasped = time.perf_counter() - start_time
1379+
if eot_prob < unlikely_threshold:
1380+
delay = self._max_endpointing_delay
1381+
delay = max(0, delay - elasped)
1382+
except (TimeoutError, AssertionError):
1383+
pass # inference process is unresponsive
13731384

13741385
await asyncio.sleep(delay)
13751386

livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -698,16 +698,19 @@ def live_transcription_to_speech_data(
698698
) -> List[stt.SpeechData]:
699699
dg_alts = data["channel"]["alternatives"]
700700

701-
return [
702-
stt.SpeechData(
701+
speech_data = []
702+
for alt in dg_alts:
703+
sd = stt.SpeechData(
703704
language=language,
704705
start_time=alt["words"][0]["start"] if alt["words"] else 0,
705706
end_time=alt["words"][-1]["end"] if alt["words"] else 0,
706707
confidence=alt["confidence"],
707708
text=alt["transcript"],
708709
)
709-
for alt in dg_alts
710-
]
710+
if language == "multi" and "languages" in alt:
711+
sd.language = alt["languages"][0] # TODO: handle multiple languages
712+
speech_data.append(sd)
713+
return speech_data
711714

712715

713716
def prerecorded_transcription_to_speech_event(
@@ -774,7 +777,6 @@ def _validate_model(
774777
"nova-2-drivethru",
775778
"nova-2-automotive",
776779
# nova-3 will support more languages, but english-only for now
777-
"nova-3",
778780
"nova-3-general",
779781
}
780782
if language not in ("en-US", "en") and model in en_only_models:

livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/__init__.py

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,12 @@
1313
# limitations under the License.
1414

1515
from livekit.agents import Plugin
16-
from livekit.agents.inference_runner import _InferenceRunner
1716

18-
from .eou import EOUModel, _EUORunner
17+
from .english import EnglishModel
1918
from .log import logger
2019
from .version import __version__
2120

22-
__all__ = ["EOUModel", "__version__"]
21+
__all__ = ["EOUModel", "english", "multilingual", "__version__"]
2322

2423

2524
class EOUPlugin(Plugin):
@@ -29,13 +28,16 @@ def __init__(self):
2928
def download_files(self) -> None:
3029
from transformers import AutoTokenizer
3130

32-
from .eou import HG_MODEL, MODEL_REVISION, ONNX_FILENAME, _download_from_hf_hub
31+
from .base import _download_from_hf_hub
32+
from .models import HG_MODEL, MODEL_REVISIONS, ONNX_FILENAME
3333

34-
AutoTokenizer.from_pretrained(HG_MODEL, revision=MODEL_REVISION)
35-
_download_from_hf_hub(
36-
HG_MODEL, ONNX_FILENAME, subfolder="onnx", revision=MODEL_REVISION
37-
)
34+
for revision in MODEL_REVISIONS.values():
35+
AutoTokenizer.from_pretrained(HG_MODEL, revision=revision)
36+
_download_from_hf_hub(
37+
HG_MODEL, ONNX_FILENAME, subfolder="onnx", revision=revision
38+
)
39+
_download_from_hf_hub(HG_MODEL, "languages.json", revision=revision)
3840

3941

4042
Plugin.register_plugin(EOUPlugin())
41-
_InferenceRunner.register_runner(_EUORunner)
43+
EOUModel = EnglishModel

livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/eou.py renamed to livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/base.py

Lines changed: 40 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -3,17 +3,16 @@
33
import asyncio
44
import json
55
import time
6+
from abc import ABC, abstractmethod
67

78
from livekit.agents import llm
89
from livekit.agents.inference_runner import _InferenceRunner
910
from livekit.agents.ipc.inference_executor import InferenceExecutor
1011
from livekit.agents.job import get_current_job_context
1112

1213
from .log import logger
14+
from .models import HG_MODEL, MODEL_REVISIONS, ONNX_FILENAME, EOUModelType
1315

14-
HG_MODEL = "livekit/turn-detector"
15-
ONNX_FILENAME = "model_q8.onnx"
16-
MODEL_REVISION = "v1.2.1"
1716
MAX_HISTORY_TOKENS = 512
1817
MAX_HISTORY_TURNS = 6
1918

@@ -25,8 +24,10 @@ def _download_from_hf_hub(repo_id, filename, **kwargs):
2524
return local_path
2625

2726

28-
class _EUORunner(_InferenceRunner):
29-
INFERENCE_METHOD = "lk_end_of_utterance"
27+
class _EUORunnerBase(_InferenceRunner):
28+
def __init__(self, model_type: EOUModelType):
29+
super().__init__()
30+
self._model_revision = MODEL_REVISIONS[model_type]
3031

3132
def _format_chat_ctx(self, chat_ctx: dict):
3233
new_chat_ctx = []
@@ -60,7 +61,7 @@ def initialize(self) -> None:
6061
HG_MODEL,
6162
ONNX_FILENAME,
6263
subfolder="onnx",
63-
revision=MODEL_REVISION,
64+
revision=self._model_revision,
6465
local_files_only=True,
6566
)
6667
self._session = ort.InferenceSession(
@@ -69,19 +70,20 @@ def initialize(self) -> None:
6970

7071
self._tokenizer = AutoTokenizer.from_pretrained(
7172
HG_MODEL,
72-
revision=MODEL_REVISION,
73+
revision=self._model_revision,
7374
local_files_only=True,
7475
truncation_side="left",
7576
)
77+
7678
except (errors.LocalEntryNotFoundError, OSError):
7779
logger.error(
7880
(
79-
f"Could not find model {HG_MODEL}. Make sure you have downloaded the model before running the agent. "
81+
f"Could not find model {HG_MODEL} with revision {self._model_revision}. Make sure you have downloaded the model before running the agent. "
8082
"Use `python3 your_agent.py download-files` to download the models."
8183
)
8284
)
8385
raise RuntimeError(
84-
f"livekit-plugins-turn-detector initialization failed. Could not find model {HG_MODEL}."
86+
f"livekit-plugins-turn-detector initialization failed. Could not find model {HG_MODEL} with revision {self._model_revision}."
8587
) from None
8688

8789
def run(self, data: bytes) -> bytes | None:
@@ -116,26 +118,44 @@ def run(self, data: bytes) -> bytes | None:
116118
return json.dumps(data).encode()
117119

118120

119-
class EOUModel:
121+
class EOUModelBase(ABC):
120122
def __init__(
121123
self,
124+
model_type: EOUModelType = "en", # default to smaller, english-only model
122125
inference_executor: InferenceExecutor | None = None,
123-
unlikely_threshold: float = 0.0289,
124126
) -> None:
127+
self._model_type = model_type
125128
self._executor = (
126129
inference_executor or get_current_job_context().inference_executor
127130
)
128-
self._unlikely_threshold = unlikely_threshold
129131

130-
def unlikely_threshold(self) -> float:
131-
return self._unlikely_threshold
132+
config_fname = _download_from_hf_hub(
133+
HG_MODEL,
134+
"languages.json",
135+
revision=MODEL_REVISIONS[self._model_type],
136+
local_files_only=True,
137+
)
138+
with open(config_fname, "r") as f:
139+
self._languages = json.load(f)
132140

133-
def supports_language(self, language: str | None) -> bool:
141+
@abstractmethod
142+
def _inference_method(self): ...
143+
144+
def unlikely_threshold(self, language: str | None) -> float | None:
134145
if language is None:
135-
return False
136-
parts = language.lower().split("-")
137-
# certain models use language codes (DG, AssemblyAI), others use full names (like OAI)
138-
return parts[0] == "en" or parts[0] == "english"
146+
return None
147+
lang = language.lower()
148+
if lang in self._languages:
149+
return self._languages[lang]["threshold"]
150+
if "-" in lang:
151+
part = lang.split("-")[0]
152+
if part in self._languages:
153+
return self._languages[part]["threshold"]
154+
logger.warning(f"Language {language} not supported by EOU model")
155+
return None
156+
157+
def supports_language(self, language: str | None) -> bool:
158+
return self.unlikely_threshold(language) is not None
139159

140160
async def predict_eou(self, chat_ctx: llm.ChatContext) -> float:
141161
return await self.predict_end_of_turn(chat_ctx)
@@ -173,7 +193,7 @@ async def predict_end_of_turn(
173193
json_data = json.dumps({"chat_ctx": messages}).encode()
174194

175195
result = await asyncio.wait_for(
176-
self._executor.do_inference(_EUORunner.INFERENCE_METHOD, json_data),
196+
self._executor.do_inference(self._inference_method(), json_data),
177197
timeout=timeout,
178198
)
179199

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
from livekit.agents.inference_runner import _InferenceRunner
2+
3+
from .base import EOUModelBase, _EUORunnerBase
4+
5+
6+
class _EUORunnerEn(_EUORunnerBase):
7+
INFERENCE_METHOD = "lk_end_of_utterance_en"
8+
9+
def __init__(self):
10+
super().__init__("en")
11+
12+
13+
class EnglishModel(EOUModelBase):
14+
def __init__(self):
15+
super().__init__(model_type="en")
16+
17+
def _inference_method(self) -> str:
18+
return _EUORunnerEn.INFERENCE_METHOD
19+
20+
21+
_InferenceRunner.register_runner(_EUORunnerEn)
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
from typing import Literal
2+
3+
EOUModelType = Literal["en", "multilingual"]
4+
MODEL_REVISIONS: dict[EOUModelType, str] = {
5+
"en": "v1.2.2-en",
6+
"multilingual": "v0.1.0-intl",
7+
}
8+
HG_MODEL = "livekit/turn-detector"
9+
ONNX_FILENAME = "model_q8.onnx"
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
from livekit.agents.inference_runner import _InferenceRunner
2+
3+
from .base import EOUModelBase, _EUORunnerBase
4+
5+
6+
class _EUORunnerMultilingual(_EUORunnerBase):
7+
INFERENCE_METHOD = "lk_end_of_utterance_multilingual"
8+
9+
def __init__(self):
10+
super().__init__("multilingual")
11+
12+
13+
class MultilingualModel(EOUModelBase):
14+
def __init__(self):
15+
super().__init__(model_type="multilingual")
16+
17+
def _inference_method(self) -> str:
18+
return _EUORunnerMultilingual.INFERENCE_METHOD
19+
20+
21+
_InferenceRunner.register_runner(_EUORunnerMultilingual)

0 commit comments

Comments (0)