working

bcherry · rektdeckard · commit 6e91e7af2657 · 2025-10-01T08:48:27.000-06:00
diff --git a/pyproject.toml b/pyproject.toml
@@ -9,14 +9,11 @@ description = "Simple voice AI assistant built with LiveKit Agents for Python"
 requires-python = ">=3.9"
 
 dependencies = [
-    "livekit-agents[turn-detector,silero]~=1.2",
+    "livekit-agents[turn-detector,silero]~=1.2.11",
     "livekit-plugins-noise-cancellation~=0.2",
     "python-dotenv",
 ]
 
-[tool.uv.sources]
-livekit-agents = { git = "https://github.com/livekit/agents.git", branch = "longc/cloud-inference-draft", subdirectory = "livekit-agents"}
-
 [dependency-groups]
 dev = [
     "pytest",
diff --git a/src/agent.py b/src/agent.py
@@ -13,9 +13,10 @@
     WorkerOptions,
     cli,
     metrics,
+    inference,
 )
 from livekit.plugins import noise_cancellation, silero
-from livekit.plugins.turn_detector.multilingual import MultilingualModel
+from livekit.plugins.turn_detector.english import EnglishModel
 
 logger = logging.getLogger("agent")
 
@@ -50,8 +51,7 @@ def __init__(self) -> None:
 
 
 def prewarm(proc: JobProcess):
-    pass
-    # proc.userdata["vad"] = silero.VAD.load()
+    proc.userdata["vad"] = silero.VAD.load()
 
 
 async def entrypoint(ctx: JobContext):
@@ -65,17 +65,17 @@ async def entrypoint(ctx: JobContext):
     session = AgentSession(
         # A Large Language Model (LLM) is your agent's brain, processing user input and generating a response
         # See all available models at https://docs.livekit.io/agents/models/llm/
-        llm="openai/gpt-4o-mini",
+        llm="azure/gpt-4o-mini",
         # Speech-to-text (STT) is your agent's ears, turning the user's speech into text that the LLM can understand
         # See all available models at https://docs.livekit.io/agents/models/stt/
-        stt="deepgram/nova-3",
+        stt=inference.STT(language="en"),
         # Text-to-speech (TTS) is your agent's voice, turning the LLM's text into speech that the user can hear
         # See all available models as well as voice selections at https://docs.livekit.io/agents/models/tts/
         tts="cartesia/sonic-2:6f84f4b8-58a2-430c-8c79-688dad597532",
         # VAD and turn detection are used to determine when the user is speaking and when the agent should respond
         # See more at https://docs.livekit.io/agents/build/turns
-        # turn_detection=MultilingualModel(),
-        # vad=ctx.proc.userdata["vad"],
+        turn_detection=EnglishModel(),
+        vad=ctx.proc.userdata["vad"],
         # allow the LLM to generate a response while waiting for the end of turn
         # See more at https://docs.livekit.io/agents/build/audio/#preemptive-generation
         preemptive_generation=True,