Skip to content

Commit 5896742

Browse files
authored
Fix multimodal agent interrupting itself during a function call (#1585)
1 parent c6291d5 commit 5896742

File tree

5 files changed

+35
-1
lines changed

5 files changed

+35
-1
lines changed

.changeset/nervous-birds-relax.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"livekit-plugins-openai": patch
3+
---
4+
5+
fix multimodal agent interrupting itself when creating a function call response

examples/multimodal-agent/openai_agent.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,10 @@ async def get_weather(
8181
model=openai.realtime.RealtimeModel(
8282
voice="alloy",
8383
temperature=0.8,
84-
instructions="You are a helpful assistant, greet the user and help them with their trip planning",
84+
instructions=(
85+
"You are a helpful assistant, greet the user and help them with their trip planning. "
86+
"When performing function calls, let user know that you are checking the weather."
87+
),
8588
turn_detection=openai.realtime.ServerVadOptions(
8689
threshold=0.6, prefix_padding_ms=200, silence_duration_ms=500
8790
),

livekit-agents/livekit/agents/multimodal/multimodal_agent.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,11 @@ def _truncate_conversation_item(
121121
self, item_id: str, content_index: int, audio_end_ms: int
122122
) -> None: ...
123123

124+
@property
125+
def playout_complete(self) -> asyncio.Event | None:
126+
"""Event that is set when the playout is done"""
127+
pass
128+
124129

125130
@dataclass(frozen=True)
126131
class AgentTranscriptionOptions:
@@ -435,10 +440,16 @@ async def _main_task(self) -> None:
435440
)
436441

437442
def _on_playout_started() -> None:
443+
if self._session.playout_complete is not None:
444+
self._session.playout_complete.clear()
445+
438446
self.emit("agent_started_speaking")
439447
self._update_state("speaking")
440448

441449
def _on_playout_stopped(interrupted: bool) -> None:
450+
if self._session.playout_complete is not None:
451+
self._session.playout_complete.set()
452+
442453
self.emit("agent_stopped_speaking")
443454
self._update_state("listening")
444455

livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/realtime/realtime_api.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -258,6 +258,8 @@ def __init__(
258258
self._fnc_ctx = fnc_ctx
259259
self._fnc_tasks = utils.aio.TaskSet()
260260
self._is_interrupted = False
261+
self._playout_complete = asyncio.Event()
262+
self._playout_complete.set()
261263

262264
tools = []
263265
if self._fnc_ctx is not None:
@@ -317,6 +319,10 @@ async def aclose(self) -> None:
317319
self._send_ch.close()
318320
await self._main_atask
319321

322+
@property
323+
def playout_complete(self) -> asyncio.Event | None:
324+
return self._playout_complete
325+
320326
@property
321327
def fnc_ctx(self) -> llm.FunctionContext | None:
322328
return self._fnc_ctx

livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -870,6 +870,8 @@ def __init__(
870870
self._pending_responses: dict[str, RealtimeResponse] = {}
871871
self._active_response_id: str | None = None
872872
self._response_create_fut: asyncio.Future[None] | None = None
873+
self._playout_complete = asyncio.Event()
874+
self._playout_complete.set()
873875

874876
self._session_id = "not-connected"
875877
self.session_update() # initial session init
@@ -886,6 +888,10 @@ async def aclose(self) -> None:
886888
self._send_ch.close()
887889
await self._main_atask
888890

891+
@property
892+
def playout_complete(self) -> asyncio.Event:
893+
return self._playout_complete
894+
889895
@property
890896
def fnc_ctx(self) -> llm.FunctionContext | None:
891897
return self._fnc_ctx
@@ -1746,6 +1752,9 @@ async def _run_fnc_task(self, fnc_call_info: llm.FunctionCallInfo, item_id: str)
17461752
called_fnc = fnc_call_info.execute()
17471753
await called_fnc.task
17481754

1755+
# wait for the audio to be played before creating the response
1756+
await self._playout_complete.wait()
1757+
17491758
tool_call = llm.ChatMessage.create_tool_from_called_function(called_fnc)
17501759
logger.info(
17511760
"creating response for tool call",

0 commit comments

Comments
 (0)