Skip to content

Commit 80fc757

Browse files
authored
add multimodal_message WebSocket event (#743)
1 parent 7c864bc commit 80fc757

File tree

1 file changed

+87
-0
lines changed

1 file changed

+87
-0
lines changed

src/elevenlabs/conversational_ai/conversation.py

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ class ClientToOrchestratorEvent(str, Enum):
3434
# User text message.
3535
USER_MESSAGE = "user_message"
3636
USER_ACTIVITY = "user_activity"
37+
MULTIMODAL_MESSAGE = "multimodal_message"
3738

3839

3940
class AgentChatResponsePartType(str, Enum):
@@ -74,6 +75,38 @@ def to_dict(self) -> dict:
7475
return {"type": self.type, "text": self.text}
7576

7677

78+
class MultimodalMessageFile:
    """Reference to a file carried inside a multimodal message.

    ``type`` is always the literal string ``"file_input"``; ``file_id`` is
    stored exactly as passed in.
    """

    def __init__(self, file_id: str):
        self.file_id = file_id
        self.type: Literal["file_input"] = "file_input"

    def to_dict(self) -> dict:
        """Return the ``type`` and ``file_id`` attributes as a plain dict."""
        return dict(type=self.type, file_id=self.file_id)
87+
88+
89+
class MultimodalMessageClientToOrchestratorEvent:
    """Client-to-orchestrator event carrying optional text and an optional file reference."""

    def __init__(
        self,
        text: Optional[str] = None,
        file_id: Optional[str] = None,
    ):
        self.text = text
        self.file_id = file_id
        self.type: Literal[ClientToOrchestratorEvent.MULTIMODAL_MESSAGE] = ClientToOrchestratorEvent.MULTIMODAL_MESSAGE

    def to_dict(self) -> dict:
        """Build the dict payload for this event.

        ``"type"`` is always present. ``"text"`` and ``"file"`` are added only
        when the matching attribute is truthy, so ``None`` and ``""`` are
        left out of the payload.
        """
        payload: Dict[str, Any] = {"type": self.type}

        if self.text:
            # The text is not sent as a bare string: it is the dict produced by
            # UserMessageClientToOrchestratorEvent.to_dict().
            text_part = UserMessageClientToOrchestratorEvent(text=self.text)
            payload["text"] = text_part.to_dict()

        if self.file_id:
            file_part = MultimodalMessageFile(file_id=self.file_id)
            payload["file"] = file_part.to_dict()

        return payload
108+
109+
77110
class AudioInterface(ABC):
78111
"""AudioInterface provides an abstraction for handling audio input and output."""
79112

@@ -736,6 +769,33 @@ def send_contextual_update(self, text: str):
736769
logger.error(f"Error sending contextual update: {e}")
737770
raise
738771

772+
def send_multimodal_message(
    self,
    text: Optional[str] = None,
    file_id: Optional[str] = None,
):
    """Send a multimodal message made of text, a file reference, or both.

    Args:
        text: Optional text to include in the message.
        file_id: Optional ID of the file to reference (expected to be a
            previously uploaded file).

    Raises:
        RuntimeError: If ``self._ws`` is falsy (session not started or
            websocket not connected).
        ValueError: If both ``text`` and ``file_id`` are falsy.
        Exception: Any error raised while serializing or sending is logged
            and then re-raised unchanged.
    """
    # The connection check comes first, so a missing websocket is reported
    # even when the arguments are also invalid.
    if not self._ws:
        raise RuntimeError("Session not started or websocket not connected.")
    if not (text or file_id):
        raise ValueError("At least one of text or file_id must be provided.")

    message = MultimodalMessageClientToOrchestratorEvent(text=text, file_id=file_id)
    try:
        serialized = json.dumps(message.to_dict())
        self._ws.send(serialized)
    except Exception as exc:
        logger.error(f"Error sending multimodal message: {exc}")
        raise
798+
739799
def _run(self, ws_url: str):
740800
with connect(ws_url, max_size=16 * 1024 * 1024) as ws:
741801
self._ws = ws
@@ -1001,6 +1061,33 @@ async def send_contextual_update(self, text: str):
10011061
logger.error(f"Error sending contextual update: {e}")
10021062
raise
10031063

1064+
async def send_multimodal_message(
    self,
    text: Optional[str] = None,
    file_id: Optional[str] = None,
):
    """Send a multimodal message made of text, a file reference, or both.

    Args:
        text: Optional text to include in the message.
        file_id: Optional ID of the file to reference (expected to be a
            previously uploaded file).

    Raises:
        RuntimeError: If ``self._ws`` is falsy (session not started or
            websocket not connected).
        ValueError: If both ``text`` and ``file_id`` are falsy.
        Exception: Any error raised while serializing or sending is logged
            and then re-raised unchanged.
    """
    # The connection check comes first, so a missing websocket is reported
    # even when the arguments are also invalid.
    if not self._ws:
        raise RuntimeError("Session not started or websocket not connected.")
    if not (text or file_id):
        raise ValueError("At least one of text or file_id must be provided.")

    message = MultimodalMessageClientToOrchestratorEvent(text=text, file_id=file_id)
    try:
        serialized = json.dumps(message.to_dict())
        await self._ws.send(serialized)
    except Exception as exc:
        logger.error(f"Error sending multimodal message: {exc}")
        raise
1090+
10041091
async def _run(self, ws_url: str):
10051092
async with websockets.connect(ws_url, max_size=16 * 1024 * 1024) as ws:
10061093
self._ws = ws

0 commit comments

Comments
 (0)