@@ -34,6 +34,7 @@ class ClientToOrchestratorEvent(str, Enum):
3434 # User text message.
3535 USER_MESSAGE = "user_message"
3636 USER_ACTIVITY = "user_activity"
37+ MULTIMODAL_MESSAGE = "multimodal_message"
3738
3839
3940class AgentChatResponsePartType (str , Enum ):
@@ -74,6 +75,38 @@ def to_dict(self) -> dict:
7475 return {"type" : self .type , "text" : self .text }
7576
7677
class MultimodalMessageFile:
    """File part of a multimodal message.

    Stores the ``file_id`` given to the constructor alongside a fixed
    ``"file_input"`` type tag.
    """

    def __init__(self, file_id: str):
        # The type tag is constant for every instance; only the id varies.
        self.type: Literal["file_input"] = "file_input"
        self.file_id = file_id

    def to_dict(self) -> dict:
        """Return the ``type`` and ``file_id`` fields as a plain dict."""
        return dict(type=self.type, file_id=self.file_id)
87+
88+
class MultimodalMessageClientToOrchestratorEvent:
    """Client-to-orchestrator event carrying optional text and an optional file reference."""

    def __init__(self, text: Optional[str] = None, file_id: Optional[str] = None):
        self.type: Literal[ClientToOrchestratorEvent.MULTIMODAL_MESSAGE] = (
            ClientToOrchestratorEvent.MULTIMODAL_MESSAGE
        )
        self.text = text
        self.file_id = file_id

    def to_dict(self) -> dict:
        """Build the dict form of this event.

        The ``"type"`` key is always present. ``"text"`` is added only when
        ``self.text`` is truthy and holds the dict form of a
        ``UserMessageClientToOrchestratorEvent``. ``"file"`` is added only when
        ``self.file_id`` is truthy and holds the dict form of a
        ``MultimodalMessageFile``.
        """
        payload: Dict[str, Any] = {"type": self.type}
        if self.text:
            # The text part is nested as its own event dict, not a bare string.
            text_event = UserMessageClientToOrchestratorEvent(text=self.text)
            payload["text"] = text_event.to_dict()
        if self.file_id:
            file_part = MultimodalMessageFile(file_id=self.file_id)
            payload["file"] = file_part.to_dict()
        return payload
108+
109+
77110class AudioInterface (ABC ):
78111 """AudioInterface provides an abstraction for handling audio input and output."""
79112
@@ -736,6 +769,33 @@ def send_contextual_update(self, text: str):
736769 logger .error (f"Error sending contextual update: { e } " )
737770 raise
738771
def send_multimodal_message(
    self,
    text: Optional[str] = None,
    file_id: Optional[str] = None,
):
    """Send text, a file reference, or both as a single multimodal event.

    The event is serialized to JSON and written to ``self._ws``.

    Args:
        text: Optional text message to include.
        file_id: Optional file ID to include (must be a previously uploaded file).

    Raises:
        RuntimeError: If ``self._ws`` is falsy (session not started or
            websocket not connected).
        ValueError: If both ``text`` and ``file_id`` are falsy.
    """
    # The connection check comes first, so a missing websocket is reported
    # even when the arguments are also invalid.
    if not self._ws:
        raise RuntimeError("Session not started or websocket not connected.")
    if not (text or file_id):
        raise ValueError("At least one of text or file_id must be provided.")

    event = MultimodalMessageClientToOrchestratorEvent(text=text, file_id=file_id)
    try:
        serialized = json.dumps(event.to_dict())
        self._ws.send(serialized)
    except Exception as e:
        # Log for diagnostics, then let the caller see the original error.
        logger.error(f"Error sending multimodal message: {e}")
        raise
798+
739799 def _run (self , ws_url : str ):
740800 with connect (ws_url , max_size = 16 * 1024 * 1024 ) as ws :
741801 self ._ws = ws
@@ -1001,6 +1061,33 @@ async def send_contextual_update(self, text: str):
10011061 logger .error (f"Error sending contextual update: { e } " )
10021062 raise
10031063
async def send_multimodal_message(
    self,
    text: Optional[str] = None,
    file_id: Optional[str] = None,
):
    """Send text, a file reference, or both as a single multimodal event.

    The event is serialized to JSON and the send on ``self._ws`` is awaited.

    Args:
        text: Optional text message to include.
        file_id: Optional file ID to include (must be a previously uploaded file).

    Raises:
        RuntimeError: If ``self._ws`` is falsy (session not started or
            websocket not connected).
        ValueError: If both ``text`` and ``file_id`` are falsy.
    """
    # The connection check comes first, so a missing websocket is reported
    # even when the arguments are also invalid.
    if not self._ws:
        raise RuntimeError("Session not started or websocket not connected.")
    if not (text or file_id):
        raise ValueError("At least one of text or file_id must be provided.")

    event = MultimodalMessageClientToOrchestratorEvent(text=text, file_id=file_id)
    try:
        serialized = json.dumps(event.to_dict())
        await self._ws.send(serialized)
    except Exception as e:
        # Log for diagnostics, then let the caller see the original error.
        logger.error(f"Error sending multimodal message: {e}")
        raise
1090+
10041091 async def _run (self , ws_url : str ):
10051092 async with websockets .connect (ws_url , max_size = 16 * 1024 * 1024 ) as ws :
10061093 self ._ws = ws
0 commit comments