2 changes: 1 addition & 1 deletion python/pyproject.toml
@@ -37,7 +37,7 @@ dependencies = [
"numpy >= 1.25.0; python_version < '3.12'",
"numpy >= 1.26.0; python_version >= '3.12'",
# openai connector
"openai >= 1.98.0, < 2.0.0",
"openai >= 2.0.0",
# openapi and swagger
"openapi_core >= 0.18,<0.20",
"websockets >= 13, < 16",
@@ -103,6 +103,7 @@ async def main() -> None:
flowery prose.
""",
voice="alloy",
output_modalities=["text", "audio"],
turn_detection=TurnDetection(type="server_vad", create_response=True, silence_duration_ms=800, threshold=0.8),
function_choice_behavior=FunctionChoiceBehavior.Auto(),
)
28 changes: 18 additions & 10 deletions python/samples/concepts/realtime/simple_realtime_chat_webrtc.py
@@ -9,6 +9,7 @@
OpenAIRealtimeExecutionSettings,
OpenAIRealtimeWebRTC,
)
from semantic_kernel.contents import RealtimeTextEvent

logging.basicConfig(level=logging.WARNING)
utils_log = logging.getLogger("samples.concepts.realtime.utils")
@@ -55,23 +56,30 @@ async def main() -> None:
# see https://platform.openai.com/docs/api-reference/realtime-sessions/create#realtime-sessions-create-voice
# for more details.
voice="alloy",
# Enable both text and audio output to get transcripts
output_modalities=["text", "audio"],
)
realtime_client = OpenAIRealtimeWebRTC(audio_track=AudioRecorderWebRTC(), settings=settings)
# Create the settings for the session
audio_player = AudioPlayerWebRTC()
# the context manager calls the create_session method on the client and starts listening to the audio stream
async with audio_player, realtime_client:
async for event in realtime_client.receive(audio_output_callback=audio_player.client_callback):
match event.event_type:
case "text":
# the model returns both audio and transcript of the audio, which we will print
print(event.text.text, end="")
case "service":
# OpenAI Specific events
if event.service_type == ListenEvents.SESSION_UPDATED:
print("Session updated")
if event.service_type == ListenEvents.RESPONSE_CREATED:
print("\nMosscap (transcript): ", end="")
match event:
case RealtimeTextEvent():
# Only process delta events for streaming, skip done events to avoid duplication
if event.service_type and "delta" in event.service_type and event.text.text:
print(event.text.text, end="", flush=True)
# Add newline when transcript is complete (done event)
elif event.service_type and "done" in event.service_type:
print() # Add newline for readability
case _:
# Handle service events
if event.event_type == "service" and event.service_type:
if event.service_type == ListenEvents.SESSION_UPDATED:
print("Session updated")
elif event.service_type == ListenEvents.RESPONSE_CREATED:
print("\nMosscap (transcript): ", end="")


if __name__ == "__main__":
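Reviewer sketch (not part of the diff above): the reworked WebRTC loop only prints `RealtimeTextEvent` deltas and uses the matching `done` event just to end the line, since printing the `done` event as well would duplicate the text the deltas already streamed. The standalone snippet below illustrates that filtering; the `FakeTextEvent` class and the event-type strings are stand-ins for this sketch only, not the semantic_kernel types used in the sample.

```python
from dataclasses import dataclass


@dataclass
class FakeTextEvent:
    """Stand-in for a realtime text event: a service event type plus a text payload."""

    service_type: str
    text: str


def print_transcript(events: list[FakeTextEvent]) -> None:
    """Stream delta text as it arrives; finish the line when the done event appears."""
    for event in events:
        if event.service_type and "delta" in event.service_type and event.text:
            # Delta events carry incremental text, so print without a newline.
            print(event.text, end="", flush=True)
        elif event.service_type and "done" in event.service_type:
            # The done event would repeat text already streamed via deltas,
            # so only emit a newline for readability.
            print()


print_transcript(
    [
        FakeTextEvent("response.output_audio_transcript.delta", "Hello"),
        FakeTextEvent("response.output_audio_transcript.delta", ", world"),
        FakeTextEvent("response.output_audio_transcript.done", "Hello, world"),
    ]
)
```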
@@ -55,6 +55,7 @@ class OpenAIRealtimeExecutionSettings(PromptExecutionSettings):
"""Request settings for OpenAI realtime services."""

modalities: Sequence[Literal["audio", "text"]] | None = None
output_modalities: Sequence[Literal["audio", "text"]] | None = None
ai_model_id: Annotated[str | None, Field(None, serialization_alias="model")] = None
instructions: str | None = None
voice: str | None = None
@@ -80,6 +81,49 @@ class OpenAIRealtimeExecutionSettings(PromptExecutionSettings):
max_response_output_tokens: Annotated[int | Literal["inf"] | None, Field(gt=0)] = None
input_audio_noise_reduction: dict[Literal["type"], Literal["near_field", "far_field"]] | None = None

def prepare_settings_dict(self, **kwargs) -> dict[str, Any]:
"""Prepare the settings as a dictionary for sending to the AI service.

For realtime settings, we need to properly structure the audio configuration
to match the OpenAI API expectations where voice and turn_detection are nested
under the audio field.
"""
# Get the base settings dict (excludes service_id, extension_data, etc.)
settings_dict = super().prepare_settings_dict(**kwargs)

# Build the audio configuration object
audio_config: dict[str, Any] = {}

# Handle voice (goes in audio.output.voice)
if "voice" in settings_dict:
audio_config.setdefault("output", {})["voice"] = settings_dict.pop("voice")

# Handle turn_detection (goes in audio.input.turn_detection)
if "turn_detection" in settings_dict:
audio_config.setdefault("input", {})["turn_detection"] = settings_dict.pop("turn_detection")

# Handle input audio format
if "input_audio_format" in settings_dict:
audio_config.setdefault("input", {})["format"] = settings_dict.pop("input_audio_format")

# Handle output audio format
if "output_audio_format" in settings_dict:
audio_config.setdefault("output", {})["format"] = settings_dict.pop("output_audio_format")

# Handle input audio transcription
if "input_audio_transcription" in settings_dict:
audio_config.setdefault("input", {})["transcription"] = settings_dict.pop("input_audio_transcription")

# Handle input audio noise reduction
if "input_audio_noise_reduction" in settings_dict:
audio_config.setdefault("input", {})["noise_reduction"] = settings_dict.pop("input_audio_noise_reduction")

# Add the audio config if it has any content
if audio_config:
settings_dict["audio"] = audio_config

return settings_dict


class AzureRealtimeExecutionSettings(OpenAIRealtimeExecutionSettings):
"""Request settings for Azure OpenAI realtime services."""
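Reviewer sketch (not part of the diff above): the snippet below mirrors the field mapping that the new `prepare_settings_dict` override performs, so the resulting GA-style payload shape is easy to check at a glance. It is standalone Python and does not import semantic_kernel; the field names match the override, the concrete values are illustrative, and the transcription/noise-reduction fields (which follow the same `audio.input` pattern) are omitted for brevity.

```python
from typing import Any


def restructure(settings: dict[str, Any]) -> dict[str, Any]:
    """Move flat realtime fields under the nested 'audio' object, as the override does."""
    out = dict(settings)
    audio: dict[str, Any] = {}
    if "voice" in out:
        audio.setdefault("output", {})["voice"] = out.pop("voice")
    if "turn_detection" in out:
        audio.setdefault("input", {})["turn_detection"] = out.pop("turn_detection")
    if "input_audio_format" in out:
        audio.setdefault("input", {})["format"] = out.pop("input_audio_format")
    if "output_audio_format" in out:
        audio.setdefault("output", {})["format"] = out.pop("output_audio_format")
    if audio:
        out["audio"] = audio
    return out


flat = {
    "model": "gpt-realtime",  # illustrative model name, not taken from the PR
    "output_modalities": ["text", "audio"],  # stays at the top level
    "voice": "alloy",
    "turn_detection": {"type": "server_vad", "silence_duration_ms": 800},
    "input_audio_format": "pcm16",
}

# voice moves to audio.output, turn_detection and the input format to audio.input.
assert restructure(flat) == {
    "model": "gpt-realtime",
    "output_modalities": ["text", "audio"],
    "audio": {
        "output": {"voice": "alloy"},
        "input": {
            "turn_detection": {"type": "server_vad", "silence_duration_ms": 800},
            "format": "pcm16",
        },
    },
}
```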