2 changes: 1 addition & 1 deletion python/pyproject.toml
@@ -37,7 +37,7 @@ dependencies = [
"numpy >= 1.25.0; python_version < '3.12'",
"numpy >= 1.26.0; python_version >= '3.12'",
# openai connector
"openai >= 1.98.0, < 2.0.0",
"openai >= 2.0.0",
# openapi and swagger
"openapi_core >= 0.18,<0.20",
"websockets >= 13, < 16",
@@ -103,6 +103,7 @@ async def main() -> None:
flowery prose.
""",
voice="alloy",
output_modalities=["text", "audio"],
turn_detection=TurnDetection(type="server_vad", create_response=True, silence_duration_ms=800, threshold=0.8),
function_choice_behavior=FunctionChoiceBehavior.Auto(),
)
28 changes: 18 additions & 10 deletions python/samples/concepts/realtime/simple_realtime_chat_webrtc.py
@@ -9,6 +9,7 @@
OpenAIRealtimeExecutionSettings,
OpenAIRealtimeWebRTC,
)
from semantic_kernel.contents import RealtimeTextEvent

logging.basicConfig(level=logging.WARNING)
utils_log = logging.getLogger("samples.concepts.realtime.utils")
@@ -55,23 +56,30 @@ async def main() -> None:
# see https://platform.openai.com/docs/api-reference/realtime-sessions/create#realtime-sessions-create-voice
# for more details.
voice="alloy",
# Enable both text and audio output to get transcripts
output_modalities=["text", "audio"],
)
realtime_client = OpenAIRealtimeWebRTC(audio_track=AudioRecorderWebRTC(), settings=settings)
# Create the settings for the session
audio_player = AudioPlayerWebRTC()
# the context manager calls the create_session method on the client and starts listening to the audio stream
async with audio_player, realtime_client:
async for event in realtime_client.receive(audio_output_callback=audio_player.client_callback):
match event.event_type:
case "text":
# the model returns both audio and transcript of the audio, which we will print
print(event.text.text, end="")
case "service":
# OpenAI Specific events
if event.service_type == ListenEvents.SESSION_UPDATED:
print("Session updated")
if event.service_type == ListenEvents.RESPONSE_CREATED:
print("\nMosscap (transcript): ", end="")
match event:
case RealtimeTextEvent():
# Only process delta events for streaming, skip done events to avoid duplication
if event.service_type and "delta" in event.service_type and event.text.text:
print(event.text.text, end="", flush=True)
# Add newline when transcript is complete (done event)
elif event.service_type and "done" in event.service_type:
print() # Add newline for readability
case _:
# Handle service events
if event.event_type == "service" and event.service_type:
if event.service_type == ListenEvents.SESSION_UPDATED:
print("Session updated")
elif event.service_type == ListenEvents.RESPONSE_CREATED:
print("\nMosscap (transcript): ", end="")


if __name__ == "__main__":
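Reviewer sketch (not part of the diff above): the reworked WebRTC loop only prints `RealtimeTextEvent` deltas and uses the matching `done` event just to end the line, since printing the `done` event as well would duplicate the text the deltas already streamed. The standalone snippet below illustrates that filtering; the `FakeTextEvent` class and the event-type strings are stand-ins for this sketch only, not the semantic_kernel types used in the sample.

```python
from dataclasses import dataclass


@dataclass
class FakeTextEvent:
    """Stand-in for a realtime text event: a service event type plus a text payload."""

    service_type: str
    text: str


def print_transcript(events: list[FakeTextEvent]) -> None:
    """Stream delta text as it arrives; finish the line when the done event appears."""
    for event in events:
        if event.service_type and "delta" in event.service_type and event.text:
            # Delta events carry incremental text, so print without a newline.
            print(event.text, end="", flush=True)
        elif event.service_type and "done" in event.service_type:
            # The done event would repeat text already streamed via deltas,
            # so only emit a newline for readability.
            print()


print_transcript(
    [
        FakeTextEvent("response.output_audio_transcript.delta", "Hello"),
        FakeTextEvent("response.output_audio_transcript.delta", ", world"),
        FakeTextEvent("response.output_audio_transcript.done", "Hello, world"),
    ]
)
```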
@@ -55,6 +55,7 @@ class OpenAIRealtimeExecutionSettings(PromptExecutionSettings):
"""Request settings for OpenAI realtime services."""

modalities: Sequence[Literal["audio", "text"]] | None = None
output_modalities: Sequence[Literal["audio", "text"]] | None = None
ai_model_id: Annotated[str | None, Field(None, serialization_alias="model")] = None
instructions: str | None = None
voice: str | None = None
@@ -80,6 +81,49 @@ class OpenAIRealtimeExecutionSettings(PromptExecutionSettings):
max_response_output_tokens: Annotated[int | Literal["inf"] | None, Field(gt=0)] = None
input_audio_noise_reduction: dict[Literal["type"], Literal["near_field", "far_field"]] | None = None

def prepare_settings_dict(self, **kwargs) -> dict[str, Any]:
"""Prepare the settings as a dictionary for sending to the AI service.

For realtime settings, we need to properly structure the audio configuration
to match the OpenAI API expectations where voice and turn_detection are nested
under the audio field.
"""
# Get the base settings dict (excludes service_id, extension_data, etc.)
settings_dict = super().prepare_settings_dict(**kwargs)

# Build the audio configuration object
audio_config: dict[str, Any] = {}

# Handle voice (goes in audio.output.voice)
if "voice" in settings_dict:
audio_config.setdefault("output", {})["voice"] = settings_dict.pop("voice")

# Handle turn_detection (goes in audio.input.turn_detection)
if "turn_detection" in settings_dict:
audio_config.setdefault("input", {})["turn_detection"] = settings_dict.pop("turn_detection")

# Handle input audio format
if "input_audio_format" in settings_dict:
audio_config.setdefault("input", {})["format"] = settings_dict.pop("input_audio_format")

# Handle output audio format
if "output_audio_format" in settings_dict:
audio_config.setdefault("output", {})["format"] = settings_dict.pop("output_audio_format")

# Handle input audio transcription
if "input_audio_transcription" in settings_dict:
audio_config.setdefault("input", {})["transcription"] = settings_dict.pop("input_audio_transcription")

# Handle input audio noise reduction
if "input_audio_noise_reduction" in settings_dict:
audio_config.setdefault("input", {})["noise_reduction"] = settings_dict.pop("input_audio_noise_reduction")

# Add the audio config if it has any content
if audio_config:
settings_dict["audio"] = audio_config

return settings_dict


class AzureRealtimeExecutionSettings(OpenAIRealtimeExecutionSettings):
"""Request settings for Azure OpenAI realtime services."""
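Reviewer sketch (not part of the diff above): the snippet below mirrors the field mapping that the new `prepare_settings_dict` override performs, so the resulting GA-style payload shape is easy to check at a glance. It is standalone Python and does not import semantic_kernel; the field names match the override, the concrete values are illustrative, and the transcription/noise-reduction fields (which follow the same `audio.input` pattern) are omitted for brevity.

```python
from typing import Any


def restructure(settings: dict[str, Any]) -> dict[str, Any]:
    """Move flat realtime fields under the nested 'audio' object, as the override does."""
    out = dict(settings)
    audio: dict[str, Any] = {}
    if "voice" in out:
        audio.setdefault("output", {})["voice"] = out.pop("voice")
    if "turn_detection" in out:
        audio.setdefault("input", {})["turn_detection"] = out.pop("turn_detection")
    if "input_audio_format" in out:
        audio.setdefault("input", {})["format"] = out.pop("input_audio_format")
    if "output_audio_format" in out:
        audio.setdefault("output", {})["format"] = out.pop("output_audio_format")
    if audio:
        out["audio"] = audio
    return out


flat = {
    "model": "gpt-realtime",  # illustrative model name, not taken from the PR
    "output_modalities": ["text", "audio"],  # stays at the top level
    "voice": "alloy",
    "turn_detection": {"type": "server_vad", "silence_duration_ms": 800},
    "input_audio_format": "pcm16",
}

# voice moves to audio.output, turn_detection and the input format to audio.input.
assert restructure(flat) == {
    "model": "gpt-realtime",
    "output_modalities": ["text", "audio"],
    "audio": {
        "output": {"voice": "alloy"},
        "input": {
            "turn_detection": {"type": "server_vad", "silence_duration_ms": 800},
            "format": "pcm16",
        },
    },
}
```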