10 changes: 10 additions & 0 deletions docs/references.md
@@ -65,3 +65,13 @@
- [How To Install libportaudio2 on Ubuntu 22.04](https://www.installati.one/install-libportaudio2-ubuntu-22-04/): `sudo apt-get -y install libportaudio2`
- [python-sounddevice](https://github.com/spatialaudio/python-sounddevice)
- [python-soundfile](https://github.com/bastibe/python-soundfile)

### Realtime API

- [August 2025 / Realtime API audio model GA](https://learn.microsoft.com/en-us/azure/ai-foundry/openai/whats-new#realtime-api-audio-model-ga)
- [Global Standard model availability](https://learn.microsoft.com/en-us/azure/ai-foundry/openai/concepts/models?tabs=global-standard%2Cstandard-chat-completions#global-standard-model-availability)
- [specification/cognitiveservices/data-plane/AzureOpenAI/inference/preview/2025-04-01-preview/inference.json](https://github.com/Azure/azure-rest-api-specs/blob/main/specification/cognitiveservices/data-plane/AzureOpenAI/inference/preview/2025-04-01-preview/inference.json)
- [Realtime API with WebSocket](https://platform.openai.com/docs/guides/realtime-websocket)
- [GPT-4o Realtime API for speech and audio](https://learn.microsoft.com/en-us/azure/ai-foundry/openai/realtime-audio-quickstart?tabs=keyless%2Clinux&pivots=programming-language-python)
- [OpenAI Python API library > examples/realtime](https://github.com/openai/openai-python/tree/main/examples/realtime)
- [How to use the GPT-4o Realtime API via WebRTC](https://learn.microsoft.com/en-us/azure/ai-foundry/openai/how-to/realtime-audio-webrtc)
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -12,6 +12,7 @@ dependencies = [
"elasticsearch>=9.1.0",
"fastapi[standard]>=0.116.1",
"httpx>=0.28.1",
"jinja2>=3.1.2",
"langchain-azure-ai>=0.1.4",
"langchain-community>=0.3.27",
"langchain-mcp-adapters>=0.1.9",
@@ -20,7 +21,7 @@
"langchain-text-splitters>=0.3.9",
"langgraph>=0.6.2",
"langgraph-supervisor>=0.0.29",
"openai>=1.98.0",
"openai[realtime]>=1.98.0",
"opentelemetry-api>=1.36.0",
"opentelemetry-exporter-otlp>=1.36.0",
"opentelemetry-sdk>=1.36.0",
185 changes: 185 additions & 0 deletions scripts/realtime_operator.py
@@ -0,0 +1,185 @@
import asyncio
import http.server
import json
import logging
import os
import socketserver
import tempfile
import webbrowser
from pathlib import Path

# New imports for template rendering and serving
import jinja2
import typer
from azure.identity.aio import DefaultAzureCredential, get_bearer_token_provider
from dotenv import load_dotenv
from openai import AsyncAzureOpenAI

from template_langgraph.llms.azure_openais import Settings
from template_langgraph.loggers import get_logger

# Initialize the Typer application
app = typer.Typer(
    add_completion=False,
    help="Realtime API operator CLI",
)

# Set up logging
logger = get_logger(__name__)


async def chat_impl() -> None:
    """
    When prompted for user input, type a message and hit enter to send it to the model.
    Enter "q" to quit the conversation.
    """
    credential = DefaultAzureCredential()
    token_provider = get_bearer_token_provider(credential, "https://cognitiveservices.azure.com/.default")
    settings = Settings()
    client = AsyncAzureOpenAI(
        azure_endpoint=settings.azure_openai_endpoint,
        azure_ad_token_provider=token_provider,
        api_version=settings.azure_openai_api_version,
    )
    async with client.realtime.connect(
        model="gpt-realtime",  # name of your deployment
    ) as connection:
        await connection.session.update(
            session={
                "output_modalities": [
                    "text",
                    "audio",
                ],
                "model": "gpt-realtime",
                "type": "realtime",
            }
        )
        while True:
            user_input = input("Enter a message: ")
            if user_input == "q":
                break

            await connection.conversation.item.create(
                item={
                    "type": "message",
                    "role": "user",
                    "content": [
                        {
                            "type": "input_text",
                            "text": user_input,
                        },
                    ],
                }
            )
            await connection.response.create()
            async for event in connection:
                if event.type == "response.audio_transcript.delta":
                    print(event.delta, end="", flush=True)
                elif event.type == "response.done":
                    print()
                    break
                else:
                    logger.debug(f"event.type: {event.type}")
                    # logger.debug(f"event: {event.model_dump_json(indent=2)}")

    await credential.close()


@app.command()
def chat(
    verbose: bool = typer.Option(
        False,
        "--verbose",
        "-v",
        help="Enable verbose output",
    ),
):
    # Set up logging
    if verbose:
        logger.setLevel(logging.DEBUG)

    asyncio.run(chat_impl())
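
# Example invocation (assumed; run from the repository root with dependencies installed
# and the Azure OpenAI endpoint/API version available to Settings, e.g. via a .env file):
#   python scripts/realtime_operator.py chat --verbose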


@app.command()
def webrtc(
    template: str = typer.Option(
        "scripts/realtime_webrtc.html", "--template", "-t", help="Path to the HTML Jinja2 template"
    ),
    host: str = typer.Option("0.0.0.0", "--host", "-h"),
    port: int = typer.Option(8080, "--port", "-p"),
    web_rtc_url: str = typer.Option(
        "https://eastus2.realtimeapi-preview.ai.azure.com/v1/realtimertc", "--webrtc-url", help="WebRTC endpoint URL"
    ),
    sessions_url: str = typer.Option(
        "https://YourAzureOpenAIResourceName.openai.azure.com/openai/realtimeapi/sessions?api-version=2025-04-01-preview",
        "--sessions-url",
        help="Sessions API URL",
    ),
    deployment: str = typer.Option("gpt-realtime", "--deployment", help="Deployment name"),
    voice: str = typer.Option("verse", "--voice", help="Voice name"),
    instructions: str = typer.Option(
        "You are a helpful AI assistant responding in natural, engaging language.",
        "--instructions",
        "-i",
        help="Initial assistant instructions for the realtime session",
    ),
):
    """
    Render the realtime_webrtc HTML template with the provided parameters and serve it as a static site.

    The template is a Jinja2 template and will receive the following variables:
    - WEBRTC_URL, SESSIONS_URL, API_KEY, DEPLOYMENT, VOICE, INSTRUCTIONS
    """
    settings = Settings()
    api_key = settings.azure_openai_api_key
    if not api_key:
        typer.secho(
            "Warning: no API key provided; the rendered page will contain an empty API key.", fg=typer.colors.YELLOW
        )

    tpl_path = Path(template)
    if not tpl_path.exists():
        typer.secho(f"Template not found: {tpl_path}", fg=typer.colors.RED)
        raise typer.Exit(code=1)

    tpl_text = tpl_path.read_text(encoding="utf-8")

    # Use json.dumps to safely embed JS string literals in the template
    rendered = jinja2.Template(tpl_text).render(
        WEBRTC_URL=json.dumps(web_rtc_url),
        SESSIONS_URL=json.dumps(sessions_url),
        API_KEY=json.dumps(api_key),
        DEPLOYMENT=json.dumps(deployment),
        VOICE=json.dumps(voice),
        INSTRUCTIONS=json.dumps(instructions),
    )

    tempdir = tempfile.TemporaryDirectory()
    out_path = Path(tempdir.name) / "index.html"
    out_path.write_text(rendered, encoding="utf-8")

    # Serve the temporary directory with the rendered HTML
    os.chdir(tempdir.name)
    handler = http.server.SimpleHTTPRequestHandler
    with socketserver.TCPServer((host, port), handler) as httpd:
        url = f"http://{host}:{port}/"
        typer.secho(f"Serving rendered template at: {url}", fg=typer.colors.GREEN)
        try:
            webbrowser.open(url)
        except Exception:
            pass
        try:
            httpd.serve_forever()
        except KeyboardInterrupt:
            typer.secho("Shutting down server...", fg=typer.colors.YELLOW)
        finally:
            tempdir.cleanup()
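
# Example invocation (assumed; replace the placeholder URL with your own resource and region):
#   python scripts/realtime_operator.py webrtc \
#     --sessions-url "https://<your-resource>.openai.azure.com/openai/realtimeapi/sessions?api-version=2025-04-01-preview" \
#     --deployment gpt-realtime --voice verse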


if __name__ == "__main__":
    load_dotenv(
        override=True,
        verbose=True,
    )
    app()
180 changes: 180 additions & 0 deletions scripts/realtime_webrtc.html
@@ -0,0 +1,180 @@
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>Azure OpenAI Realtime Session</title>
  </head>
  <body>
    <h1>Azure OpenAI Realtime Session</h1>
    <p>
      WARNING: Don't use this code sample in production with the API key
      hardcoded. Use a protected backend service to call the sessions API and
      generate the ephemeral key. Then return the ephemeral key to the client.
    </p>
    <button onclick="StartSession()">Start Session</button>

    <!-- Log container for API messages -->
    <div id="logContainer"></div>

    <script>
      // The following constants are rendered from a Jinja2 template at runtime.
      // They must be valid JavaScript string literals (the CLI will json-encode them).
      const WEBRTC_URL = {{ WEBRTC_URL }};

      // The SESSIONS_URL includes the Azure OpenAI resource URL,
      // deployment name, the /realtime/sessions path, and the API version.
      // The Azure OpenAI resource region isn't part of the SESSIONS_URL.
      const SESSIONS_URL = {{ SESSIONS_URL }};

      // The API key of the Azure OpenAI resource. WARNING: do not expose this in
      // production. Use an ephemeral key instead.
      const API_KEY = {{ API_KEY }};

      // The deployment name might not be the same as the model name.
      const DEPLOYMENT = {{ DEPLOYMENT }};
      const VOICE = {{ VOICE }};
      // Injected initial assistant instructions (JSON-encoded string)
      const INSTRUCTIONS = {{ INSTRUCTIONS }};

      async function StartSession() {
        try {
          // WARNING: Don't use this code sample in production
          // with the API key hardcoded.
          // Use a protected backend service to call the
          // sessions API and generate the ephemeral key.
          // Then return the ephemeral key to the client.

          const response = await fetch(SESSIONS_URL, {
            method: "POST",
            headers: {
              //"Authorization": `Bearer ${ACCESS_TOKEN}`,
              "api-key": API_KEY,
              "Content-Type": "application/json",
            },
            body: JSON.stringify({
              model: DEPLOYMENT,
              voice: VOICE,
            }),
          });

          if (!response.ok) {
            throw new Error(`API request failed`);
          }

          const data = await response.json();

          const sessionId = data.id;
          const ephemeralKey = data.client_secret?.value;
          console.error("Ephemeral key:", ephemeralKey);

          // Mask the ephemeral key in the log message.
          logMessage("Ephemeral Key Received: " + "***");
          logMessage("WebRTC Session Id = " + sessionId);

          // Set up the WebRTC connection using the ephemeral key.
          init(ephemeralKey);
        } catch (error) {
          console.error("Error fetching ephemeral key:", error);
          logMessage("Error fetching ephemeral key: " + error.message);
        }
      }

      async function init(ephemeralKey) {
        let peerConnection = new RTCPeerConnection();

        // Set up to play remote audio from the model.
        const audioElement = document.createElement("audio");
        audioElement.autoplay = true;
        document.body.appendChild(audioElement);

        peerConnection.ontrack = (event) => {
          audioElement.srcObject = event.streams[0];
        };

        // Set up data channel for sending and receiving events
        const clientMedia = await navigator.mediaDevices.getUserMedia({
          audio: true,
        });
        const audioTrack = clientMedia.getAudioTracks()[0];
        peerConnection.addTrack(audioTrack);

        const dataChannel =
          peerConnection.createDataChannel("realtime-channel");

        dataChannel.addEventListener("open", () => {
          logMessage("Data channel is open");
          updateSession(dataChannel);
        });

        dataChannel.addEventListener("message", (event) => {
          const realtimeEvent = JSON.parse(event.data);
          console.log(realtimeEvent);
          logMessage(
            "Received server event: " + JSON.stringify(realtimeEvent, null, 2)
          );
          if (realtimeEvent.type === "session.update") {
            const instructions = realtimeEvent.session.instructions;
            logMessage("Instructions: " + instructions);
          } else if (realtimeEvent.type === "session.error") {
            logMessage("Error: " + realtimeEvent.error.message);
          } else if (realtimeEvent.type === "session.end") {
            logMessage("Session ended.");
          }
        });

        dataChannel.addEventListener("close", () => {
          logMessage("Data channel is closed");
        });

        // Start the session using the Session Description Protocol (SDP)
        const offer = await peerConnection.createOffer();
        await peerConnection.setLocalDescription(offer);

        const sdpResponse = await fetch(`${WEBRTC_URL}?model=${DEPLOYMENT}`, {
          method: "POST",
          body: offer.sdp,
          headers: {
            Authorization: `Bearer ${ephemeralKey}`,
            "Content-Type": "application/sdp",
          },
        });

        const answer = { type: "answer", sdp: await sdpResponse.text() };
        await peerConnection.setRemoteDescription(answer);

        const button = document.createElement("button");
        button.innerText = "Close Session";
        button.onclick = stopSession;
        document.body.appendChild(button);

        // Send a client event to update the session
        function updateSession(dataChannel) {
          const event = {
            type: "session.update",
            session: {
              // Use the injected INSTRUCTIONS value so the CLI can control the prompt
              instructions: INSTRUCTIONS,
            },
          };
          dataChannel.send(JSON.stringify(event));
          logMessage("Sent client event: " + JSON.stringify(event, null, 2));
        }

        function stopSession() {
          if (dataChannel) dataChannel.close();
          if (peerConnection) peerConnection.close();
          peerConnection = null;
          logMessage("Session closed.");
        }
      }

      function logMessage(message) {
        const logContainer = document.getElementById("logContainer");
        const p = document.createElement("p");
        p.textContent = message;
        logContainer.appendChild(p);
      }
    </script>
  </body>
</html>
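The page above hardcodes the Azure OpenAI API key, which its inline warning says not to do in production: a protected backend should call the sessions API and return only the ephemeral key to the browser. Below is a minimal sketch of such a backend, assuming FastAPI and httpx (both already listed in pyproject.toml); the route path and environment variable names are illustrative and not part of this PR.

```python
# Hypothetical backend sketch: mint the ephemeral key server-side so the browser
# never sees the long-lived Azure OpenAI API key.
import os

import httpx
from fastapi import FastAPI, HTTPException

app = FastAPI()

# Assumed environment variables; the names are illustrative only.
SESSIONS_URL = os.environ["AZURE_OPENAI_SESSIONS_URL"]
API_KEY = os.environ["AZURE_OPENAI_API_KEY"]


@app.post("/api/realtime-session")
async def create_realtime_session() -> dict:
    # Call the Azure OpenAI sessions API with the server-side key.
    async with httpx.AsyncClient() as client:
        resp = await client.post(
            SESSIONS_URL,
            headers={"api-key": API_KEY, "Content-Type": "application/json"},
            json={"model": "gpt-realtime", "voice": "verse"},
        )
    if resp.status_code >= 400:
        raise HTTPException(status_code=502, detail="sessions API request failed")
    data = resp.json()
    # Return only what the client needs: the session id and the ephemeral key.
    return {
        "id": data.get("id"),
        "ephemeral_key": data.get("client_secret", {}).get("value"),
    }
```

With a backend like this, the browser would POST to /api/realtime-session instead of SESSIONS_URL and use the returned ephemeral_key for the WebRTC SDP exchange.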