Skip to content

Commit 0923888

Browse files
authored
Merge pull request #132 from ks6088ts-labs/feature/issue-131_gpt-realtime
add gpt-realtime examples
2 parents 93b6f12 + 2b2f050 commit 0923888

File tree

5 files changed

+422
-39
lines changed

5 files changed

+422
-39
lines changed

docs/references.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,3 +65,13 @@
6565
- [How To Install libportaudio2 on Ubuntu 22.04](https://www.installati.one/install-libportaudio2-ubuntu-22-04/): `sudo apt-get -y install libportaudio2`
6666
- [python-sounddevice](https://github.com/spatialaudio/python-sounddevice)
6767
- [python-soundfile](https://github.com/bastibe/python-soundfile)
### Realtime API

- [August 2025 / Realtime API audio model GA](https://learn.microsoft.com/en-us/azure/ai-foundry/openai/whats-new#realtime-api-audio-model-ga)
- [Global Standard model availability](https://learn.microsoft.com/en-us/azure/ai-foundry/openai/concepts/models?tabs=global-standard%2Cstandard-chat-completions#global-standard-model-availability)
- [specification/cognitiveservices/data-plane/AzureOpenAI/inference/preview/2025-04-01-preview/inference.json](https://github.com/Azure/azure-rest-api-specs/blob/main/specification/cognitiveservices/data-plane/AzureOpenAI/inference/preview/2025-04-01-preview/inference.json)
- [Realtime API with WebSocket](https://platform.openai.com/docs/guides/realtime-websocket)
- [GPT-4o Realtime API for speech and audio](https://learn.microsoft.com/en-us/azure/ai-foundry/openai/realtime-audio-quickstart?tabs=keyless%2Clinux&pivots=programming-language-python)
- [OpenAI Python API library > examples/realtime](https://github.com/openai/openai-python/tree/main/examples/realtime)
- [How to use the GPT-4o Realtime API via WebRTC](https://learn.microsoft.com/en-us/azure/ai-foundry/openai/how-to/realtime-audio-webrtc)

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ dependencies = [
1212
"elasticsearch>=9.1.0",
1313
"fastapi[standard]>=0.116.1",
1414
"httpx>=0.28.1",
15+
"jinja2>=3.1.2",
1516
"langchain-azure-ai>=0.1.4",
1617
"langchain-community>=0.3.27",
1718
"langchain-mcp-adapters>=0.1.9",
@@ -20,7 +21,7 @@ dependencies = [
2021
"langchain-text-splitters>=0.3.9",
2122
"langgraph>=0.6.2",
2223
"langgraph-supervisor>=0.0.29",
23-
"openai>=1.98.0",
24+
"openai[realtime]>=1.98.0",
2425
"opentelemetry-api>=1.36.0",
2526
"opentelemetry-exporter-otlp>=1.36.0",
2627
"opentelemetry-sdk>=1.36.0",

scripts/realtime_operator.py

Lines changed: 185 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,185 @@
1+
import asyncio
2+
import http.server
3+
import json
4+
import logging
5+
import os
6+
import socketserver
7+
import tempfile
8+
import webbrowser
9+
from pathlib import Path
10+
11+
# New imports for template rendering and serving
12+
import jinja2
13+
import typer
14+
from azure.identity.aio import DefaultAzureCredential, get_bearer_token_provider
15+
from dotenv import load_dotenv
16+
from openai import AsyncAzureOpenAI
17+
18+
from template_langgraph.llms.azure_openais import Settings
19+
from template_langgraph.loggers import get_logger
20+
21+
# Typer CLI entry point for this operator script; shell completion is
# disabled since the tool is invoked ad hoc rather than installed.
app = typer.Typer(
    add_completion=False,
    help="Realtime API operator CLI",
)

# Module-level logger obtained from the project's logging helper.
logger = get_logger(__name__)
29+
30+
31+
async def chat_impl() -> None:
    """
    Run an interactive console chat against an Azure OpenAI gpt-realtime deployment.

    When prompted for user input, type a message and hit enter to send it to the model.
    Enter "q" to quit the conversation.
    """
    # Keyless auth: exchange the Azure credential for bearer tokens scoped to
    # Cognitive Services instead of embedding an API key.
    credential = DefaultAzureCredential()
    token_provider = get_bearer_token_provider(credential, "https://cognitiveservices.azure.com/.default")
    settings = Settings()
    client = AsyncAzureOpenAI(
        azure_endpoint=settings.azure_openai_endpoint,
        azure_ad_token_provider=token_provider,
        api_version=settings.azure_openai_api_version,
    )
    # Open a realtime (WebSocket) connection to the deployment.
    async with client.realtime.connect(
        model="gpt-realtime",  # name of your deployment
    ) as connection:
        # Configure the session to emit both text and audio output.
        await connection.session.update(
            session={
                "output_modalities": [
                    "text",
                    "audio",
                ],
                "model": "gpt-realtime",
                "type": "realtime",
            }
        )
        while True:
            user_input = input("Enter a message: ")
            if user_input == "q":
                break

            # Append the user's message to the conversation, then ask the
            # server to generate a response for it.
            await connection.conversation.item.create(
                item={
                    "type": "message",
                    "role": "user",
                    "content": [
                        {
                            "type": "input_text",
                            "text": user_input,
                        },
                    ],
                }
            )
            await connection.response.create()
            # Stream server events: print transcript deltas as they arrive and
            # stop reading this response once the server marks it done.
            async for event in connection:
                if event.type == "response.audio_transcript.delta":
                    print(event.delta, end="", flush=True)
                elif event.type == "response.done":
                    print()
                    break
                else:
                    # Unhandled event types are logged at debug level only.
                    logger.debug(f"event.type: {event.type}")
                    # logger.debug(f"event: {event.model_dump_json(indent=2)}")

    await credential.close()
86+
87+
88+
@app.command()
def chat(
    verbose: bool = typer.Option(
        False,
        "--verbose",
        "-v",
        help="Enable verbose output",
    ),
):
    """Start an interactive text chat session with the gpt-realtime deployment."""
    # Raise the logger to DEBUG before the session starts when requested.
    if verbose:
        logger.setLevel(logging.DEBUG)
    # The realtime client is fully async; drive it from this sync command.
    asyncio.run(chat_impl())
102+
103+
104+
@app.command()
def webrtc(
    template: str = typer.Option(
        "scripts/realtime_webrtc.html", "--template", "-t", help="Path to the HTML Jinja2 template"
    ),
    host: str = typer.Option("0.0.0.0", "--host", "-h"),
    port: int = typer.Option(8080, "--port", "-p"),
    web_rtc_url: str = typer.Option(
        "https://eastus2.realtimeapi-preview.ai.azure.com/v1/realtimertc", "--webrtc-url", help="WebRTC endpoint URL"
    ),
    sessions_url: str = typer.Option(
        "https://YourAzureOpenAIResourceName.openai.azure.com/openai/realtimeapi/sessions?api-version=2025-04-01-preview",
        "--sessions-url",
        help="Sessions API URL",
    ),
    deployment: str = typer.Option("gpt-realtime", "--deployment", help="Deployment name"),
    voice: str = typer.Option("verse", "--voice", help="Voice name"),
    instructions: str = typer.Option(
        "You are a helpful AI assistant responding in natural, engaging language.",
        "--instructions",
        "-i",
        help="Initial assistant instructions for the realtime session",
    ),
):
    """
    Render the realtime_webrtc HTML template with provided parameters and serve it as a static site.

    The template is a Jinja2 template and will receive the following variables:
    - WEBRTC_URL, SESSIONS_URL, API_KEY, DEPLOYMENT, VOICE, INSTRUCTIONS

    Raises:
        typer.Exit: with code 1 when the template file does not exist.
    """
    from functools import partial

    settings = Settings()
    api_key = settings.azure_openai_api_key
    if not api_key:
        # Best-effort warning: the page is still rendered, just without a key.
        typer.secho(
            "Warning: no API key provided; the rendered page will contain an empty API key.", fg=typer.colors.YELLOW
        )

    tpl_path = Path(template)
    if not tpl_path.exists():
        typer.secho(f"Template not found: {tpl_path}", fg=typer.colors.RED)
        raise typer.Exit(code=1)

    tpl_text = tpl_path.read_text(encoding="utf-8")

    # Use json.dumps to safely embed JS string literals in the template
    rendered = jinja2.Template(tpl_text).render(
        WEBRTC_URL=json.dumps(web_rtc_url),
        SESSIONS_URL=json.dumps(sessions_url),
        API_KEY=json.dumps(api_key),
        DEPLOYMENT=json.dumps(deployment),
        VOICE=json.dumps(voice),
        INSTRUCTIONS=json.dumps(instructions),
    )

    tempdir = tempfile.TemporaryDirectory()
    out_path = Path(tempdir.name) / "index.html"
    out_path.write_text(rendered, encoding="utf-8")

    # FIX: serve the temp directory via the handler's `directory` argument
    # instead of os.chdir(), which mutated process-global state (the CWD) for
    # the rest of the program's lifetime.
    handler = partial(http.server.SimpleHTTPRequestHandler, directory=tempdir.name)
    with socketserver.TCPServer((host, port), handler) as httpd:
        typer.secho(f"Serving rendered template at: http://{host}:{port}/", fg=typer.colors.GREEN)
        # FIX: 0.0.0.0 is a bind address, not a browsable host — point the
        # local browser at localhost in that case.
        browse_host = "localhost" if host == "0.0.0.0" else host
        try:
            webbrowser.open(f"http://{browse_host}:{port}/")
        except Exception:
            # Opening a browser is a convenience only; never fail the server.
            pass
        try:
            httpd.serve_forever()
        except KeyboardInterrupt:
            typer.secho("Shutting down server...", fg=typer.colors.YELLOW)
        finally:
            # Remove the rendered page (it may contain the API key) on exit.
            tempdir.cleanup()
178+
179+
180+
if __name__ == "__main__":
    # Load environment variables before the CLI runs so Settings() sees them;
    # override=True lets the .env file win over inherited shell variables.
    load_dotenv(override=True, verbose=True)
    app()

scripts/realtime_webrtc.html

Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,180 @@
1+
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>Azure OpenAI Realtime Session</title>
  </head>
  <body>
    <h1>Azure OpenAI Realtime Session</h1>
    <p>
      WARNING: Don't use this code sample in production with the API key
      hardcoded. Use a protected backend service to call the sessions API and
      generate the ephemeral key. Then return the ephemeral key to the client.
    </p>
    <button onclick="StartSession()">Start Session</button>

    <!-- Log container for API messages -->
    <div id="logContainer"></div>

    <script>
      // The following constants are rendered from a Jinja2 template at runtime.
      // They must be valid JavaScript string literals (the CLI will json-encode them).
      const WEBRTC_URL = {{ WEBRTC_URL }};

      // The SESSIONS_URL includes the Azure OpenAI resource URL,
      // deployment name, the /realtime/sessions path, and the API version.
      // The Azure OpenAI resource region isn't part of the SESSIONS_URL.
      const SESSIONS_URL = {{ SESSIONS_URL }};

      // The API key of the Azure OpenAI resource. WARNING: do not expose this in
      // production. Use an ephemeral key instead.
      const API_KEY = {{ API_KEY }};

      // The deployment name might not be the same as the model name.
      const DEPLOYMENT = {{ DEPLOYMENT }};
      const VOICE = {{ VOICE }};
      // Injected initial assistant instructions (JSON-encoded string)
      const INSTRUCTIONS = {{ INSTRUCTIONS }};

      // Request an ephemeral key from the sessions API, then hand it to the
      // WebRTC setup. Wired to the "Start Session" button.
      async function StartSession() {
        try {
          // WARNING: Don't use this code sample in production
          // with the API key hardcoded.
          // Use a protected backend service to call the
          // sessions API and generate the ephemeral key.
          // Then return the ephemeral key to the client.

          const response = await fetch(SESSIONS_URL, {
            method: "POST",
            headers: {
              //"Authorization": `Bearer ${ACCESS_TOKEN}`,
              "api-key": API_KEY,
              "Content-Type": "application/json",
            },
            body: JSON.stringify({
              model: DEPLOYMENT,
              voice: VOICE,
            }),
          });

          if (!response.ok) {
            throw new Error(`API request failed`);
          }

          const data = await response.json();

          const sessionId = data.id;
          const ephemeralKey = data.client_secret?.value;

          // FIX: do not log the ephemeral key — the previous
          // console.error(...) call leaked the secret to the console while
          // the on-page log deliberately masked it.
          logMessage("Ephemeral Key Received: " + "***");
          logMessage("WebRTC Session Id = " + sessionId);

          // Set up the WebRTC connection using the ephemeral key.
          init(ephemeralKey);
        } catch (error) {
          console.error("Error fetching ephemeral key:", error);
          logMessage("Error fetching ephemeral key: " + error.message);
        }
      }

      // Establish the WebRTC peer connection, wire up audio in/out and the
      // realtime data channel, and perform the SDP offer/answer exchange.
      async function init(ephemeralKey) {
        let peerConnection = new RTCPeerConnection();

        // Set up to play remote audio from the model.
        const audioElement = document.createElement("audio");
        audioElement.autoplay = true;
        document.body.appendChild(audioElement);

        peerConnection.ontrack = (event) => {
          audioElement.srcObject = event.streams[0];
        };

        // Capture the microphone and send it to the model.
        const clientMedia = await navigator.mediaDevices.getUserMedia({
          audio: true,
        });
        const audioTrack = clientMedia.getAudioTracks()[0];
        peerConnection.addTrack(audioTrack);

        // Data channel for sending and receiving realtime events.
        const dataChannel =
          peerConnection.createDataChannel("realtime-channel");

        dataChannel.addEventListener("open", () => {
          logMessage("Data channel is open");
          updateSession(dataChannel);
        });

        dataChannel.addEventListener("message", (event) => {
          const realtimeEvent = JSON.parse(event.data);
          console.log(realtimeEvent);
          logMessage(
            "Received server event: " + JSON.stringify(realtimeEvent, null, 2)
          );
          // NOTE(review): these event-type names follow the original sample;
          // confirm against the current Realtime API server-event schema.
          if (realtimeEvent.type === "session.update") {
            const instructions = realtimeEvent.session.instructions;
            logMessage("Instructions: " + instructions);
          } else if (realtimeEvent.type === "session.error") {
            logMessage("Error: " + realtimeEvent.error.message);
          } else if (realtimeEvent.type === "session.end") {
            logMessage("Session ended.");
          }
        });

        dataChannel.addEventListener("close", () => {
          logMessage("Data channel is closed");
        });

        // Start the session using the Session Description Protocol (SDP)
        const offer = await peerConnection.createOffer();
        await peerConnection.setLocalDescription(offer);

        const sdpResponse = await fetch(`${WEBRTC_URL}?model=${DEPLOYMENT}`, {
          method: "POST",
          body: offer.sdp,
          headers: {
            Authorization: `Bearer ${ephemeralKey}`,
            "Content-Type": "application/sdp",
          },
        });

        const answer = { type: "answer", sdp: await sdpResponse.text() };
        await peerConnection.setRemoteDescription(answer);

        const button = document.createElement("button");
        button.innerText = "Close Session";
        button.onclick = stopSession;
        document.body.appendChild(button);

        // Send a client event to update the session
        function updateSession(dataChannel) {
          const event = {
            type: "session.update",
            session: {
              // Use the injected INSTRUCTIONS value so the CLI can control the prompt
              instructions: INSTRUCTIONS,
            },
          };
          dataChannel.send(JSON.stringify(event));
          logMessage("Sent client event: " + JSON.stringify(event, null, 2));
        }

        // Close the data channel and peer connection (closure over init's locals).
        function stopSession() {
          if (dataChannel) dataChannel.close();
          if (peerConnection) peerConnection.close();
          peerConnection = null;
          logMessage("Session closed.");
        }
      }

      // Append a message paragraph to the on-page log container.
      function logMessage(message) {
        const logContainer = document.getElementById("logContainer");
        const p = document.createElement("p");
        p.textContent = message;
        logContainer.appendChild(p);
      }
    </script>
  </body>
</html>

0 commit comments

Comments
 (0)