Skip to content

Commit 0923888

Browse files
authored
Merge pull request #132 from ks6088ts-labs/feature/issue-131_gpt-realtime
add gpt-realtime examples
2 parents 93b6f12 + 2b2f050 commit 0923888

File tree

5 files changed

+422
-39
lines changed

5 files changed

+422
-39
lines changed

docs/references.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,3 +65,13 @@
6565
- [How To Install libportaudio2 on Ubuntu 22.04](https://www.installati.one/install-libportaudio2-ubuntu-22-04/): `sudo apt-get -y install libportaudio2`
6666
- [python-sounddevice](https://github.com/spatialaudio/python-sounddevice)
6767
- [python-soundfile](https://github.com/bastibe/python-soundfile)
### Realtime API

- [August 2025 / Realtime API audio model GA](https://learn.microsoft.com/en-us/azure/ai-foundry/openai/whats-new#realtime-api-audio-model-ga)
- [Global Standard model availability](https://learn.microsoft.com/en-us/azure/ai-foundry/openai/concepts/models?tabs=global-standard%2Cstandard-chat-completions#global-standard-model-availability)
- [specification/cognitiveservices/data-plane/AzureOpenAI/inference/preview/2025-04-01-preview/inference.json](https://github.com/Azure/azure-rest-api-specs/blob/main/specification/cognitiveservices/data-plane/AzureOpenAI/inference/preview/2025-04-01-preview/inference.json)
- [Realtime API with WebSocket](https://platform.openai.com/docs/guides/realtime-websocket)
- [GPT-4o Realtime API for speech and audio](https://learn.microsoft.com/en-us/azure/ai-foundry/openai/realtime-audio-quickstart?tabs=keyless%2Clinux&pivots=programming-language-python)
- [OpenAI Python API library > examples/realtime](https://github.com/openai/openai-python/tree/main/examples/realtime)
- [How to use the GPT-4o Realtime API via WebRTC](https://learn.microsoft.com/en-us/azure/ai-foundry/openai/how-to/realtime-audio-webrtc)

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ dependencies = [
1212
"elasticsearch>=9.1.0",
1313
"fastapi[standard]>=0.116.1",
1414
"httpx>=0.28.1",
15+
"jinja2>=3.1.2",
1516
"langchain-azure-ai>=0.1.4",
1617
"langchain-community>=0.3.27",
1718
"langchain-mcp-adapters>=0.1.9",
@@ -20,7 +21,7 @@ dependencies = [
2021
"langchain-text-splitters>=0.3.9",
2122
"langgraph>=0.6.2",
2223
"langgraph-supervisor>=0.0.29",
23-
"openai>=1.98.0",
24+
"openai[realtime]>=1.98.0",
2425
"opentelemetry-api>=1.36.0",
2526
"opentelemetry-exporter-otlp>=1.36.0",
2627
"opentelemetry-sdk>=1.36.0",

scripts/realtime_operator.py

Lines changed: 185 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,185 @@
1+
import asyncio
2+
import http.server
3+
import json
4+
import logging
5+
import os
6+
import socketserver
7+
import tempfile
8+
import webbrowser
9+
from pathlib import Path
10+
11+
# New imports for template rendering and serving
12+
import jinja2
13+
import typer
14+
from azure.identity.aio import DefaultAzureCredential, get_bearer_token_provider
15+
from dotenv import load_dotenv
16+
from openai import AsyncAzureOpenAI
17+
18+
from template_langgraph.llms.azure_openais import Settings
19+
from template_langgraph.loggers import get_logger
20+
21+
# Typer CLI entry point for this operator script; shell completion is
# disabled since the tool is invoked ad hoc rather than installed.
app = typer.Typer(
    add_completion=False,
    help="Realtime API operator CLI",
)

# Module-level logger obtained from the project's logging helper.
logger = get_logger(__name__)
29+
30+
31+
async def chat_impl() -> None:
    """
    Run an interactive console chat against an Azure OpenAI gpt-realtime deployment.

    When prompted for user input, type a message and hit enter to send it to the model.
    Enter "q" to quit the conversation.
    """
    # Keyless auth: exchange the Azure credential for bearer tokens scoped to
    # Cognitive Services instead of embedding an API key.
    credential = DefaultAzureCredential()
    token_provider = get_bearer_token_provider(credential, "https://cognitiveservices.azure.com/.default")
    settings = Settings()
    client = AsyncAzureOpenAI(
        azure_endpoint=settings.azure_openai_endpoint,
        azure_ad_token_provider=token_provider,
        api_version=settings.azure_openai_api_version,
    )
    # Open a realtime (WebSocket) connection to the deployment.
    async with client.realtime.connect(
        model="gpt-realtime",  # name of your deployment
    ) as connection:
        # Configure the session to emit both text and audio output.
        await connection.session.update(
            session={
                "output_modalities": [
                    "text",
                    "audio",
                ],
                "model": "gpt-realtime",
                "type": "realtime",
            }
        )
        while True:
            user_input = input("Enter a message: ")
            if user_input == "q":
                break

            # Append the user's message to the conversation, then ask the
            # server to generate a response for it.
            await connection.conversation.item.create(
                item={
                    "type": "message",
                    "role": "user",
                    "content": [
                        {
                            "type": "input_text",
                            "text": user_input,
                        },
                    ],
                }
            )
            await connection.response.create()
            # Stream server events: print transcript deltas as they arrive and
            # stop reading this response once the server marks it done.
            async for event in connection:
                if event.type == "response.audio_transcript.delta":
                    print(event.delta, end="", flush=True)
                elif event.type == "response.done":
                    print()
                    break
                else:
                    # Unhandled event types are logged at debug level only.
                    logger.debug(f"event.type: {event.type}")
                    # logger.debug(f"event: {event.model_dump_json(indent=2)}")

    await credential.close()
86+
87+
88+
@app.command()
def chat(
    verbose: bool = typer.Option(
        False,
        "--verbose",
        "-v",
        help="Enable verbose output",
    ),
):
    """Start an interactive text chat session with the gpt-realtime deployment."""
    # Raise the logger to DEBUG before the session starts when requested.
    if verbose:
        logger.setLevel(logging.DEBUG)
    # The realtime client is fully async; drive it from this sync command.
    asyncio.run(chat_impl())
102+
103+
104+
@app.command()
def webrtc(
    template: str = typer.Option(
        "scripts/realtime_webrtc.html", "--template", "-t", help="Path to the HTML Jinja2 template"
    ),
    host: str = typer.Option("0.0.0.0", "--host", "-h"),
    port: int = typer.Option(8080, "--port", "-p"),
    web_rtc_url: str = typer.Option(
        "https://eastus2.realtimeapi-preview.ai.azure.com/v1/realtimertc", "--webrtc-url", help="WebRTC endpoint URL"
    ),
    sessions_url: str = typer.Option(
        "https://YourAzureOpenAIResourceName.openai.azure.com/openai/realtimeapi/sessions?api-version=2025-04-01-preview",
        "--sessions-url",
        help="Sessions API URL",
    ),
    deployment: str = typer.Option("gpt-realtime", "--deployment", help="Deployment name"),
    voice: str = typer.Option("verse", "--voice", help="Voice name"),
    instructions: str = typer.Option(
        "You are a helpful AI assistant responding in natural, engaging language.",
        "--instructions",
        "-i",
        help="Initial assistant instructions for the realtime session",
    ),
):
    """
    Render the realtime_webrtc HTML template with provided parameters and serve it as a static site.

    The template is a Jinja2 template and will receive the following variables:
    - WEBRTC_URL, SESSIONS_URL, API_KEY, DEPLOYMENT, VOICE, INSTRUCTIONS

    Raises:
        typer.Exit: with code 1 when the template file does not exist.
    """
    from functools import partial

    settings = Settings()
    api_key = settings.azure_openai_api_key
    if not api_key:
        # Best-effort warning: the page is still rendered, just without a key.
        typer.secho(
            "Warning: no API key provided; the rendered page will contain an empty API key.", fg=typer.colors.YELLOW
        )

    tpl_path = Path(template)
    if not tpl_path.exists():
        typer.secho(f"Template not found: {tpl_path}", fg=typer.colors.RED)
        raise typer.Exit(code=1)

    tpl_text = tpl_path.read_text(encoding="utf-8")

    # Use json.dumps to safely embed JS string literals in the template
    rendered = jinja2.Template(tpl_text).render(
        WEBRTC_URL=json.dumps(web_rtc_url),
        SESSIONS_URL=json.dumps(sessions_url),
        API_KEY=json.dumps(api_key),
        DEPLOYMENT=json.dumps(deployment),
        VOICE=json.dumps(voice),
        INSTRUCTIONS=json.dumps(instructions),
    )

    tempdir = tempfile.TemporaryDirectory()
    out_path = Path(tempdir.name) / "index.html"
    out_path.write_text(rendered, encoding="utf-8")

    # FIX: serve the temp directory via the handler's `directory` argument
    # instead of os.chdir(), which mutated process-global state (the CWD) for
    # the rest of the program's lifetime.
    handler = partial(http.server.SimpleHTTPRequestHandler, directory=tempdir.name)
    with socketserver.TCPServer((host, port), handler) as httpd:
        typer.secho(f"Serving rendered template at: http://{host}:{port}/", fg=typer.colors.GREEN)
        # FIX: 0.0.0.0 is a bind address, not a browsable host — point the
        # local browser at localhost in that case.
        browse_host = "localhost" if host == "0.0.0.0" else host
        try:
            webbrowser.open(f"http://{browse_host}:{port}/")
        except Exception:
            # Opening a browser is a convenience only; never fail the server.
            pass
        try:
            httpd.serve_forever()
        except KeyboardInterrupt:
            typer.secho("Shutting down server...", fg=typer.colors.YELLOW)
        finally:
            # Remove the rendered page (it may contain the API key) on exit.
            tempdir.cleanup()
178+
179+
180+
if __name__ == "__main__":
    # Load environment variables before the CLI runs so Settings() sees them;
    # override=True lets the .env file win over inherited shell variables.
    load_dotenv(override=True, verbose=True)
    app()

scripts/realtime_webrtc.html

Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,180 @@
1+
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>Azure OpenAI Realtime Session</title>
  </head>
  <body>
    <h1>Azure OpenAI Realtime Session</h1>
    <p>
      WARNING: Don't use this code sample in production with the API key
      hardcoded. Use a protected backend service to call the sessions API and
      generate the ephemeral key. Then return the ephemeral key to the client.
    </p>
    <button onclick="StartSession()">Start Session</button>

    <!-- Log container for API messages -->
    <div id="logContainer"></div>

    <script>
      // The following constants are rendered from a Jinja2 template at runtime.
      // They must be valid JavaScript string literals (the CLI will json-encode them).
      const WEBRTC_URL = {{ WEBRTC_URL }};

      // The SESSIONS_URL includes the Azure OpenAI resource URL,
      // deployment name, the /realtime/sessions path, and the API version.
      // The Azure OpenAI resource region isn't part of the SESSIONS_URL.
      const SESSIONS_URL = {{ SESSIONS_URL }};

      // The API key of the Azure OpenAI resource. WARNING: do not expose this in
      // production. Use an ephemeral key instead.
      const API_KEY = {{ API_KEY }};

      // The deployment name might not be the same as the model name.
      const DEPLOYMENT = {{ DEPLOYMENT }};
      const VOICE = {{ VOICE }};
      // Injected initial assistant instructions (JSON-encoded string)
      const INSTRUCTIONS = {{ INSTRUCTIONS }};

      // Request an ephemeral key from the sessions API, then hand it to the
      // WebRTC setup. Wired to the "Start Session" button.
      async function StartSession() {
        try {
          // WARNING: Don't use this code sample in production
          // with the API key hardcoded.
          // Use a protected backend service to call the
          // sessions API and generate the ephemeral key.
          // Then return the ephemeral key to the client.

          const response = await fetch(SESSIONS_URL, {
            method: "POST",
            headers: {
              //"Authorization": `Bearer ${ACCESS_TOKEN}`,
              "api-key": API_KEY,
              "Content-Type": "application/json",
            },
            body: JSON.stringify({
              model: DEPLOYMENT,
              voice: VOICE,
            }),
          });

          if (!response.ok) {
            throw new Error(`API request failed`);
          }

          const data = await response.json();

          const sessionId = data.id;
          const ephemeralKey = data.client_secret?.value;

          // FIX: do not log the ephemeral key — the previous
          // console.error(...) call leaked the secret to the console while
          // the on-page log deliberately masked it.
          logMessage("Ephemeral Key Received: " + "***");
          logMessage("WebRTC Session Id = " + sessionId);

          // Set up the WebRTC connection using the ephemeral key.
          init(ephemeralKey);
        } catch (error) {
          console.error("Error fetching ephemeral key:", error);
          logMessage("Error fetching ephemeral key: " + error.message);
        }
      }

      // Establish the WebRTC peer connection, wire up audio in/out and the
      // realtime data channel, and perform the SDP offer/answer exchange.
      async function init(ephemeralKey) {
        let peerConnection = new RTCPeerConnection();

        // Set up to play remote audio from the model.
        const audioElement = document.createElement("audio");
        audioElement.autoplay = true;
        document.body.appendChild(audioElement);

        peerConnection.ontrack = (event) => {
          audioElement.srcObject = event.streams[0];
        };

        // Capture the microphone and send it to the model.
        const clientMedia = await navigator.mediaDevices.getUserMedia({
          audio: true,
        });
        const audioTrack = clientMedia.getAudioTracks()[0];
        peerConnection.addTrack(audioTrack);

        // Data channel for sending and receiving realtime events.
        const dataChannel =
          peerConnection.createDataChannel("realtime-channel");

        dataChannel.addEventListener("open", () => {
          logMessage("Data channel is open");
          updateSession(dataChannel);
        });

        dataChannel.addEventListener("message", (event) => {
          const realtimeEvent = JSON.parse(event.data);
          console.log(realtimeEvent);
          logMessage(
            "Received server event: " + JSON.stringify(realtimeEvent, null, 2)
          );
          // NOTE(review): these event-type names follow the original sample;
          // confirm against the current Realtime API server-event schema.
          if (realtimeEvent.type === "session.update") {
            const instructions = realtimeEvent.session.instructions;
            logMessage("Instructions: " + instructions);
          } else if (realtimeEvent.type === "session.error") {
            logMessage("Error: " + realtimeEvent.error.message);
          } else if (realtimeEvent.type === "session.end") {
            logMessage("Session ended.");
          }
        });

        dataChannel.addEventListener("close", () => {
          logMessage("Data channel is closed");
        });

        // Start the session using the Session Description Protocol (SDP)
        const offer = await peerConnection.createOffer();
        await peerConnection.setLocalDescription(offer);

        const sdpResponse = await fetch(`${WEBRTC_URL}?model=${DEPLOYMENT}`, {
          method: "POST",
          body: offer.sdp,
          headers: {
            Authorization: `Bearer ${ephemeralKey}`,
            "Content-Type": "application/sdp",
          },
        });

        const answer = { type: "answer", sdp: await sdpResponse.text() };
        await peerConnection.setRemoteDescription(answer);

        const button = document.createElement("button");
        button.innerText = "Close Session";
        button.onclick = stopSession;
        document.body.appendChild(button);

        // Send a client event to update the session
        function updateSession(dataChannel) {
          const event = {
            type: "session.update",
            session: {
              // Use the injected INSTRUCTIONS value so the CLI can control the prompt
              instructions: INSTRUCTIONS,
            },
          };
          dataChannel.send(JSON.stringify(event));
          logMessage("Sent client event: " + JSON.stringify(event, null, 2));
        }

        // Close the data channel and peer connection (closure over init's locals).
        function stopSession() {
          if (dataChannel) dataChannel.close();
          if (peerConnection) peerConnection.close();
          peerConnection = null;
          logMessage("Session closed.");
        }
      }

      // Append a message paragraph to the on-page log container.
      function logMessage(message) {
        const logContainer = document.getElementById("logContainer");
        const p = document.createElement("p");
        p.textContent = message;
        logContainer.appendChild(p);
      }
    </script>
  </body>
</html>

0 commit comments

Comments
 (0)