Merge pull request #3647 from eric-urban/eur/realtime-python

JamesJBarnett · web-flow · commit 75d034915ed7 · 2025-03-21T09:56:10.000-07:00
update realtime python quickstart
diff --git a/articles/ai-services/openai/includes/realtime-python.md b/articles/ai-services/openai/includes/realtime-python.md
@@ -62,11 +62,15 @@ For the recommended keyless authentication with Microsoft Entra ID, you need to:
     > [!TIP]
     > We recommend that you create and activate a new Python environment to use to install the packages you need for this tutorial. Don't install packages into your global python installation. You should always use a virtual or conda environment when installing python packages, otherwise you can break your global installation of Python.
 
-1. Install the real-time audio client library for Python with:
+
+1. Install the OpenAI Python client library with:
 
     ```console
-    pip install "https://github.com/Azure-Samples/aoai-realtime-audio-sdk/releases/download/py%2Fv0.5.3/rtclient-0.5.3.tar.gz"
+    pip install openai[realtime]
     ```
+    
+    > [!NOTE]
+    > This library is maintained by OpenAI. Refer to the [release history](https://github.com/openai/openai-python/releases) to track the latest updates to the library.
 
 1. For the **recommended** keyless authentication with Microsoft Entra ID, install the `azure-identity` package with:
 
@@ -78,122 +82,138 @@ For the recommended keyless authentication with Microsoft Entra ID, you need to:
 
 [!INCLUDE [resource authentication](resource-authentication.md)]
 
+> [!CAUTION]
+> To use the recommended keyless authentication with the SDK, make sure that the `AZURE_OPENAI_API_KEY` environment variable isn't set. 
+
 ## Text in audio out
 
 ## [Microsoft Entra ID](#tab/keyless)
 
 1. Create the `text-in-audio-out.py` file with the following code:
 
     ```python
+    import os
     import base64
     import asyncio
-    from azure.identity.aio import DefaultAzureCredential
-    from rtclient import (
-        ResponseCreateMessage,
-        RTLowLevelClient,
-        ResponseCreateParams
-    )
-    
-    # Set environment variables or edit the corresponding values here.
-    endpoint = os.environ["AZURE_OPENAI_ENDPOINT"] or "https://<your-resource-name>.openai.azure.com/"
-    deployment = os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"] or "gpt-4o-mini-realtime-preview"
-    
-    async def text_in_audio_out():
-        async with RTLowLevelClient(
-            url=endpoint,
-            azure_deployment=deployment,
-            token_credential=DefaultAzureCredential(),
-        ) as client:
-            await client.send(
-                ResponseCreateMessage(
-                    response=ResponseCreateParams(
-                        modalities={"audio", "text"}, 
-                        instructions="Please assist the user."
-                    )
+    from openai import AsyncAzureOpenAI
+    from azure.identity.aio import DefaultAzureCredential, get_bearer_token_provider
+    
+    async def main() -> None:
+        """
+        When prompted for user input, type a message and hit enter to send it to the model.
+        Enter "q" to quit the conversation.
+        """
+    
+        credential = DefaultAzureCredential()
+        token_provider=get_bearer_token_provider(credential, "https://cognitiveservices.azure.com/.default")
+        client = AsyncAzureOpenAI(
+            azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
+            azure_ad_token_provider=token_provider,
+            api_version="2024-10-01-preview",
+        )
+        async with client.beta.realtime.connect(
+            model="gpt-4o-realtime-preview",  # name of your deployment
+        ) as connection:
+            await connection.session.update(session={"modalities": ["text", "audio"]})  
+            while True:
+                user_input = input("Enter a message: ")
+                if user_input == "q":
+                    break
+    
+                await connection.conversation.item.create(
+                    item={
+                        "type": "message",
+                        "role": "user",
+                        "content": [{"type": "input_text", "text": user_input}],
+                    }
                 )
-            )
-            done = False
-            while not done:
-                message = await client.recv()
-                match message.type:
-                    case "response.done":
-                        done = True
-                    case "error":
-                        done = True
-                        print(message.error)
-                    case "response.audio_transcript.delta":
-                        print(f"Received text delta: {message.delta}")
-                    case "response.audio.delta":
-                        buffer = base64.b64decode(message.delta)
-                        print(f"Received {len(buffer)} bytes of audio data.")
-                    case _:
-                        pass
-    
-    async def main():
-        await text_in_audio_out()
+                await connection.response.create()
+                async for event in connection:
+                    if event.type == "response.text.delta":
+                        print(event.delta, flush=True, end="")
+                    elif event.type == "response.audio.delta":
+                        
+                        audio_data = base64.b64decode(event.delta)
+                        print(f"Received {len(audio_data)} bytes of audio data.")
+                    elif event.type == "response.audio_transcript.delta":
+                        print(f"Received text delta: {event.delta}")
+                    elif event.type == "response.text.done":
+                        print()
+                    elif event.type == "response.done":
+                        break
+    
+        await credential.close()
     
     asyncio.run(main())
     ```
 
+1. Sign in to Azure with the following command:
+
+    ```shell
+    az login
+    ```
+
 1. Run the Python file.
 
     ```shell
     python text-in-audio-out.py
     ```
 
+1. When prompted for user input, type a message and hit enter to send it to the model. Enter "q" to quit the conversation.
+
 ## [API key](#tab/api-key)
 
 1. Create the `text-in-audio-out.py` file with the following code:
 
     ```python
+    import os
     import base64
     import asyncio
-    from azure.core.credentials import AzureKeyCredential
-    from rtclient import (
-        ResponseCreateMessage,
-        RTLowLevelClient,
-        ResponseCreateParams
-    )
-    
-    # Set environment variables or edit the corresponding values here.
-    api_key = os.environ["AZURE_OPENAI_API_KEY"]    
-    endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
-    deployment = "gpt-4o-mini-realtime-preview"
-    
-    async def text_in_audio_out():
-        async with RTLowLevelClient(
-            url=endpoint,
-            azure_deployment=deployment,
-            key_credential=AzureKeyCredential(api_key) 
-        ) as client:
-            await client.send(
-                ResponseCreateMessage(
-                    response=ResponseCreateParams(
-                        modalities={"audio", "text"}, 
-                        instructions="Please assist the user."
-                    )
-                )
-            )
-            done = False
-            while not done:
-                message = await client.recv()
-                match message.type:
-                    case "response.done":
-                        done = True
-                    case "error":
-                        done = True
-                        print(message.error)
-                    case "response.audio_transcript.delta":
-                        print(f"Received text delta: {message.delta}")
-                    case "response.audio.delta":
-                        buffer = base64.b64decode(message.delta)
-                        print(f"Received {len(buffer)} bytes of audio data.")
-                    case _:
-                        pass
-    
-    async def main():
-        await text_in_audio_out()
+    from openai import AsyncAzureOpenAI
+    from azure.identity.aio import DefaultAzureCredential, get_bearer_token_provider
     
+    async def main() -> None:
+        """
+        When prompted for user input, type a message and hit enter to send it to the model.
+        Enter "q" to quit the conversation.
+        """
+    
+        client = AsyncAzureOpenAI(
+            azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
+            api_key=os.environ["AZURE_OPENAI_API_KEY"],
+            api_version="2024-10-01-preview",
+        )
+        async with client.beta.realtime.connect(
+            model="gpt-4o-realtime-preview",  # deployment name of your model
+        ) as connection:
+            await connection.session.update(session={"modalities": ["text", "audio"]})  
+            while True:
+                user_input = input("Enter a message: ")
+                if user_input == "q":
+                    break
+    
+                await connection.conversation.item.create(
+                    item={
+                        "type": "message",
+                        "role": "user",
+                        "content": [{"type": "input_text", "text": user_input}],
+                    }
+                )
+                await connection.response.create()
+                async for event in connection:
+                    if event.type == "response.text.delta":
+                        print(event.delta, flush=True, end="")
+                    elif event.type == "response.audio.delta":
+                        
+                        audio_data = base64.b64decode(event.delta)
+                        print(f"Received {len(audio_data)} bytes of audio data.")
+                    elif event.type == "response.audio_transcript.delta":
+                        print(f"Received text delta: {event.delta}")
+                    elif event.type == "response.text.done":
+                        print()
+                    elif event.type == "response.done":
+                        break
+        
     asyncio.run(main())
     ```
 
@@ -203,6 +223,7 @@ For the recommended keyless authentication with Microsoft Entra ID, you need to:
     python text-in-audio-out.py
     ```
 
+1. When prompted for user input, type a message and hit enter to send it to the model. Enter "q" to quit the conversation.
 ---
 
 Wait a few moments to get the response.
@@ -211,29 +232,27 @@ Wait a few moments to get the response.
 
 The script gets a response from the model and prints the transcript and audio data received.
 
-The output will look similar to the following:
+The output looks similar to the following:
 
 ```console
-Received text delta: Hello
+Enter a message: Please assist the user
+Received text delta: Of
+Received text delta:  course
 Received text delta: !
 Received text delta:  How
 Received 4800 bytes of audio data.
 Received 7200 bytes of audio data.
-Received text delta:  can
 Received 12000 bytes of audio data.
+Received text delta:  can
 Received text delta:  I
 Received text delta:  assist
-Received text delta:  you
 Received 12000 bytes of audio data.
 Received 12000 bytes of audio data.
+Received text delta:  you
 Received text delta:  today
 Received text delta: ?
 Received 12000 bytes of audio data.
-Received 12000 bytes of audio data.
-Received 12000 bytes of audio data.
-Received 12000 bytes of audio data.
-Received 28800 bytes of audio data.
+Received 24000 bytes of audio data.
+Received 36000 bytes of audio data.
+Enter a message: q
 ```
-
-
-