@@ -62,11 +62,15 @@ For the recommended keyless authentication with Microsoft Entra ID, you need to:
62
62
> [!TIP]
63
63
> We recommend that you create and activate a new Python environment to use to install the packages you need for this tutorial. Don't install packages into your global Python installation. You should always use a virtual or conda environment when installing Python packages; otherwise, you can break your global installation of Python.
64
64
65
- 1. Install the real-time audio client library for Python with:
65
+
66
+ 1. Install the OpenAI Python client library with:
66
67
67
68
```console
68
- pip install "https://github.com/Azure-Samples/aoai-realtime-audio-sdk/releases/download/py%2Fv0.5.3/rtclient-0.5.3.tar.gz"
69
+ pip install openai[realtime]
69
70
```
71
+
72
+ > [!NOTE]
73
+ > This library is maintained by OpenAI. Refer to the [release history](https://github.com/openai/openai-python/releases) to track the latest updates to the library.
70
74
71
75
1. For the **recommended** keyless authentication with Microsoft Entra ID, install the `azure-identity` package with:
72
76
@@ -78,122 +82,138 @@ For the recommended keyless authentication with Microsoft Entra ID, you need to:
78
82
79
83
[!INCLUDE [resource authentication](resource-authentication.md)]
80
84
85
+ > [!CAUTION]
86
+ > To use the recommended keyless authentication with the SDK, make sure that the `AZURE_OPENAI_API_KEY` environment variable isn't set.
87
+
81
88
## Text in audio out
82
89
83
90
## [Microsoft Entra ID](#tab/keyless)
84
91
85
92
1. Create the `text-in-audio-out.py` file with the following code:
86
93
87
94
` ` ` python
95
+ import os
88
96
import base64
89
97
import asyncio
90
- from azure.identity.aio import DefaultAzureCredential
91
- from rtclient import (
92
- ResponseCreateMessage,
93
- RTLowLevelClient,
94
- ResponseCreateParams
95
- )
96
-
97
- # Set environment variables or edit the corresponding values here.
98
- endpoint = os.environ["AZURE_OPENAI_ENDPOINT"] or "https://<your-resource-name>.openai.azure.com/"
99
- deployment = os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"] or "gpt-4o-mini-realtime-preview"
100
-
101
- async def text_in_audio_out():
102
- async with RTLowLevelClient(
103
- url=endpoint,
104
- azure_deployment=deployment,
105
- token_credential=DefaultAzureCredential(),
106
- ) as client:
107
- await client.send(
108
- ResponseCreateMessage(
109
- response=ResponseCreateParams(
110
- modalities={"audio", "text"},
111
- instructions="Please assist the user."
112
- )
98
+ from openai import AsyncAzureOpenAI
99
+ from azure.identity.aio import DefaultAzureCredential, get_bearer_token_provider
100
+
101
+ async def main () -> None:
102
+ " " "
103
+ When prompted for user input, type a message and hit enter to send it to the model.
104
+ Enter "q" to quit the conversation.
105
+ " " "
106
+
107
+ credential = DefaultAzureCredential ()
108
+ token_provider=get_bearer_token_provider(credential, "https://cognitiveservices.azure.com/.default")
109
+ client = AsyncAzureOpenAI(
110
+ azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
111
+ azure_ad_token_provider=token_provider,
112
+ api_version=" 2024-10-01-preview" ,
113
+ )
114
+ async with client.beta.realtime.connect(
115
+ model=" gpt-4o-realtime-preview" , # name of your deployment
116
+ ) as connection:
117
+ await connection.session.update(session={" modalities" : [" text" , " audio" ]})
118
+ while True:
119
+ user_input = input(" Enter a message: " )
120
+ if user_input == "q":
121
+ break
122
+
123
+ await connection.conversation.item.create(
124
+ item={
125
+ " type" : " message" ,
126
+ " role" : " user" ,
127
+ " content" : [{" type" : " input_text" , " text" : user_input}],
128
+ }
113
129
)
114
- )
115
- done = False
116
- while not done:
117
- message = await client.recv()
118
- match message.type:
119
- case "response.done":
120
- done = True
121
- case "error":
122
- done = True
123
- print(message.error)
124
- case "response.audio_transcript.delta":
125
- print(f"Received text delta: {message.delta}")
126
- case "response.audio.delta":
127
- buffer = base64.b64decode(message.delta)
128
- print(f"Received {len(buffer)} bytes of audio data.")
129
- case _:
130
- pass
131
-
132
- async def main():
133
- await text_in_audio_out()
130
+ await connection.response.create ()
131
+ async for event in connection:
132
+ if event.type == " response.text.delta" :
133
+ print(event.delta, flush=True, end=" " )
134
+ elif event.type == " response.audio.delta" :
135
+
136
+ audio_data = base64.b64decode(event.delta)
137
+ print(f" Received {len(audio_data)} bytes of audio data." )
138
+ elif event.type == " response.audio_transcript.delta" :
139
+ print(f" Received text delta: {event.delta}" )
140
+ elif event.type == " response.text.done" :
141
+ print ()
142
+ elif event.type == " response.done" :
143
+ break
144
+
145
+ await credential.close ()
134
146
135
147
asyncio.run(main ())
136
148
` ` `
137
149
150
+ 1. Sign in to Azure with the following command:
151
+
152
+ ```shell
153
+ az login
154
+ ```
155
+
138
156
1. Run the Python file.
139
157
140
158
```shell
141
159
python text-in-audio-out.py
142
160
```
143
161
162
+ 1. When prompted for user input, type a message and press Enter to send it to the model. Enter "q" to quit the conversation.
163
+
144
164
## [API key](#tab/api-key)
145
165
146
166
1. Create the `text-in-audio-out.py` file with the following code:
147
167
148
168
` ` ` python
169
+ import os
149
170
import base64
150
171
import asyncio
151
- from azure.core.credentials import AzureKeyCredential
152
- from rtclient import (
153
- ResponseCreateMessage,
154
- RTLowLevelClient,
155
- ResponseCreateParams
156
- )
157
-
158
- # Set environment variables or edit the corresponding values here.
159
- api_key = os.environ["AZURE_OPENAI_API_KEY"]
160
- endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
161
- deployment = "gpt-4o-mini-realtime-preview"
162
-
163
- async def text_in_audio_out():
164
- async with RTLowLevelClient(
165
- url=endpoint,
166
- azure_deployment=deployment,
167
- key_credential=AzureKeyCredential(api_key)
168
- ) as client:
169
- await client.send(
170
- ResponseCreateMessage(
171
- response=ResponseCreateParams(
172
- modalities={"audio", "text"},
173
- instructions="Please assist the user."
174
- )
175
- )
176
- )
177
- done = False
178
- while not done:
179
- message = await client.recv()
180
- match message.type:
181
- case "response.done":
182
- done = True
183
- case "error":
184
- done = True
185
- print(message.error)
186
- case "response.audio_transcript.delta":
187
- print(f"Received text delta: {message.delta}")
188
- case "response.audio.delta":
189
- buffer = base64.b64decode(message.delta)
190
- print(f"Received {len(buffer)} bytes of audio data.")
191
- case _:
192
- pass
193
-
194
- async def main():
195
- await text_in_audio_out()
172
+ from openai import AsyncAzureOpenAI
173
+ from azure.identity.aio import DefaultAzureCredential, get_bearer_token_provider
196
174
175
+ async def main () -> None:
176
+ " " "
177
+ When prompted for user input, type a message and hit enter to send it to the model.
178
+ Enter "q" to quit the conversation.
179
+ " " "
180
+
181
+ client = AsyncAzureOpenAI(
182
+ azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
183
+ api_key=os.environ["AZURE_OPENAI_API_KEY"],
184
+ api_version=" 2024-10-01-preview" ,
185
+ )
186
+ async with client.beta.realtime.connect(
187
+ model=" gpt-4o-realtime-preview" , # deployment name of your model
188
+ ) as connection:
189
+ await connection.session.update(session={" modalities" : [" text" , " audio" ]})
190
+ while True:
191
+ user_input = input(" Enter a message: " )
192
+ if user_input == "q":
193
+ break
194
+
195
+ await connection.conversation.item.create(
196
+ item={
197
+ " type" : " message" ,
198
+ " role" : " user" ,
199
+ " content" : [{" type" : " input_text" , " text" : user_input}],
200
+ }
201
+ )
202
+ await connection.response.create ()
203
+ async for event in connection:
204
+ if event.type == " response.text.delta" :
205
+ print(event.delta, flush=True, end=" " )
206
+ elif event.type == " response.audio.delta" :
207
+
208
+ audio_data = base64.b64decode(event.delta)
209
+ print(f" Received {len(audio_data)} bytes of audio data." )
210
+ elif event.type == " response.audio_transcript.delta" :
211
+ print(f" Received text delta: {event.delta}" )
212
+ elif event.type == " response.text.done" :
213
+ print ()
214
+ elif event.type == " response.done" :
215
+ break
216
+
197
217
asyncio.run(main ())
198
218
` ` `
199
219
@@ -203,6 +223,7 @@ For the recommended keyless authentication with Microsoft Entra ID, you need to:
203
223
python text-in-audio-out.py
204
224
` ` `
205
225
226
+ 1. When prompted for user input, type a message and press Enter to send it to the model. Enter "q" to quit the conversation.
206
227
---
207
228
208
229
Wait a few moments to get the response.
@@ -211,29 +232,27 @@ Wait a few moments to get the response.
211
232
212
233
The script gets a response from the model and prints the transcript and audio data received.
213
234
214
- The output will look similar to the following:
235
+ The output looks similar to the following:
215
236
216
237
` ` ` console
217
- Received text delta: Hello
238
+ Enter a message: Please assist the user
239
+ Received text delta: Of
240
+ Received text delta: course
218
241
Received text delta: !
219
242
Received text delta: How
220
243
Received 4800 bytes of audio data.
221
244
Received 7200 bytes of audio data.
222
- Received text delta: can
223
245
Received 12000 bytes of audio data.
246
+ Received text delta: can
224
247
Received text delta: I
225
248
Received text delta: assist
226
- Received text delta: you
227
249
Received 12000 bytes of audio data.
228
250
Received 12000 bytes of audio data.
251
+ Received text delta: you
229
252
Received text delta: today
230
253
Received text delta: ?
231
254
Received 12000 bytes of audio data.
232
- Received 12000 bytes of audio data.
233
- Received 12000 bytes of audio data.
234
- Received 12000 bytes of audio data.
235
- Received 28800 bytes of audio data.
255
+ Received 24000 bytes of audio data.
256
+ Received 36000 bytes of audio data.
257
+ Enter a message: q
236
258
` ` `
237
-
238
-
239
-
0 commit comments