@@ -62,11 +62,16 @@ For the recommended keyless authentication with Microsoft Entra ID, you need to:
62
62
> [!TIP]
63
63
> We recommend that you create and activate a new Python environment to use to install the packages you need for this tutorial. Don't install packages into your global Python installation. You should always use a virtual or conda environment when installing Python packages; otherwise, you can break your global installation of Python.
64
64
65
- 1. Install the real-time audio client library for Python with:
65
+
66
+ 1. Install the OpenAI Python client library with:
66
67
67
68
```console
68
- pip install "https://github.com/Azure-Samples/aoai-realtime-audio-sdk/releases/download/py%2Fv0.5.3/rtclient-0.5.3.tar.gz"
69
+ pip install openai
70
+ pip install openai[realtime]
69
71
```
72
+
73
+ > [!NOTE]
74
+ > This library is maintained by OpenAI. Refer to the [release history](https://github.com/openai/openai-python/releases) to track the latest updates to the library.
70
75
71
76
1. For the **recommended** keyless authentication with Microsoft Entra ID, install the `azure-identity` package with:
72
77
@@ -78,122 +83,138 @@ For the recommended keyless authentication with Microsoft Entra ID, you need to:
78
83
79
84
[!INCLUDE [resource authentication](resource-authentication.md)]
80
85
86
+ > [!CAUTION]
87
+ > To use the recommended keyless authentication with the SDK, make sure that the `AZURE_OPENAI_API_KEY` environment variable isn't set.
88
+
81
89
## Text in audio out
82
90
83
91
## [Microsoft Entra ID](#tab/keyless)
84
92
85
93
1. Create the `text-in-audio-out.py` file with the following code:
86
94
87
95
` ` ` python
96
+ import os
88
97
import base64
89
98
import asyncio
90
- from azure.identity.aio import DefaultAzureCredential
91
- from rtclient import (
92
- ResponseCreateMessage,
93
- RTLowLevelClient,
94
- ResponseCreateParams
95
- )
96
-
97
- # Set environment variables or edit the corresponding values here.
98
- endpoint = os.environ["AZURE_OPENAI_ENDPOINT"] or "https://<your-resource-name>.openai.azure.com/"
99
- deployment = os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"] or "gpt-4o-mini-realtime-preview"
100
-
101
- async def text_in_audio_out():
102
- async with RTLowLevelClient(
103
- url=endpoint,
104
- azure_deployment=deployment,
105
- token_credential=DefaultAzureCredential(),
106
- ) as client:
107
- await client.send(
108
- ResponseCreateMessage(
109
- response=ResponseCreateParams(
110
- modalities={"audio", "text"},
111
- instructions="Please assist the user."
112
- )
99
+ from openai import AsyncAzureOpenAI
100
+ from azure.identity.aio import DefaultAzureCredential, get_bearer_token_provider
101
+
102
+ async def main () -> None:
103
+ " " "
104
+ When prompted for user input, type a message and hit enter to send it to the model.
105
+ Enter " q" to quit the conversation.
106
+ " " "
107
+
108
+ credential = DefaultAzureCredential ()
109
+ token_provider=get_bearer_token_provider(credential, " https://cognitiveservices.azure.com/.default" )
110
+ client = AsyncAzureOpenAI(
111
+ azure_endpoint=os.environ[" AZURE_OPENAI_ENDPOINT" ],
112
+ azure_ad_token_provider=token_provider,
113
+ api_version=" 2024-10-01-preview" ,
114
+ )
115
+ async with client.beta.realtime.connect(
116
+ model=" gpt-4o-realtime-preview" , # name of your deployment
117
+ ) as connection:
118
+ await connection.session.update(session={" modalities" : [" text" , " audio" ]})
119
+ while True:
120
+ user_input = input(" Enter a message: " )
121
+ if user_input == " q" :
122
+ break
123
+
124
+ await connection.conversation.item.create(
125
+ item={
126
+ " type" : " message" ,
127
+ " role" : " user" ,
128
+ " content" : [{" type" : " input_text" , " text" : user_input}],
129
+ }
113
130
)
114
- )
115
- done = False
116
- while not done:
117
- message = await client.recv()
118
- match message.type:
119
- case "response.done":
120
- done = True
121
- case "error":
122
- done = True
123
- print(message.error)
124
- case "response.audio_transcript.delta":
125
- print(f"Received text delta: {message.delta}")
126
- case "response.audio.delta":
127
- buffer = base64.b64decode(message.delta)
128
- print(f"Received {len(buffer)} bytes of audio data.")
129
- case _:
130
- pass
131
-
132
- async def main():
133
- await text_in_audio_out()
131
+ await connection.response.create ()
132
+ async for event in connection:
133
+ if event.type == " response.text.delta" :
134
+ print(event.delta, flush=True, end=" " )
135
+ elif event.type == " response.audio.delta" :
136
+
137
+ audio_data = base64.b64decode(event.delta)
138
+ print(f" Received {len(audio_data)} bytes of audio data." )
139
+ elif event.type == " response.audio_transcript.delta" :
140
+ print(f" Received text delta: {event.delta}" )
141
+ elif event.type == " response.text.done" :
142
+ print ()
143
+ elif event.type == " response.done" :
144
+ break
145
+
146
+ await credential.close ()
134
147
135
148
asyncio.run(main ())
136
149
` ` `
137
150
151
+ 1. Sign in to Azure with the following command:
152
+
153
+ ` ` ` shell
154
+ az login
155
+ ` ` `
156
+
138
157
1. Run the Python file.
139
158
140
159
` ` ` shell
141
160
python text-in-audio-out.py
142
161
` ` `
143
162
163
+ 1. When prompted for user input, type a message and press Enter to send it to the model. Enter "q" to quit the conversation.
164
+
144
165
## [API key](#tab/api-key)
145
166
146
167
1. Create the `text-in-audio-out.py` file with the following code:
147
168
148
169
` ` ` python
170
+ import os
149
171
import base64
150
172
import asyncio
151
- from azure.core.credentials import AzureKeyCredential
152
- from rtclient import (
153
- ResponseCreateMessage,
154
- RTLowLevelClient,
155
- ResponseCreateParams
156
- )
157
-
158
- # Set environment variables or edit the corresponding values here.
159
- api_key = os.environ["AZURE_OPENAI_API_KEY"]
160
- endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
161
- deployment = "gpt-4o-mini-realtime-preview"
162
-
163
- async def text_in_audio_out():
164
- async with RTLowLevelClient(
165
- url=endpoint,
166
- azure_deployment=deployment,
167
- key_credential=AzureKeyCredential(api_key)
168
- ) as client:
169
- await client.send(
170
- ResponseCreateMessage(
171
- response=ResponseCreateParams(
172
- modalities={"audio", "text"},
173
- instructions="Please assist the user."
174
- )
175
- )
176
- )
177
- done = False
178
- while not done:
179
- message = await client.recv()
180
- match message.type:
181
- case "response.done":
182
- done = True
183
- case "error":
184
- done = True
185
- print(message.error)
186
- case "response.audio_transcript.delta":
187
- print(f"Received text delta: {message.delta}")
188
- case "response.audio.delta":
189
- buffer = base64.b64decode(message.delta)
190
- print(f"Received {len(buffer)} bytes of audio data.")
191
- case _:
192
- pass
193
-
194
- async def main():
195
- await text_in_audio_out()
173
+ from openai import AsyncAzureOpenAI
174
+ from azure.identity.aio import DefaultAzureCredential, get_bearer_token_provider
196
175
176
+ async def main () -> None:
177
+ " " "
178
+ When prompted for user input, type a message and hit enter to send it to the model.
179
+ Enter " q" to quit the conversation.
180
+ " " "
181
+
182
+ client = AsyncAzureOpenAI(
183
+ azure_endpoint=os.environ[" AZURE_OPENAI_ENDPOINT" ],
184
+ api_key=os.environ[" AZURE_OPENAI_API_KEY" ],
185
+ api_version=" 2024-10-01-preview" ,
186
+ )
187
+ async with client.beta.realtime.connect(
188
+ model=" gpt-4o-realtime-preview" , # deployment name of your model
189
+ ) as connection:
190
+ await connection.session.update(session={" modalities" : [" text" , " audio" ]})
191
+ while True:
192
+ user_input = input(" Enter a message: " )
193
+ if user_input == " q" :
194
+ break
195
+
196
+ await connection.conversation.item.create(
197
+ item={
198
+ " type" : " message" ,
199
+ " role" : " user" ,
200
+ " content" : [{" type" : " input_text" , " text" : user_input}],
201
+ }
202
+ )
203
+ await connection.response.create ()
204
+ async for event in connection:
205
+ if event.type == " response.text.delta" :
206
+ print(event.delta, flush=True, end=" " )
207
+ elif event.type == " response.audio.delta" :
208
+
209
+ audio_data = base64.b64decode(event.delta)
210
+ print(f" Received {len(audio_data)} bytes of audio data." )
211
+ elif event.type == " response.audio_transcript.delta" :
212
+ print(f" Received text delta: {event.delta}" )
213
+ elif event.type == " response.text.done" :
214
+ print ()
215
+ elif event.type == " response.done" :
216
+ break
217
+
197
218
asyncio.run(main ())
198
219
` ` `
199
220
@@ -203,6 +224,7 @@ For the recommended keyless authentication with Microsoft Entra ID, you need to:
203
224
python text-in-audio-out.py
204
225
` ` `
205
226
227
+ 1. When prompted for user input, type a message and press Enter to send it to the model. Enter "q" to quit the conversation.
206
228
---
207
229
208
230
Wait a few moments to get the response.
@@ -211,29 +233,27 @@ Wait a few moments to get the response.
211
233
212
234
The script gets a response from the model and prints the transcript and audio data received.
213
235
214
- The output will look similar to the following:
236
+ The output looks similar to the following:
215
237
216
238
` ` ` console
217
- Received text delta: Hello
239
+ Enter a message: Please assist the user
240
+ Received text delta: Of
241
+ Received text delta: course
218
242
Received text delta: !
219
243
Received text delta: How
220
244
Received 4800 bytes of audio data.
221
245
Received 7200 bytes of audio data.
222
- Received text delta: can
223
246
Received 12000 bytes of audio data.
247
+ Received text delta: can
224
248
Received text delta: I
225
249
Received text delta: assist
226
- Received text delta: you
227
250
Received 12000 bytes of audio data.
228
251
Received 12000 bytes of audio data.
252
+ Received text delta: you
229
253
Received text delta: today
230
254
Received text delta: ?
231
255
Received 12000 bytes of audio data.
232
- Received 12000 bytes of audio data.
233
- Received 12000 bytes of audio data.
234
- Received 12000 bytes of audio data.
235
- Received 28800 bytes of audio data.
256
+ Received 24000 bytes of audio data.
257
+ Received 36000 bytes of audio data.
258
+ Enter a message: q
236
259
` ` `
237
-
238
-
239
-
0 commit comments