@@ -29,32 +29,32 @@ For the recommended keyless authentication with Microsoft Entra ID, you need to:
29
29
``` shell
30
30
mkdir voice-live-quickstart && cd voice-live-quickstart
31
31
```
32
-
32
+
33
33
1. Create a virtual environment. If you already have Python 3.10 or higher installed, you can create a virtual environment using the following commands:
34
-
34
+
35
35
# [Windows](#tab/windows)
36
-
36
+
37
37
```bash
38
38
py -3 -m venv .venv
39
39
.venv\Scripts\activate
40
40
```
41
-
41
+
42
42
# [Linux](#tab/linux)
43
-
43
+
44
44
```bash
45
45
python3 -m venv .venv
46
46
source .venv/bin/activate
47
47
```
48
-
48
+
49
49
# [macOS](#tab/macos)
50
-
50
+
51
51
```bash
52
52
python3 -m venv .venv
53
53
source .venv/bin/activate
54
54
```
55
-
55
+
56
56
---
57
-
57
+
58
58
Activating the Python environment means that when you run `python` or `pip` from the command line, you use the Python interpreter contained in the `.venv` folder of your application. You can use the `deactivate` command to exit the Python virtual environment, and you can reactivate it later when needed.
59
59
60
60
> [!TIP]
@@ -102,7 +102,7 @@ For the recommended keyless authentication with Microsoft Entra ID, you need to:
102
102
103
103
```python
104
104
from __future__ import annotations
105
-
105
+
106
106
import os
107
107
import uuid
108
108
import json
@@ -112,7 +112,7 @@ For the recommended keyless authentication with Microsoft Entra ID, you need to:
112
112
import threading
113
113
import numpy as np
114
114
import sounddevice as sd
115
-
115
+
116
116
from collections import deque
117
117
from dotenv import load_dotenv
118
118
from azure.identity import DefaultAzureCredential
@@ -125,22 +125,22 @@ For the recommended keyless authentication with Microsoft Entra ID, you need to:
125
125
from websockets.asyncio.client import HeadersLike
126
126
from websockets.typing import Data
127
127
from websockets.exceptions import WebSocketException
128
-
128
+
129
129
# This is the main function to run the Voice Live API client.
130
-
130
+
131
131
async def main() -> None:
132
132
# Set environment variables or edit the corresponding values here.
133
133
endpoint = os.environ.get("AZURE_VOICE_LIVE_ENDPOINT") or "https://your-endpoint.azure.com/"
134
134
model = os.environ.get("VOICE_LIVE_MODEL") or "gpt-4o"
135
135
api_version = os.environ.get("AZURE_VOICE_LIVE_API_VERSION") or "2025-05-01-preview"
136
136
api_key = os.environ.get("AZURE_VOICE_LIVE_API_KEY") or "your_api_key"
137
-
138
- # For the recommended keyless authentication, get and
137
+
138
+ # For the recommended keyless authentication, get and
139
139
# use the Microsoft Entra token instead of api_key:
140
140
scopes = "https://cognitiveservices.azure.com/.default"
141
141
credential = DefaultAzureCredential()
142
142
token = await credential.get_token(scopes)
143
-
143
+
144
144
client = AsyncAzureVoiceLive(
145
145
azure_endpoint = endpoint,
146
146
api_version = api_version,
@@ -149,7 +149,7 @@ For the recommended keyless authentication with Microsoft Entra ID, you need to:
149
149
)
150
150
async with client.connect(model = model) as connection:
151
151
session_update = {
152
- "type": "session.update",
152
+ "type": "session.update",
153
153
"session": {
154
154
"turn_detection": {
155
155
"type": "azure_semantic_vad",
@@ -179,59 +179,59 @@ For the recommended keyless authentication with Microsoft Entra ID, you need to:
179
179
}
180
180
await connection.send(json.dumps(session_update))
181
181
print("Session created: ", json.dumps(session_update))
182
-
182
+
183
183
send_task = asyncio.create_task(listen_and_send_audio(connection))
184
184
receive_task = asyncio.create_task(receive_audio_and_playback(connection))
185
185
keyboard_task = asyncio.create_task(read_keyboard_and_quit())
186
-
186
+
187
187
print("Starting the chat ...")
188
188
await asyncio.wait([send_task, receive_task, keyboard_task], return_when=asyncio.FIRST_COMPLETED)
189
-
189
+
190
190
send_task.cancel()
191
191
receive_task.cancel()
192
192
print("Chat done.")
193
-
193
+
194
194
# --- End of Main Function ---
195
-
195
+
196
196
logger = logging.getLogger(__name__)
197
197
AUDIO_SAMPLE_RATE = 24000
198
-
198
+
199
199
class AsyncVoiceLiveConnection:
    """Async context manager around a single WebSocket connection.

    The socket is opened lazily in ``__aenter__`` (or its alias ``enter``)
    and closed in ``__aexit__`` (or its alias ``close``). Until the
    connection has been entered, ``_connection`` is ``None`` and the
    send/receive helpers must not be used.
    """

    # Underlying WebSocket; None while the connection is not open.
    _connection: AsyncWebsocket

    def __init__(self, url: str, additional_headers: HeadersLike) -> None:
        """Store the target URL and handshake headers; no I/O happens here."""
        self._url = url
        self._additional_headers = additional_headers
        self._connection = None

    async def __aenter__(self) -> AsyncVoiceLiveConnection:
        """Open the WebSocket and return this wrapper.

        Raises:
            ValueError: if the WebSocket handshake fails. The original
                ``WebSocketException`` is attached as ``__cause__``.
        """
        try:
            self._connection = await ws_connect(self._url, additional_headers=self._additional_headers)
        except WebSocketException as e:
            # Chain the cause so the underlying handshake error stays visible.
            raise ValueError(f"Failed to establish a WebSocket connection: {e}") from e
        return self

    async def __aexit__(self, exc_type=None, exc_value=None, traceback=None) -> None:
        """Close the WebSocket if it is open; safe to call more than once.

        The exception arguments default to ``None`` so that the ``close``
        alias below can be awaited without arguments.
        """
        if self._connection:
            await self._connection.close()
            self._connection = None

    # Explicit (non-``async with``) spellings of the context-manager protocol.
    enter = __aenter__
    close = __aexit__

    async def __aiter__(self) -> AsyncIterator[Data]:
        """Yield each message produced by iterating the underlying WebSocket."""
        async for data in self._connection:
            yield data

    async def recv(self) -> Data:
        """Receive and return the next message from the WebSocket."""
        return await self._connection.recv()

    async def recv_bytes(self) -> bytes:
        """Receive the next message; no conversion is applied to the result."""
        return await self._connection.recv()

    async def send(self, message: Data) -> None:
        """Send one message over the WebSocket."""
        await self._connection.send(message)
234
-
234
+
235
235
class AsyncAzureVoiceLive:
236
236
def __init__(
237
237
self,
@@ -241,32 +241,32 @@ For the recommended keyless authentication with Microsoft Entra ID, you need to:
241
241
token: str | None = None,
242
242
api_key: str | None = None,
243
243
) -> None:
244
-
244
+
245
245
self._azure_endpoint = azure_endpoint
246
246
self._api_version = api_version
247
247
self._token = token
248
248
self._api_key = api_key
249
249
self._connection = None
250
-
250
+
251
251
def connect(self, model: str) -> AsyncVoiceLiveConnection:
    """Build the Voice Live WebSocket URL and return an unopened connection.

    The returned ``AsyncVoiceLiveConnection`` is also stored on the client,
    so a second call raises. No network I/O happens here.

    Args:
        model: Model name sent as the ``model`` query parameter.

    Raises:
        ValueError: if a connection was already created, if ``model`` is
            empty, or if neither a token nor an API key was supplied.
    """
    if self._connection is not None:
        raise ValueError("Already connected to the Voice Live API.")
    if not model:
        raise ValueError("Model name is required.")
    if not self._token and not self._api_key:
        # Without this check the handshake would carry an "api-key: None" header.
        raise ValueError("Either a token or an API key is required.")

    # Tolerate trailing slashes/whitespace on the configured endpoint.
    base_url = self._azure_endpoint.rstrip(" /")
    url = f"{base_url}/voice-live/realtime?api-version={self._api_version}&model={model}"
    # The service is reached over WebSocket, so swap the HTTPS scheme.
    url = url.replace("https://", "wss://")

    # Prefer the bearer token when present; otherwise fall back to the API key.
    auth_header = {"Authorization": f"Bearer {self._token}"} if self._token else {"api-key": self._api_key}
    request_id = uuid.uuid4()
    headers = {"x-ms-client-request-id": str(request_id), **auth_header}

    self._connection = AsyncVoiceLiveConnection(
        url,
        additional_headers=headers,
    )
    return self._connection
269
-
269
+
270
270
class AudioPlayerAsync:
271
271
def __init__(self):
272
272
self.queue = deque()
@@ -279,7 +279,7 @@ For the recommended keyless authentication with Microsoft Entra ID, you need to:
279
279
blocksize=2400,
280
280
)
281
281
self.playing = False
282
-
282
+
283
283
def callback(self, outdata, frames, time, status):
284
284
if status:
285
285
logger.warning(f"Stream status: {status}")
@@ -294,34 +294,34 @@ For the recommended keyless authentication with Microsoft Entra ID, you need to:
294
294
if len(data) < frames:
295
295
data = np.concatenate((data, np.zeros(frames - len(data), dtype=np.int16)))
296
296
outdata[:] = data.reshape(-1, 1)
297
-
297
+
298
298
def add_data(self, data: bytes):
299
299
with self.lock:
300
300
np_data = np.frombuffer(data, dtype=np.int16)
301
301
self.queue.append(np_data)
302
302
if not self.playing and len(self.queue) > 10:
303
303
self.start()
304
-
304
+
305
305
def start(self):
306
306
if not self.playing:
307
307
self.playing = True
308
308
self.stream.start()
309
-
309
+
310
310
def stop(self):
311
311
with self.lock:
312
312
self.queue.clear()
313
313
self.playing = False
314
314
self.stream.stop()
315
-
315
+
316
316
def terminate(self):
317
317
with self.lock:
318
- self.queue.clear()
318
+ self.queue.clear()
319
319
self.stream.stop()
320
320
self.stream.close()
321
-
321
+
322
322
async def listen_and_send_audio(connection: AsyncVoiceLiveConnection) -> None:
323
323
logger.info("Starting audio stream ...")
324
-
324
+
325
325
stream = sd.InputStream(channels=1, samplerate=AUDIO_SAMPLE_RATE, dtype="int16")
326
326
try:
327
327
stream.start()
@@ -339,56 +339,56 @@ For the recommended keyless authentication with Microsoft Entra ID, you need to:
339
339
stream.stop()
340
340
stream.close()
341
341
logger.info("Audio stream closed.")
342
-
342
+
343
343
async def receive_audio_and_playback(connection: AsyncVoiceLiveConnection) -> None:
    """Consume server events from ``connection`` and play received audio.

    Each event is parsed as JSON and its type is printed. Audio deltas are
    base64-decoded and handed to an ``AudioPlayerAsync``. An ``error`` event
    is turned into a ``ValueError``, which is logged by the handler below
    and ends the loop. The player is always terminated on exit.
    """
    last_audio_item_id = None
    audio_player = AudioPlayerAsync()

    logger.info("Starting audio playback ...")
    try:
        # Iterate exactly once: when the iterator finishes, the connection is
        # closed, so re-entering the loop could only spin without yielding
        # control back to the event loop.
        async for raw_event in connection:
            event = json.loads(raw_event)
            event_type = event.get("type")
            # The set literal is intentional: it keeps the console output in
            # the form shown in the quickstart ("Received event: {'...'}").
            print("Received event:", {event_type})

            if event_type == "session.created":
                session = event.get("session") or {}
                # %-style args avoid nesting double quotes inside an f-string,
                # which is a SyntaxError before Python 3.12.
                logger.info("Session created: %s", session.get("id"))

            elif event_type == "response.audio.delta":
                if event.get("item_id") != last_audio_item_id:
                    last_audio_item_id = event.get("item_id")

                bytes_data = base64.b64decode(event.get("delta", ""))
                audio_player.add_data(bytes_data)

            elif event_type == "error":
                error_details = event.get("error", {})
                error_type = error_details.get("type", "Unknown")
                error_code = error_details.get("code", "Unknown")
                error_message = error_details.get("message", "No message provided")
                raise ValueError(f"Error received: Type={error_type}, Code={error_code}, Message={error_message}")

    except Exception as e:
        logger.error(f"Error in audio playback: {e}")
    finally:
        audio_player.terminate()
        logger.info("Playback done.")
377
-
377
+
378
378
async def read_keyboard_and_quit() -> None:
    """Block (asynchronously) until the user types ``q`` and presses Enter.

    Input is compared case-insensitively after stripping surrounding
    whitespace. A closed stdin (EOF) is treated the same as a quit request.
    """
    print("Press 'q' and Enter to quit the chat.")
    while True:
        try:
            # Run input() in a thread to avoid blocking the event loop
            user_input = await asyncio.to_thread(input)
        except EOFError:
            # stdin was closed; nothing more can be read, so stop waiting.
            break
        if user_input.strip().lower() == "q":
            print("Quitting the chat...")
            break
386
-
386
+
387
387
if __name__ == "__main__":
388
388
try:
389
389
logging.basicConfig(
390
- filename=' voicelive.log' ,
391
- filemode="w",
390
+ filename=' voicelive.log' ,
391
+ filemode="w",
392
392
level=logging.DEBUG,
393
393
format=' %(asctime)s:%(name)s:%(levelname)s:%(message)s'
394
394
)
@@ -453,12 +453,12 @@ Received event: {'response.audio.delta'}
453
453
Chat done.
454
454
```
455
455
456
- The script that you ran creates a log file named `voicelive.log` in the same directory as the script.
456
+ The script that you ran creates a log file named `voicelive.log` in the same directory as the script.
457
457
458
458
```python
459
459
# Write DEBUG-and-above records to voicelive.log, truncating the file on
# each run ("w" mode), with timestamp, logger name, and level prefixed.
logging.basicConfig(
    filename='voicelive.log',
    filemode="w",
    level=logging.DEBUG,
    format='%(asctime)s:%(name)s:%(levelname)s:%(message)s'
)
@@ -468,7 +468,7 @@ The log file contains information about the connection to the Voice Live API, in
468
468
469
469
```text
470
470
2025-05-09 06:56:06,821:websockets.client:DEBUG:= connection is CONNECTING
471
- 2025-05-09 06:56:07,101:websockets.client:DEBUG:> GET /voice-agent/realtime?api-version=2025-05-01-preview&model=gpt-4o HTTP/1.1
471
+ 2025-05-09 06:56:07,101:websockets.client:DEBUG:> GET /voice-live/realtime?api-version=2025-05-01-preview&model=gpt-4o HTTP/1.1
472
472
< REDACTED FOR BREVITY>
473
473
2025-05-09 06:56:07,551:websockets.client:DEBUG:= connection is OPEN
474
474
2025-05-09 06:56:07,551:websockets.client:DEBUG:< TEXT ' {"event_id":"event_5a7NVdtNBVX9JZVuPc9nYK","typ...es":null,"agent":null}}' [1475 bytes]
0 commit comments