@@ -29,32 +29,32 @@ For the recommended keyless authentication with Microsoft Entra ID, you need to:
2929 ``` shell
3030 mkdir voice-live-quickstart && cd voice-live-quickstart
3131 ```
32-
32+
33331. Create a virtual environment. If you already have Python 3.10 or higher installed, you can create a virtual environment using the following commands:
34-
34+
3535 # [Windows](#tab/windows)
36-
36+
3737 ` ` ` bash
3838 py -3 -m venv .venv
3939 .venv\Scripts\activate
4040 ` ` `
41-
41+
4242 # [Linux](#tab/linux)
43-
43+
4444 ` ` ` bash
4545 python3 -m venv .venv
4646 source .venv/bin/activate
4747 ` ` `
48-
48+
4949 # [macOS](#tab/macos)
50-
50+
5151 ` ` ` bash
5252 python3 -m venv .venv
5353 source .venv/bin/activate
5454 ` ` `
55-
55+
5656 ---
57-
57+
5858 Activating the Python environment means that when you run `python` or `pip` from the command line, you then use the Python interpreter contained in the `.venv` folder of your application. You can use the `deactivate` command to exit the Python virtual environment, and can later reactivate it when needed.
5959
6060 > [!TIP]
@@ -102,7 +102,7 @@ For the recommended keyless authentication with Microsoft Entra ID, you need to:
102102
103103 ```python
104104 from __future__ import annotations
105-
105+
106106 import os
107107 import uuid
108108 import json
@@ -112,7 +112,7 @@ For the recommended keyless authentication with Microsoft Entra ID, you need to:
112112 import threading
113113 import numpy as np
114114 import sounddevice as sd
115-
115+
116116 from collections import deque
117117 from dotenv import load_dotenv
118118 from azure.identity import DefaultAzureCredential
@@ -125,22 +125,22 @@ For the recommended keyless authentication with Microsoft Entra ID, you need to:
125125 from websockets.asyncio.client import HeadersLike
126126 from websockets.typing import Data
127127 from websockets.exceptions import WebSocketException
128-
128+
129129 # This is the main function to run the Voice Live API client.
130-
130+
131131 async def main() -> None:
132132 # Set environment variables or edit the corresponding values here.
133133 endpoint = os.environ.get("AZURE_VOICE_LIVE_ENDPOINT") or "https://your-endpoint.azure.com/"
134134 model = os.environ.get("VOICE_LIVE_MODEL") or "gpt-4o"
135135 api_version = os.environ.get("AZURE_VOICE_LIVE_API_VERSION") or "2025-05-01-preview"
136136 api_key = os.environ.get("AZURE_VOICE_LIVE_API_KEY") or "your_api_key"
137-
138- # For the recommended keyless authentication, get and
137+
138+ # For the recommended keyless authentication, get and
139139 # use the Microsoft Entra token instead of api_key:
140140 scopes = "https://cognitiveservices.azure.com/.default"
141141 credential = DefaultAzureCredential()
142142 token = await credential.get_token(scopes)
143-
143+
144144 client = AsyncAzureVoiceLive(
145145 azure_endpoint = endpoint,
146146 api_version = api_version,
@@ -149,7 +149,7 @@ For the recommended keyless authentication with Microsoft Entra ID, you need to:
149149 )
150150 async with client.connect(model = model) as connection:
151151 session_update = {
152- "type": "session.update",
152+ "type": "session.update",
153153 "session": {
154154 "turn_detection": {
155155 "type": "azure_semantic_vad",
@@ -179,59 +179,59 @@ For the recommended keyless authentication with Microsoft Entra ID, you need to:
179179 }
180180 await connection.send(json.dumps(session_update))
181181 print("Session created: ", json.dumps(session_update))
182-
182+
183183 send_task = asyncio.create_task(listen_and_send_audio(connection))
184184 receive_task = asyncio.create_task(receive_audio_and_playback(connection))
185185 keyboard_task = asyncio.create_task(read_keyboard_and_quit())
186-
186+
187187 print("Starting the chat ...")
188188 await asyncio.wait([send_task, receive_task, keyboard_task], return_when=asyncio.FIRST_COMPLETED)
189-
189+
190190 send_task.cancel()
191191 receive_task.cancel()
192192 print("Chat done.")
193-
193+
194194 # --- End of Main Function ---
195-
195+
196196 logger = logging.getLogger(__name__)
197197 AUDIO_SAMPLE_RATE = 24000
198-
198+
class AsyncVoiceLiveConnection:
    """Async context manager around a single WebSocket connection.

    The socket is opened in ``__aenter__`` (also exposed as ``enter``) and
    closed in ``__aexit__`` (also exposed as ``close``). While open, the
    object can be iterated with ``async for`` to receive messages, or driven
    explicitly through ``recv`` / ``recv_bytes`` / ``send``.
    """

    # Underlying socket; None until the connection is opened and again after
    # it has been closed.
    _connection: AsyncWebsocket | None

    def __init__(self, url: str, additional_headers: HeadersLike) -> None:
        """Remember the target URL and headers; no network I/O happens here.

        Args:
            url: WebSocket URL to connect to.
            additional_headers: Extra headers passed to the WebSocket handshake.
        """
        self._url = url
        self._additional_headers = additional_headers
        self._connection = None

    async def __aenter__(self) -> AsyncVoiceLiveConnection:
        """Open the WebSocket and return this object.

        Raises:
            ValueError: If the WebSocket library reports a failure while
                connecting. The original exception is kept as ``__cause__``.
        """
        try:
            self._connection = await ws_connect(self._url, additional_headers=self._additional_headers)
        except WebSocketException as e:
            # Chain the cause so the underlying handshake error stays visible
            # in the traceback instead of being flattened into a string.
            raise ValueError(f"Failed to establish a WebSocket connection: {e}") from e
        return self

    async def __aexit__(self, exc_type=None, exc_value=None, traceback=None) -> None:
        """Close the WebSocket if it is open; safe to call more than once.

        The exception arguments are accepted for the context-manager protocol
        but are not inspected. They default to ``None`` so that the ``close``
        alias below can be called without arguments.
        """
        if self._connection:
            await self._connection.close()
            self._connection = None

    # Explicit (non-``async with``) aliases for opening and closing.
    enter = __aenter__
    close = __aexit__

    async def __aiter__(self) -> AsyncIterator[Data]:
        """Yield each message produced by iterating the underlying socket."""
        async for data in self._connection:
            yield data

    async def recv(self) -> Data:
        """Receive and return the next message from the socket."""
        return await self._connection.recv()

    async def recv_bytes(self) -> bytes:
        """Receive the next message, which the caller expects to be binary.

        NOTE(review): the payload type is not validated here; this returns
        whatever the underlying ``recv()`` returns.
        """
        return await self._connection.recv()

    async def send(self, message: Data) -> None:
        """Send one message over the socket."""
        await self._connection.send(message)
234-
234+
235235 class AsyncAzureVoiceLive:
236236 def __init__(
237237 self,
@@ -241,32 +241,32 @@ For the recommended keyless authentication with Microsoft Entra ID, you need to:
241241 token: str | None = None,
242242 api_key: str | None = None,
243243 ) -> None:
244-
244+
245245 self._azure_endpoint = azure_endpoint
246246 self._api_version = api_version
247247 self._token = token
248248 self._api_key = api_key
249249 self._connection = None
250-
250+
251251 def connect(self, model: str) -> AsyncVoiceLiveConnection:
252252 if self._connection is not None:
253253 raise ValueError("Already connected to the Voice Live API.")
254254 if not model:
255255 raise ValueError("Model name is required.")
256-
257- url = f"{self._azure_endpoint.rstrip(' /' )}/voice-agent /realtime?api-version={self._api_version}&model={model}"
256+
257+ url = f"{self._azure_endpoint.rstrip(' /' )}/voice-live /realtime?api-version={self._api_version}&model={model}"
258258 url = url.replace("https://", "wss://")
259-
259+
260260 auth_header = {"Authorization": f"Bearer {self._token}"} if self._token else {"api-key": self._api_key}
261261 request_id = uuid.uuid4()
262262 headers = {"x-ms-client-request-id": str(request_id), **auth_header}
263-
263+
264264 self._connection = AsyncVoiceLiveConnection(
265265 url,
266266 additional_headers=headers,
267267 )
268268 return self._connection
269-
269+
270270 class AudioPlayerAsync:
271271 def __init__(self):
272272 self.queue = deque()
@@ -279,7 +279,7 @@ For the recommended keyless authentication with Microsoft Entra ID, you need to:
279279 blocksize=2400,
280280 )
281281 self.playing = False
282-
282+
283283 def callback(self, outdata, frames, time, status):
284284 if status:
285285 logger.warning(f"Stream status: {status}")
@@ -294,34 +294,34 @@ For the recommended keyless authentication with Microsoft Entra ID, you need to:
294294 if len(data) < frames:
295295 data = np.concatenate((data, np.zeros(frames - len(data), dtype=np.int16)))
296296 outdata[:] = data.reshape(-1, 1)
297-
297+
298298 def add_data(self, data: bytes):
299299 with self.lock:
300300 np_data = np.frombuffer(data, dtype=np.int16)
301301 self.queue.append(np_data)
302302 if not self.playing and len(self.queue) > 10:
303303 self.start()
304-
304+
305305 def start(self):
306306 if not self.playing:
307307 self.playing = True
308308 self.stream.start()
309-
309+
310310 def stop(self):
311311 with self.lock:
312312 self.queue.clear()
313313 self.playing = False
314314 self.stream.stop()
315-
315+
316316 def terminate(self):
317317 with self.lock:
318- self.queue.clear()
318+ self.queue.clear()
319319 self.stream.stop()
320320 self.stream.close()
321-
321+
322322 async def listen_and_send_audio(connection: AsyncVoiceLiveConnection) -> None:
323323 logger.info("Starting audio stream ...")
324-
324+
325325 stream = sd.InputStream(channels=1, samplerate=AUDIO_SAMPLE_RATE, dtype="int16")
326326 try:
327327 stream.start()
@@ -339,56 +339,56 @@ For the recommended keyless authentication with Microsoft Entra ID, you need to:
339339 stream.stop()
340340 stream.close()
341341 logger.info("Audio stream closed.")
342-
342+
async def receive_audio_and_playback(connection: AsyncVoiceLiveConnection) -> None:
    """Consume server events from ``connection`` and play received audio.

    Each message is parsed as JSON and dispatched on its ``"type"`` field:

    * ``session.created``: the session id is logged.
    * ``response.audio.delta``: the base64 ``delta`` payload is decoded and
      handed to the audio player.
    * ``error``: the error details are turned into a ``ValueError``, which
      is logged by the handler below and ends playback.

    The function returns once iteration over the connection finishes or an
    error occurs. The audio player is always terminated on the way out.

    Args:
        connection: An open Voice Live connection to read events from.
    """
    last_audio_item_id = None
    audio_player = AudioPlayerAsync()

    logger.info("Starting audio playback ...")
    try:
        # Iterate the connection exactly once. Wrapping this in an outer
        # ``while True`` would restart iteration on a connection that has
        # already finished, spinning instead of letting the task complete.
        async for raw_event in connection:
            event = json.loads(raw_event)
            event_type = event.get("type")
            # The set literal is intentional: it keeps the console output in
            # the form ``Received event: {'<type>'}``.
            print("Received event:", {event_type})

            if event_type == "session.created":
                # Tolerate a missing/null session payload instead of failing
                # on ``None.get``.
                session = event.get("session") or {}
                # Single quotes inside the f-string: reusing the outer quote
                # character is a SyntaxError before Python 3.12.
                logger.info(f"Session created: {session.get('id')}")

            elif event_type == "response.audio.delta":
                # Remember which response item the audio currently belongs to.
                if event.get("item_id") != last_audio_item_id:
                    last_audio_item_id = event.get("item_id")

                bytes_data = base64.b64decode(event.get("delta", ""))
                audio_player.add_data(bytes_data)

            elif event_type == "error":
                error_details = event.get("error", {})
                error_type = error_details.get("type", "Unknown")
                error_code = error_details.get("code", "Unknown")
                error_message = error_details.get("message", "No message provided")
                raise ValueError(f"Error received: Type={error_type}, Code={error_code}, Message={error_message}")

    except Exception as e:
        logger.error(f"Error in audio playback: {e}")
    finally:
        audio_player.terminate()
        logger.info("Playback done.")
377-
377+
async def read_keyboard_and_quit() -> None:
    """Wait until the user types ``q`` (any case) followed by Enter.

    Lines are read with ``input()`` on a worker thread so the event loop
    stays responsive. Surrounding whitespace is ignored, and any other input
    is discarded. The function also returns if standard input reaches
    end-of-file, since no quit command can arrive after that.
    """
    print("Press 'q' and Enter to quit the chat.")
    while True:
        try:
            # Run input() in a thread to avoid blocking the event loop
            user_input = await asyncio.to_thread(input)
        except EOFError:
            # stdin was closed; stop waiting instead of failing the task.
            break
        # Compare against a bare "q": the input has already been stripped,
        # so a padded literal could never match.
        if user_input.strip().lower() == "q":
            print("Quitting the chat...")
            break
386-
386+
387387 if __name__ == "__main__":
388388 try:
389389 logging.basicConfig(
390- filename=' voicelive.log' ,
391- filemode="w",
390+ filename=' voicelive.log' ,
391+ filemode="w",
392392 level=logging.DEBUG,
393393 format=' %(asctime)s:%(name)s:%(levelname)s:%(message)s'
394394 )
@@ -453,12 +453,12 @@ Received event: {'response.audio.delta'}
453453Chat done.
454454` ` `
455455
456- The script that you ran creates a log file named ` voicelive.log` in the same directory as the script.
456+ The script that you ran creates a log file named ` voicelive.log` in the same directory as the script.
457457
458458` ` ` python
459459logging.basicConfig(
460- filename=' voicelive.log' ,
461- filemode=" w" ,
460+ filename=' voicelive.log' ,
461+ filemode=" w" ,
462462 level=logging.DEBUG,
463463 format=' %(asctime)s:%(name)s:%(levelname)s:%(message)s'
464464)
@@ -468,7 +468,7 @@ The log file contains information about the connection to the Voice Live API, in
468468
469469` ` ` text
4704702025-05-09 06:56:06,821:websockets.client:DEBUG:= connection is CONNECTING
471- 2025-05-09 06:56:07,101:websockets.client:DEBUG:> GET /voice-agent /realtime? api-version=2025-05-01-preview& model=gpt-4o HTTP/1.1
471+ 2025-05-09 06:56:07,101:websockets.client:DEBUG:> GET /voice-live /realtime? api-version=2025-05-01-preview& model=gpt-4o HTTP/1.1
472472< REDACTED FOR BREVITY>
4734732025-05-09 06:56:07,551:websockets.client:DEBUG:= connection is OPEN
4744742025-05-09 06:56:07,551:websockets.client:DEBUG:< TEXT ' {"event_id":"event_5a7NVdtNBVX9JZVuPc9nYK","typ...es":null,"agent":null}}' [1475 bytes]
0 commit comments