import logging
import mimetypes
import os
+import time
from pathlib import Path
from typing import Any, AsyncGenerator, Dict, Union, cast

+from azure.cognitiveservices.speech import (
+    ResultReason,
+    SpeechConfig,
+    SpeechSynthesisOutputFormat,
+    SpeechSynthesisResult,
+    SpeechSynthesizer,
+)
from azure.core.exceptions import ResourceNotFoundError
from azure.identity.aio import DefaultAzureCredential, get_bearer_token_provider
from azure.monitor.opentelemetry import configure_azure_monitor

...
    CONFIG_BLOB_CONTAINER_CLIENT,
    CONFIG_CHAT_APPROACH,
    CONFIG_CHAT_VISION_APPROACH,
+    CONFIG_CREDENTIAL,
    CONFIG_GPT4V_DEPLOYED,
    CONFIG_INGESTER,
    CONFIG_OPENAI_CLIENT,
    CONFIG_SEARCH_CLIENT,
    CONFIG_SEMANTIC_RANKER_DEPLOYED,
+    CONFIG_SPEECH_INPUT_ENABLED,
+    CONFIG_SPEECH_OUTPUT_ENABLED,
+    CONFIG_SPEECH_SERVICE_ID,
+    CONFIG_SPEECH_SERVICE_LOCATION,
+    CONFIG_SPEECH_SERVICE_TOKEN,
+    CONFIG_SPEECH_SERVICE_VOICE,
    CONFIG_USER_BLOB_CONTAINER_CLIENT,
    CONFIG_USER_UPLOAD_ENABLED,
    CONFIG_VECTOR_SEARCH_ENABLED,

@@ -229,10 +244,56 @@ def config():
            "showSemanticRankerOption": current_app.config[CONFIG_SEMANTIC_RANKER_DEPLOYED],
            "showVectorOption": current_app.config[CONFIG_VECTOR_SEARCH_ENABLED],
            "showUserUpload": current_app.config[CONFIG_USER_UPLOAD_ENABLED],
+            "showSpeechInput": current_app.config[CONFIG_SPEECH_INPUT_ENABLED],
+            "showSpeechOutput": current_app.config[CONFIG_SPEECH_OUTPUT_ENABLED],
        }
    )


+@bp.route("/speech", methods=["POST"])
+async def speech():
+    if not request.is_json:
+        return jsonify({"error": "request must be json"}), 415
+
+    speech_token = current_app.config.get(CONFIG_SPEECH_SERVICE_TOKEN)
+    if speech_token is None or speech_token.expires_on < time.time() + 60:
+        speech_token = await current_app.config[CONFIG_CREDENTIAL].get_token(
+            "https://cognitiveservices.azure.com/.default"
+        )
+        current_app.config[CONFIG_SPEECH_SERVICE_TOKEN] = speech_token
+
+    request_json = await request.get_json()
+    text = request_json["text"]
+    try:
+        # Construct a token as described in documentation:
+        # https://learn.microsoft.com/azure/ai-services/speech-service/how-to-configure-azure-ad-auth?pivots=programming-language-python
+        auth_token = (
+            "aad#"
+            + current_app.config[CONFIG_SPEECH_SERVICE_ID]
+            + "#"
+            + current_app.config[CONFIG_SPEECH_SERVICE_TOKEN].token
+        )
+        speech_config = SpeechConfig(auth_token=auth_token, region=current_app.config[CONFIG_SPEECH_SERVICE_LOCATION])
+        speech_config.speech_synthesis_voice_name = current_app.config[CONFIG_SPEECH_SERVICE_VOICE]
+        speech_config.speech_synthesis_output_format = SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3
+        synthesizer = SpeechSynthesizer(speech_config=speech_config, audio_config=None)
+        result: SpeechSynthesisResult = synthesizer.speak_text_async(text).get()
+        if result.reason == ResultReason.SynthesizingAudioCompleted:
+            return result.audio_data, 200, {"Content-Type": "audio/mp3"}
+        elif result.reason == ResultReason.Canceled:
+            cancellation_details = result.cancellation_details
+            current_app.logger.error(
+                "Speech synthesis canceled: %s %s", cancellation_details.reason, cancellation_details.error_details
+            )
+            raise Exception("Speech synthesis canceled. Check logs for details.")
+        else:
+            current_app.logger.error("Unexpected result reason: %s", result.reason)
+            raise Exception("Speech synthesis failed. Check logs for details.")
+    except Exception as e:
+        logging.exception("Exception in /speech")
+        return jsonify({"error": str(e)}), 500
+
+
@bp.post("/upload")
@authenticated
async def upload(auth_claims: dict[str, Any]):
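
A quick way to sanity-check the new /speech endpoint is to post a small JSON payload and save the MP3 it returns. This is only an illustrative sketch, not part of the change; the host and port (http://localhost:50505) are an assumption about a local run of the app.

# Hypothetical local smoke test for POST /speech (host/port are assumptions).
import json
import urllib.request

req = urllib.request.Request(
    "http://localhost:50505/speech",
    data=json.dumps({"text": "Hello from the new speech endpoint"}).encode("utf-8"),
    headers={"Content-Type": "application/json"},  # non-JSON requests are rejected with 415
    method="POST",
)
with urllib.request.urlopen(req) as resp:
    audio = resp.read()  # MP3 bytes (Content-Type: audio/mp3) on success

with open("speech.mp3", "wb") as f:
    f.write(audio)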

@@ -337,8 +398,14 @@ async def setup_clients():
    AZURE_SEARCH_QUERY_SPELLER = os.getenv("AZURE_SEARCH_QUERY_SPELLER", "lexicon")
    AZURE_SEARCH_SEMANTIC_RANKER = os.getenv("AZURE_SEARCH_SEMANTIC_RANKER", "free").lower()

+    AZURE_SPEECH_SERVICE_ID = os.getenv("AZURE_SPEECH_SERVICE_ID")
+    AZURE_SPEECH_SERVICE_LOCATION = os.getenv("AZURE_SPEECH_SERVICE_LOCATION")
+    AZURE_SPEECH_VOICE = os.getenv("AZURE_SPEECH_VOICE", "en-US-AndrewMultilingualNeural")
+
    USE_GPT4V = os.getenv("USE_GPT4V", "").lower() == "true"
    USE_USER_UPLOAD = os.getenv("USE_USER_UPLOAD", "").lower() == "true"
+    USE_SPEECH_INPUT_BROWSER = os.getenv("USE_SPEECH_INPUT_BROWSER", "").lower() == "true"
+    USE_SPEECH_OUTPUT_AZURE = os.getenv("USE_SPEECH_OUTPUT_AZURE", "").lower() == "true"

    # Use the current user identity to authenticate with Azure OpenAI, AI Search and Blob Storage (no secrets needed,
    # just use 'az login' locally, and managed identity when deployed on Azure). If you need to use keys, use separate AzureKeyCredential instances with the
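
For local experimentation, the Azure speech output path needs USE_SPEECH_OUTPUT_AZURE together with the service id and location; the voice is optional and falls back to the default above. A minimal sketch with placeholder values (set these in your shell or deployment environment before the app starts; the values here are assumptions, not part of the change):

# Placeholder configuration for illustration only.
import os

os.environ["USE_SPEECH_OUTPUT_AZURE"] = "true"
os.environ["AZURE_SPEECH_SERVICE_ID"] = "<your-speech-resource-id>"  # required when the flag is on
os.environ["AZURE_SPEECH_SERVICE_LOCATION"] = "<your-region>"        # required when the flag is on
# AZURE_SPEECH_VOICE is optional and defaults to "en-US-AndrewMultilingualNeural"
os.environ["USE_SPEECH_INPUT_BROWSER"] = "true"  # enables the browser-based speech input toggle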

@@ -421,6 +488,18 @@ async def setup_clients():
    # Used by the OpenAI SDK
    openai_client: AsyncOpenAI

+    if USE_SPEECH_OUTPUT_AZURE:
+        if not AZURE_SPEECH_SERVICE_ID or AZURE_SPEECH_SERVICE_ID == "":
+            raise ValueError("Azure speech resource not configured correctly, missing AZURE_SPEECH_SERVICE_ID")
+        if not AZURE_SPEECH_SERVICE_LOCATION or AZURE_SPEECH_SERVICE_LOCATION == "":
+            raise ValueError("Azure speech resource not configured correctly, missing AZURE_SPEECH_SERVICE_LOCATION")
+        current_app.config[CONFIG_SPEECH_SERVICE_ID] = AZURE_SPEECH_SERVICE_ID
+        current_app.config[CONFIG_SPEECH_SERVICE_LOCATION] = AZURE_SPEECH_SERVICE_LOCATION
+        current_app.config[CONFIG_SPEECH_SERVICE_VOICE] = AZURE_SPEECH_VOICE
+        # Wait until token is needed to fetch for the first time
+        current_app.config[CONFIG_SPEECH_SERVICE_TOKEN] = None
+        current_app.config[CONFIG_CREDENTIAL] = azure_credential
+
    if OPENAI_HOST.startswith("azure"):
        token_provider = get_bearer_token_provider(azure_credential, "https://cognitiveservices.azure.com/.default")

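
The /speech route above fetches its Entra ID token lazily, caches it in the app config, refreshes it when it is within 60 seconds of expiry, and then prepends "aad#<speech-resource-id>#" in the format the Speech SDK's auth_token expects (see the documentation link in the route). A standalone sketch of that refresh pattern, assuming an azure.identity.aio credential; the helper name and module-level cache are illustrative, not part of the change:

import time

_cached_token = None  # cached azure.core.credentials.AccessToken; the app itself stores this in current_app.config


async def get_speech_auth_token(credential, speech_resource_id: str) -> str:
    """Return an "aad#<resource-id>#<token>" string, refreshing the AAD token shortly before it expires."""
    global _cached_token
    if _cached_token is None or _cached_token.expires_on < time.time() + 60:
        _cached_token = await credential.get_token("https://cognitiveservices.azure.com/.default")
    return "aad#" + speech_resource_id + "#" + _cached_token.token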

@@ -456,6 +535,8 @@ async def setup_clients():
    current_app.config[CONFIG_SEMANTIC_RANKER_DEPLOYED] = AZURE_SEARCH_SEMANTIC_RANKER != "disabled"
    current_app.config[CONFIG_VECTOR_SEARCH_ENABLED] = os.getenv("USE_VECTORS", "").lower() != "false"
    current_app.config[CONFIG_USER_UPLOAD_ENABLED] = bool(USE_USER_UPLOAD)
+    current_app.config[CONFIG_SPEECH_INPUT_ENABLED] = USE_SPEECH_INPUT_BROWSER
+    current_app.config[CONFIG_SPEECH_OUTPUT_ENABLED] = USE_SPEECH_OUTPUT_AZURE

    # Various approaches to integrate GPT and external knowledge, most applications will use a single one of these patterns
    # or some derivative, here we include several for exploration purposes