add text-to-speech tests (#33791)

kristapratico · web-flow · commit fc77045c1c77 · 2024-02-23T16:26:50.000-05:00
* add tts tests

* updates for aoai

* update targeted api version
diff --git a/sdk/openai/azure-openai/tests/conftest.py b/sdk/openai/azure-openai/tests/conftest.py
@@ -25,23 +25,27 @@
 ENV_OPENAI_TEST_MODE = "OPENAI_TEST_MODE"
 
 # for pytest.parametrize
-ALL = ["azure", "azuread", "openai"]
 AZURE = "azure"
 OPENAI = "openai"
 AZURE_AD = "azuread"
+ALL = [AZURE, AZURE_AD, OPENAI]
 WHISPER_AZURE = "whisper_azure"
 WHISPER_AZURE_AD = "whisper_azuread"
-WHISPER_ALL = ["whisper_azure", "whisper_azuread", "openai"]
+WHISPER_ALL = [WHISPER_AZURE, WHISPER_AZURE_AD, OPENAI]
+TTS_OPENAI = "tts_openai"
+TTS_AZURE = "tts_azure"
+TTS_AZURE_AD = "tts_azuread"
+TTS_ALL = [TTS_AZURE, TTS_AZURE_AD, TTS_OPENAI]
 DALLE_AZURE = "dalle_azure"
 DALLE_AZURE_AD = "dalle_azuread"
-DALLE_ALL = ["dalle_azure", "dalle_azuread", "openai"]
+DALLE_ALL = [DALLE_AZURE, DALLE_AZURE_AD, OPENAI]
 GPT_4_AZURE = "gpt_4_azure"
 GPT_4_AZURE_AD = "gpt_4_azuread"
 GPT_4_OPENAI = "gpt_4_openai"
-GPT_4_ALL = ["gpt_4_azure", "gpt_4_azuread", "gpt_4_openai"]
+GPT_4_ALL = [GPT_4_AZURE, GPT_4_AZURE_AD, GPT_4_OPENAI]
 ASST_AZURE = "asst_azure"
 ASST_AZUREAD = "asst_azuread"
-ASST_ALL = ["asst_azure", "asst_azuread", "gpt_4_openai"]
+ASST_ALL = [ASST_AZURE, ASST_AZUREAD, GPT_4_OPENAI]
 
 # Environment variable keys
 ENV_AZURE_OPENAI_ENDPOINT = "AZ_OPENAI_ENDPOINT"
@@ -65,6 +69,7 @@
 ENV_AZURE_OPENAI_AUDIO_NAME = "whisper"
 ENV_AZURE_OPENAI_DALLE_NAME = "dall-e-3"
 ENV_AZURE_OPENAI_CHAT_COMPLETIONS_GPT4_NAME = "gpt-4-1106-preview"
+ENV_AZURE_OPENAI_TTS_NAME = "tts"
 
 ENV_OPENAI_KEY = "OPENAI_KEY"
 ENV_OPENAI_COMPLETIONS_MODEL = "gpt-3.5-turbo-instruct"
@@ -73,6 +78,7 @@
 ENV_OPENAI_AUDIO_MODEL = "whisper-1"
 ENV_OPENAI_DALLE_MODEL = "dall-e-3"
 ENV_OPENAI_CHAT_COMPLETIONS_GPT4_MODEL = "gpt-4-1106-preview"
+ENV_OPENAI_TTS_MODEL = "tts-1"
 
 # Fake values
 TEST_ENDPOINT = "https://test-resource.openai.azure.com/"
@@ -140,17 +146,17 @@ def client(api_type):
             azure_ad_token_provider=get_bearer_token_provider(DefaultAzureCredential(), "https://cognitiveservices.azure.com/.default"),
             api_version=ENV_AZURE_OPENAI_API_VERSION,
         )
-    elif api_type == "openai" or api_type == "gpt_4_openai":
+    elif api_type in ["openai", "gpt_4_openai", "tts_openai"]:
         client = openai.OpenAI(
             api_key=os.getenv(ENV_OPENAI_KEY)
         )
-    elif api_type == "whisper_azure":
+    elif api_type in ["whisper_azure", "tts_azure"]:
         client = openai.AzureOpenAI(
             azure_endpoint=os.getenv(ENV_AZURE_OPENAI_NORTHCENTRALUS_ENDPOINT),
             api_key=os.getenv(ENV_AZURE_OPENAI_NORTHCENTRALUS_KEY),
             api_version=ENV_AZURE_OPENAI_API_VERSION,
         )
-    elif api_type == "whisper_azuread":
+    elif api_type in ["whisper_azuread", "tts_azuread"]:
         client = openai.AzureOpenAI(
             azure_endpoint=os.getenv(ENV_AZURE_OPENAI_NORTHCENTRALUS_ENDPOINT),
             azure_ad_token_provider=get_bearer_token_provider(DefaultAzureCredential(), "https://cognitiveservices.azure.com/.default"),
@@ -187,17 +193,17 @@ def client_async(api_type):
             azure_ad_token_provider=get_bearer_token_provider_async(AsyncDefaultAzureCredential(), "https://cognitiveservices.azure.com/.default"),
             api_version=ENV_AZURE_OPENAI_API_VERSION,
         )
-    elif api_type == "openai" or api_type == "gpt_4_openai":
+    elif api_type in ["openai", "gpt_4_openai", "tts_openai"]:
         client = openai.AsyncOpenAI(
             api_key=os.getenv(ENV_OPENAI_KEY)
         )
-    elif api_type == "whisper_azure":
+    elif api_type in ["whisper_azure", "tts_azure"]:
         client = openai.AsyncAzureOpenAI(
             azure_endpoint=os.getenv(ENV_AZURE_OPENAI_NORTHCENTRALUS_ENDPOINT),
             api_key=os.getenv(ENV_AZURE_OPENAI_NORTHCENTRALUS_KEY),
             api_version=ENV_AZURE_OPENAI_API_VERSION,
         )
-    elif api_type == "whisper_azuread":
+    elif api_type in ["whisper_azuread", "tts_azuread"]:
         client = openai.AsyncAzureOpenAI(
             azure_endpoint=os.getenv(ENV_AZURE_OPENAI_NORTHCENTRALUS_ENDPOINT),
             azure_ad_token_provider=get_bearer_token_provider_async(AsyncDefaultAzureCredential(), "https://cognitiveservices.azure.com/.default"),
@@ -225,6 +231,10 @@ def build_kwargs(args, api_type):
             return {"model": ENV_AZURE_OPENAI_AUDIO_NAME}
         elif api_type == "openai":
             return {"model": ENV_OPENAI_AUDIO_MODEL}
+        elif api_type == "tts_openai":
+            return {"model": ENV_OPENAI_TTS_MODEL}
+        elif api_type in ["tts_azure", "tts_azuread"]:
+            return {"model": ENV_AZURE_OPENAI_TTS_NAME}
     if test_feature.startswith("test_chat_completions") \
         or test_feature.startswith(("test_client", "test_models")):
         if api_type in ["azure", "azuread", "asst_azure"]:
diff --git a/sdk/openai/azure-openai/tests/v1_tests/test_audio.py b/sdk/openai/azure-openai/tests/v1_tests/test_audio.py
@@ -5,8 +5,10 @@
 
 import os
 import pytest
+import pathlib
+import uuid
 from devtools_testutils import AzureRecordedTestCase
-from conftest import WHISPER_AZURE, OPENAI, WHISPER_ALL, configure
+from conftest import WHISPER_AZURE, OPENAI, WHISPER_ALL, configure, TTS_OPENAI, TTS_AZURE, TTS_AZURE_AD
 
 audio_test_file = os.path.abspath(os.path.join(os.path.abspath(__file__), "..", "..", "./assets/hello.m4a"))
 audio_long_test_file = os.path.abspath(os.path.join(os.path.abspath(__file__), "..", "..", "./assets/wikipediaOcelot.wav"))
@@ -211,3 +213,71 @@ def test_translate_options(self, client, azure_openai_creds, api_type, **kwargs)
             **kwargs,
         )
         assert result.text == "Hello"
+
+    @configure
+    @pytest.mark.parametrize("api_type", [TTS_OPENAI, TTS_AZURE, TTS_AZURE_AD])
+    def test_tts(self, client, azure_openai_creds, api_type, **kwargs):
+        """Create speech with the "alloy" voice, assert encoding/content/text are set, and write it to a uniquely named .mp3 beside this file."""
+        speech_file_path = pathlib.Path(__file__).parent / f"{uuid.uuid4()}.mp3"
+        try:
+            response = client.audio.speech.create(
+                voice="alloy",
+                input="The quick brown fox jumped over the lazy dog.",
+                **kwargs,
+            )
+            assert response.encoding
+            assert response.content
+            assert response.text
+            response.write_to_file(speech_file_path)
+        finally:
+            speech_file_path.unlink(missing_ok=True)  # file exists only if the write ran; a missing file must not mask the real failure
+
+    @configure
+    @pytest.mark.parametrize("api_type", [TTS_OPENAI, TTS_AZURE])
+    def test_tts_hd_streaming(self, client, azure_openai_creds, api_type, **kwargs):
+        """Request speech for model "tts-1-hd" via `with_streaming_response` and read the full body inside the context manager."""
+        with client.audio.speech.with_streaming_response.create(
+            voice="echo",
+            input="The quick brown fox jumped over the lazy dog.",
+            model="tts-1-hd"  # hard-coded; **kwargs is not forwarded here. NOTE(review): presumably also the Azure deployment name - confirm
+        ) as response:
+            response.read()
+
+
+    @configure
+    @pytest.mark.parametrize("api_type", [TTS_OPENAI, TTS_AZURE])
+    def test_tts_response_format(self, client, azure_openai_creds, api_type, **kwargs):
+        """Create speech with response_format="flac", assert encoding/content/text are set, and save it to a uniquely named .flac beside this file."""
+        speech_file_path = pathlib.Path(__file__).parent / f"{uuid.uuid4()}.flac"
+        try:
+            response = client.audio.speech.create(
+                voice="fable",
+                input="The quick brown fox jumped over the lazy dog.",
+                response_format="flac",
+                **kwargs
+            )
+            assert response.encoding
+            assert response.content
+            assert response.text
+            response.stream_to_file(speech_file_path)  # deprecated
+        finally:
+            speech_file_path.unlink(missing_ok=True)  # file exists only if the write ran; a missing file must not mask the real failure
+
+    @configure
+    @pytest.mark.parametrize("api_type", [TTS_OPENAI, TTS_AZURE])
+    def test_tts_speed(self, client, azure_openai_creds, api_type, **kwargs):
+        """Create speech with speed=3.0 and the "onyx" voice, assert encoding/content/text are set, and write it to a uniquely named .mp3 beside this file."""
+        speech_file_path = pathlib.Path(__file__).parent / f"{uuid.uuid4()}.mp3"
+        try:
+            response = client.audio.speech.create(
+                voice="onyx",
+                input="The quick brown fox jumped over the lazy dog.",
+                speed=3.0,
+                **kwargs
+            )
+            assert response.encoding
+            assert response.content
+            assert response.text
+            response.write_to_file(speech_file_path)
+        finally:
+            speech_file_path.unlink(missing_ok=True)  # file exists only if the write ran; a missing file must not mask the real failure
diff --git a/sdk/openai/azure-openai/tests/v1_tests/test_audio_async.py b/sdk/openai/azure-openai/tests/v1_tests/test_audio_async.py
@@ -5,8 +5,10 @@
 
 import os
 import pytest
+import pathlib
+import uuid
 from devtools_testutils import AzureRecordedTestCase
-from conftest import WHISPER_AZURE, OPENAI, WHISPER_ALL, configure_async
+from conftest import WHISPER_AZURE, OPENAI, WHISPER_ALL, configure_async, TTS_OPENAI, TTS_AZURE, TTS_AZURE_AD
 
 audio_test_file = os.path.abspath(os.path.join(os.path.abspath(__file__), "..", "..", "./assets/hello.m4a"))
 audio_long_test_file = os.path.abspath(os.path.join(os.path.abspath(__file__), "..", "..", "./assets/wikipediaOcelot.wav"))
@@ -226,3 +228,74 @@ async def test_translate_options(self, client_async, azure_openai_creds, api_typ
             **kwargs,
         )
         assert result.text == "Hello"
+
+    @configure_async
+    @pytest.mark.asyncio
+    @pytest.mark.parametrize("api_type", [TTS_OPENAI, TTS_AZURE, TTS_AZURE_AD])
+    async def test_tts(self, client_async, azure_openai_creds, api_type, **kwargs):
+        """Await speech creation with the "alloy" voice, assert encoding/content/text are set, and write it to a uniquely named .mp3 beside this file."""
+        speech_file_path = pathlib.Path(__file__).parent / f"{uuid.uuid4()}.mp3"
+        try:
+            response = await client_async.audio.speech.create(
+                voice="alloy",
+                input="The quick brown fox jumped over the lazy dog.",
+                **kwargs,
+            )
+            assert response.encoding
+            assert response.content
+            assert response.text
+            response.write_to_file(speech_file_path)
+        finally:
+            speech_file_path.unlink(missing_ok=True)  # file exists only if the write ran; a missing file must not mask the real failure
+
+    @configure_async
+    @pytest.mark.asyncio
+    @pytest.mark.parametrize("api_type", [TTS_OPENAI, TTS_AZURE])
+    async def test_tts_hd(self, client_async, azure_openai_creds, api_type, **kwargs):
+        """Request speech for model "tts-1-hd" via `with_streaming_response` and await a read of the full body inside the async context manager."""
+        async with client_async.audio.speech.with_streaming_response.create(
+            voice="echo",
+            input="The quick brown fox jumped over the lazy dog.",
+            model="tts-1-hd"  # hard-coded; **kwargs is not forwarded here. NOTE(review): presumably also the Azure deployment name - confirm
+        ) as response:
+            await response.read()
+
+    @configure_async
+    @pytest.mark.asyncio
+    @pytest.mark.parametrize("api_type", [TTS_OPENAI, TTS_AZURE])
+    async def test_tts_response_format(self, client_async, azure_openai_creds, api_type, **kwargs):
+        """Await speech creation with response_format="flac", assert encoding/content/text are set, and save it to a uniquely named .flac beside this file."""
+        speech_file_path = pathlib.Path(__file__).parent / f"{uuid.uuid4()}.flac"
+        try:
+            response = await client_async.audio.speech.create(
+                voice="fable",
+                input="The quick brown fox jumped over the lazy dog.",
+                response_format="flac",
+                **kwargs
+            )
+            assert response.encoding
+            assert response.content
+            assert response.text
+            await response.astream_to_file(speech_file_path)  # deprecated
+        finally:
+            speech_file_path.unlink(missing_ok=True)  # file exists only if the write ran; a missing file must not mask the real failure
+
+    @configure_async
+    @pytest.mark.asyncio
+    @pytest.mark.parametrize("api_type", [TTS_OPENAI, TTS_AZURE])
+    async def test_tts_speed(self, client_async, azure_openai_creds, api_type, **kwargs):
+        """Await speech creation with speed=3.0 and the "onyx" voice, assert encoding/content/text are set, and write it to a uniquely named .mp3 beside this file."""
+        speech_file_path = pathlib.Path(__file__).parent / f"{uuid.uuid4()}.mp3"
+        try:
+            response = await client_async.audio.speech.create(
+                voice="onyx",
+                input="The quick brown fox jumped over the lazy dog.",
+                speed=3.0,
+                **kwargs
+            )
+            assert response.encoding
+            assert response.content
+            assert response.text
+            response.write_to_file(speech_file_path)
+        finally:
+            speech_file_path.unlink(missing_ok=True)  # file exists only if the write ran; a missing file must not mask the real failure