Accept the voice option as a string

dev-jonghoonpark · dev-jonghoonpark · commit edca997ce4be · 2025-03-31T18:57:24.000+09:00
Signed-off-by: jonghoon park <dev@jonghoonpark.com>
diff --git a/auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiAudioSpeechProperties.java b/auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiAudioSpeechProperties.java
@@ -29,6 +29,7 @@
  *
  * @author Ahmed Yousri
  * @author Stefan Vassilev
+ * @author Jonghoon Park
  */
 @ConfigurationProperties(OpenAiAudioSpeechProperties.CONFIG_PREFIX)
 public class OpenAiAudioSpeechProperties extends OpenAiParentProperties {
@@ -39,7 +40,7 @@ public class OpenAiAudioSpeechProperties extends OpenAiParentProperties {
 
 	private static final Float SPEED = 1.0f;
 
-	private static final OpenAiAudioApi.SpeechRequest.Voice VOICE = OpenAiAudioApi.SpeechRequest.Voice.ALLOY;
+	private static final String VOICE = OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue();
 
 	private static final OpenAiAudioApi.SpeechRequest.AudioResponseFormat DEFAULT_RESPONSE_FORMAT = OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3;
 
diff --git a/auto-configurations/models/spring-ai-autoconfigure-model-openai/src/test/java/org/springframework/ai/model/openai/autoconfigure/OpenAiPropertiesTests.java b/auto-configurations/models/spring-ai-autoconfigure-model-openai/src/test/java/org/springframework/ai/model/openai/autoconfigure/OpenAiPropertiesTests.java
@@ -1,5 +1,5 @@
 /*
- * Copyright 2023-2024 the original author or authors.
+ * Copyright 2023-2025 the original author or authors.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -40,6 +40,7 @@
  *
  * @author Christian Tzolov
  * @author Thomas Vitale
+ * @author Jonghoon Park
  * @since 0.8.0
  */
 public class OpenAiPropertiesTests {
@@ -177,7 +178,7 @@ public void speechProperties() {
 
 				assertThat(speechProperties.getOptions().getModel()).isEqualTo("TTS_1");
 				assertThat(speechProperties.getOptions().getVoice())
-					.isEqualTo(OpenAiAudioApi.SpeechRequest.Voice.ALLOY);
+					.isEqualTo(OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue());
 				assertThat(speechProperties.getOptions().getResponseFormat())
 					.isEqualTo(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3);
 				assertThat(speechProperties.getOptions().getSpeed()).isEqualTo(0.75f);
@@ -205,7 +206,7 @@ public void speechPropertiesTest() {
 
 				assertThat(speechProperties.getOptions().getModel()).isEqualTo("TTS_1");
 				assertThat(speechProperties.getOptions().getVoice())
-					.isEqualTo(OpenAiAudioApi.SpeechRequest.Voice.ALLOY);
+					.isEqualTo(OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue());
 				assertThat(speechProperties.getOptions().getResponseFormat())
 					.isEqualTo(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3);
 				assertThat(speechProperties.getOptions().getSpeed()).isEqualTo(0.75f);
@@ -237,7 +238,8 @@ public void speechOverrideConnectionPropertiesTest() {
 				assertThat(speechProperties.getBaseUrl()).isEqualTo("TEST_BASE_URL2");
 
 				assertThat(speechProperties.getOptions().getModel()).isEqualTo("TTS_2");
-				assertThat(speechProperties.getOptions().getVoice()).isEqualTo(OpenAiAudioApi.SpeechRequest.Voice.ECHO);
+				assertThat(speechProperties.getOptions().getVoice())
+					.isEqualTo(OpenAiAudioApi.SpeechRequest.Voice.ECHO.getValue());
 				assertThat(speechProperties.getOptions().getResponseFormat())
 					.isEqualTo(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.OPUS);
 				assertThat(speechProperties.getOptions().getSpeed()).isEqualTo(0.5f);
diff --git a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioSpeechModel.java b/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioSpeechModel.java
@@ -1,5 +1,5 @@
 /*
- * Copyright 2023-2024 the original author or authors.
+ * Copyright 2023-2025 the original author or authors.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -42,6 +42,7 @@
  * @author Ahmed Yousri
  * @author Hyunjoon Choi
  * @author Thomas Vitale
+ * @author Jonghoon Park
  * @see OpenAiAudioApi
  * @since 1.0.0-M1
  */
@@ -81,7 +82,7 @@ public OpenAiAudioSpeechModel(OpenAiAudioApi audioApi) {
 				OpenAiAudioSpeechOptions.builder()
 					.model(OpenAiAudioApi.TtsModel.TTS_1.getValue())
 					.responseFormat(AudioResponseFormat.MP3)
-					.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY)
+					.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue())
 					.speed(SPEED)
 					.build());
 	}
diff --git a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioSpeechOptions.java b/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioSpeechOptions.java
@@ -1,5 +1,5 @@
 /*
- * Copyright 2023-2024 the original author or authors.
+ * Copyright 2023-2025 the original author or authors.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -29,14 +29,15 @@
  * @author Ahmed Yousri
  * @author Hyunjoon Choi
  * @author Ilayaperumal Gopinathan
+ * @author Jonghoon Park
  * @since 1.0.0-M1
  */
 @JsonInclude(JsonInclude.Include.NON_NULL)
 public class OpenAiAudioSpeechOptions implements ModelOptions {
 
 	/**
-	 * ID of the model to use for generating the audio. One of the available TTS models:
-	 * tts-1 or tts-1-hd.
+	 * ID of the model to use for generating the audio. For OpenAI's TTS API, use one of
+	 * the available models: tts-1 or tts-1-hd.
 	 */
 	@JsonProperty("model")
 	private String model;
@@ -48,11 +49,11 @@ public class OpenAiAudioSpeechOptions implements ModelOptions {
 	private String input;
 
 	/**
-	 * The voice to use for synthesis. One of the available voices for the chosen model:
-	 * 'alloy', 'echo', 'fable', 'onyx', 'nova', and 'shimmer'.
+	 * The voice to use for synthesis. For OpenAI's TTS API, use one of the available
+	 * voices for the chosen model: 'alloy', 'echo', 'fable', 'onyx', 'nova', and 'shimmer'.
 	 */
 	@JsonProperty("voice")
-	private Voice voice;
+	private String voice;
 
 	/**
 	 * The format of the audio output. Supported formats are mp3, opus, aac, and flac.
@@ -88,11 +89,11 @@ public void setInput(String input) {
 		this.input = input;
 	}
 
-	public Voice getVoice() {
+	public String getVoice() {
 		return this.voice;
 	}
 
-	public void setVoice(Voice voice) {
+	public void setVoice(String voice) {
 		this.voice = voice;
 	}
 
@@ -197,7 +198,7 @@ public Builder input(String input) {
 			return this;
 		}
 
-		public Builder voice(Voice voice) {
+		public Builder voice(String voice) {
 			this.options.voice = voice;
 			return this;
 		}
diff --git a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/api/OpenAiAudioApi.java b/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/api/OpenAiAudioApi.java
@@ -1,5 +1,5 @@
 /*
- * Copyright 2023-2024 the original author or authors.
+ * Copyright 2023-2025 the original author or authors.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -47,6 +47,7 @@
  *
  * @author Christian Tzolov
  * @author Ilayaperumal Gopinathan
+ * @author Jonghoon Park
  * @since 0.8.1
  */
 public class OpenAiAudioApi {
@@ -330,7 +331,7 @@ public record SpeechRequest(
 	// @formatter:off
 		@JsonProperty("model") String model,
 		@JsonProperty("input") String input,
-		@JsonProperty("voice") Voice voice,
+		@JsonProperty("voice") String voice,
 		@JsonProperty("response_format") AudioResponseFormat responseFormat,
 		@JsonProperty("speed") Float speed) {
 		// @formatter:on
@@ -415,7 +416,7 @@ public static class Builder {
 
 			private String input;
 
-			private Voice voice;
+			private String voice;
 
 			private AudioResponseFormat responseFormat = AudioResponseFormat.MP3;
 
@@ -431,7 +432,7 @@ public Builder input(String input) {
 				return this;
 			}
 
-			public Builder voice(Voice voice) {
+			public Builder voice(String voice) {
 				this.voice = voice;
 				return this;
 			}
diff --git a/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/api/OpenAiAudioApiIT.java b/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/api/OpenAiAudioApiIT.java
@@ -1,5 +1,5 @@
 /*
- * Copyright 2023-2024 the original author or authors.
+ * Copyright 2023-2025 the original author or authors.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -37,6 +37,7 @@
 
 /**
  * @author Christian Tzolov
+ * @author Jonghoon Park
  */
 @EnabledIfEnvironmentVariable(named = "OPENAI_API_KEY", matches = ".+")
 public class OpenAiAudioApiIT {
@@ -53,7 +54,7 @@ void speechTranscriptionAndTranslation() throws IOException {
 			.createSpeech(SpeechRequest.builder()
 				.model(TtsModel.TTS_1_HD.getValue())
 				.input("Hello, my name is Chris and I love Spring A.I.")
-				.voice(Voice.ONYX)
+				.voice(Voice.ONYX.getValue())
 				.build())
 			.getBody();
 
diff --git a/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/api/OpenAiAudioModelNoOpApiKeysIT.java b/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/api/OpenAiAudioModelNoOpApiKeysIT.java
@@ -31,6 +31,7 @@
 
 /**
  * @author Ilayaperumal Gopinathan
+ * @author Jonghoon Park
  */
 @SpringBootTest(classes = OpenAiAudioModelNoOpApiKeysIT.Config.class)
 @EnabledIfEnvironmentVariable(named = "OPENAI_API_KEY", matches = ".+")
@@ -46,7 +47,7 @@ void checkNoOpKey() {
 				.createSpeech(OpenAiAudioApi.SpeechRequest.builder()
 					.model(OpenAiAudioApi.TtsModel.TTS_1_HD.getValue())
 					.input("Hello, my name is Chris and I love Spring A.I.")
-					.voice(OpenAiAudioApi.SpeechRequest.Voice.ONYX)
+					.voice(OpenAiAudioApi.SpeechRequest.Voice.ONYX.getValue())
 					.build())
 				.getBody();
 		}).isInstanceOf(NonTransientAiException.class);
diff --git a/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/speech/OpenAiSpeechModelIT.java b/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/speech/OpenAiSpeechModelIT.java
@@ -1,5 +1,5 @@
 /*
- * Copyright 2023-2024 the original author or authors.
+ * Copyright 2023-2025 the original author or authors.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -33,6 +33,10 @@
 
 import static org.assertj.core.api.Assertions.assertThat;
 
+/**
+ * @author Ahmed Yousri
+ * @author Jonghoon Park
+ */
 @SpringBootTest(classes = OpenAiTestConfiguration.class)
 @EnabledIfEnvironmentVariable(named = "OPENAI_API_KEY", matches = ".+")
 class OpenAiSpeechModelIT extends AbstractIT {
@@ -57,7 +61,7 @@ void shouldProduceAudioBytesDirectlyFromMessage() {
 	@Test
 	void shouldGenerateNonEmptyMp3AudioFromSpeechPrompt() {
 		OpenAiAudioSpeechOptions speechOptions = OpenAiAudioSpeechOptions.builder()
-			.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY)
+			.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue())
 			.speed(SPEED)
 			.responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3)
 			.model(OpenAiAudioApi.TtsModel.TTS_1.value)
@@ -75,7 +79,7 @@ void shouldGenerateNonEmptyMp3AudioFromSpeechPrompt() {
 	@Test
 	void speechRateLimitTest() {
 		OpenAiAudioSpeechOptions speechOptions = OpenAiAudioSpeechOptions.builder()
-			.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY)
+			.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue())
 			.speed(SPEED)
 			.responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3)
 			.model(OpenAiAudioApi.TtsModel.TTS_1.value)
@@ -95,7 +99,7 @@ void speechRateLimitTest() {
 	void shouldStreamNonEmptyResponsesForValidSpeechPrompts() {
 
 		OpenAiAudioSpeechOptions speechOptions = OpenAiAudioSpeechOptions.builder()
-			.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY)
+			.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue())
 			.speed(SPEED)
 			.responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3)
 			.model(OpenAiAudioApi.TtsModel.TTS_1.value)
@@ -117,7 +121,7 @@ void shouldStreamNonEmptyResponsesForValidSpeechPrompts() {
 	@ValueSource(strings = { "alloy", "echo", "fable", "onyx", "nova", "shimmer", "sage", "coral", "ash" })
 	void speechVoicesTest(String voice) {
 		OpenAiAudioSpeechOptions speechOptions = OpenAiAudioSpeechOptions.builder()
-			.voice(OpenAiAudioApi.SpeechRequest.Voice.valueOf(voice.toUpperCase()))
+			.voice(voice)
 			.speed(SPEED)
 			.responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3)
 			.model(OpenAiAudioApi.TtsModel.TTS_1.value)
diff --git a/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/speech/OpenAiSpeechModelWithSpeechResponseMetadataTests.java b/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/speech/OpenAiSpeechModelWithSpeechResponseMetadataTests.java
@@ -46,6 +46,7 @@
 
 /**
  * @author Ahmed Yousri
+ * @author Jonghoon Park
  */
 @RestClientTest(OpenAiSpeechModelWithSpeechResponseMetadataTests.Config.class)
 public class OpenAiSpeechModelWithSpeechResponseMetadataTests {
@@ -71,7 +72,7 @@ void aiResponseContainsImageResponseMetadata() {
 		prepareMock();
 
 		OpenAiAudioSpeechOptions speechOptions = OpenAiAudioSpeechOptions.builder()
-			.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY)
+			.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue())
 			.speed(SPEED)
 			.responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3)
 			.model(OpenAiAudioApi.TtsModel.TTS_1.value)
diff --git a/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/audio/speech/openai-speech.adoc b/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/audio/speech/openai-speech.adoc
@@ -85,8 +85,8 @@ The prefix `spring.ai.openai.audio.speech` is used as the property prefix that l
 | spring.ai.openai.audio.speech.api-key    | The API Key           |  -
 | spring.ai.openai.audio.speech.organization-id | Optionally you can specify which organization  used for an API request. |  -
 | spring.ai.openai.audio.speech.project-id      | Optionally, you can specify which project is used for an API request. |  -
-| spring.ai.openai.audio.speech.options.model  | ID of the model to use. Only tts-1 is currently available. |  tts-1
-| spring.ai.openai.audio.speech.options.voice | The voice to use for the TTS output. Available options are: alloy, echo, fable, onyx, nova, and shimmer. | alloy
+| spring.ai.openai.audio.speech.options.model  | ID of the model to use for generating the audio. For OpenAI's TTS API, use one of the available models: tts-1 or tts-1-hd. |  tts-1
+| spring.ai.openai.audio.speech.options.voice | The voice to use for synthesis. For OpenAI's TTS API, use one of the available voices for the chosen model: alloy, echo, fable, onyx, nova, and shimmer. | alloy
 | spring.ai.openai.audio.speech.options.response-format | The format of the audio output. Supported formats are mp3, opus, aac, flac, wav, and pcm. | mp3
 | spring.ai.openai.audio.speech.options.speed | The speed of the voice synthesis. The acceptable range is from 0.25 (slowest) to 4.0 (fastest). | 1.0
 |====
@@ -113,8 +113,8 @@ OpenAiAudioSpeechOptions speechOptions = OpenAiAudioSpeechOptions.builder()
     .speed(1.0f)
     .build();
 
-SpeechPrompt speechPrompt = new SpeechPrompt("Hello, this is a text-to-speech example.", this.speechOptions);
-SpeechResponse response = openAiAudioSpeechModel.call(this.speechPrompt);
+SpeechPrompt speechPrompt = new SpeechPrompt("Hello, this is a text-to-speech example.", speechOptions);
+SpeechResponse response = openAiAudioSpeechModel.call(speechPrompt);
 ----
 
 == Manual Configuration
@@ -144,23 +144,25 @@ Next, create an `OpenAiAudioSpeechModel`:
 
 [source,java]
 ----
-var openAiAudioApi = new OpenAiAudioApi(System.getenv("OPENAI_API_KEY"));
+var openAiAudioApi = OpenAiAudioApi.builder()
+    .apiKey(System.getenv("OPENAI_API_KEY"))
+    .build();
 
-var openAiAudioSpeechModel = new OpenAiAudioSpeechModel(this.openAiAudioApi);
+var openAiAudioSpeechModel = new OpenAiAudioSpeechModel(openAiAudioApi);
 
 var speechOptions = OpenAiAudioSpeechOptions.builder()
     .responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3)
     .speed(1.0f)
     .model(OpenAiAudioApi.TtsModel.TTS_1.value)
     .build();
 
-var speechPrompt = new SpeechPrompt("Hello, this is a text-to-speech example.", this.speechOptions);
-SpeechResponse response = this.openAiAudioSpeechModel.call(this.speechPrompt);
+var speechPrompt = new SpeechPrompt("Hello, this is a text-to-speech example.", speechOptions);
+SpeechResponse response = openAiAudioSpeechModel.call(speechPrompt);
 
 // Accessing metadata (rate limit info)
-OpenAiAudioSpeechResponseMetadata metadata = this.response.getMetadata();
+OpenAiAudioSpeechResponseMetadata metadata = response.getMetadata();
 
-byte[] responseAsBytes = this.response.getResult().getOutput();
+byte[] responseAsBytes = response.getResult().getOutput();
 ----
 
 == Streaming Real-time Audio
@@ -169,9 +171,11 @@ The Speech API provides support for real-time audio streaming using chunk transf
 
 [source,java]
 ----
-var openAiAudioApi = new OpenAiAudioApi(System.getenv("OPENAI_API_KEY"));
+var openAiAudioApi = OpenAiAudioApi.builder()
+    .apiKey(System.getenv("OPENAI_API_KEY"))
+    .build();
 
-var openAiAudioSpeechModel = new OpenAiAudioSpeechModel(this.openAiAudioApi);
+var openAiAudioSpeechModel = new OpenAiAudioSpeechModel(openAiAudioApi);
 
 OpenAiAudioSpeechOptions speechOptions = OpenAiAudioSpeechOptions.builder()
-    .voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY)
+    .voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue())
@@ -180,9 +184,9 @@ OpenAiAudioSpeechOptions speechOptions = OpenAiAudioSpeechOptions.builder()
     .model(OpenAiAudioApi.TtsModel.TTS_1.value)
     .build();
 
-SpeechPrompt speechPrompt = new SpeechPrompt("Today is a wonderful day to build something people love!", this.speechOptions);
+SpeechPrompt speechPrompt = new SpeechPrompt("Today is a wonderful day to build something people love!", speechOptions);
 
-Flux<SpeechResponse> responseStream = this.openAiAudioSpeechModel.stream(this.speechPrompt);
+Flux<SpeechResponse> responseStream = openAiAudioSpeechModel.stream(speechPrompt);
 ----
 
 == Example Code