diff --git a/auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiAudioSpeechProperties.java b/auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiAudioSpeechProperties.java
index 1585684775b..e1b6b171875 100644
--- a/auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiAudioSpeechProperties.java
+++ b/auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiAudioSpeechProperties.java
@@ -36,7 +36,7 @@ public class OpenAiAudioSpeechProperties extends OpenAiParentProperties {
public static final String CONFIG_PREFIX = "spring.ai.openai.audio.speech";
- public static final String DEFAULT_SPEECH_MODEL = OpenAiAudioApi.TtsModel.TTS_1.getValue();
+ public static final String DEFAULT_SPEECH_MODEL = OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.getValue();
private static final Float SPEED = 1.0f;
diff --git a/auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiAudioTranscriptionProperties.java b/auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiAudioTranscriptionProperties.java
index 19b11f41578..68b9a92d53b 100644
--- a/auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiAudioTranscriptionProperties.java
+++ b/auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiAudioTranscriptionProperties.java
@@ -26,7 +26,7 @@ public class OpenAiAudioTranscriptionProperties extends OpenAiParentProperties {
public static final String CONFIG_PREFIX = "spring.ai.openai.audio.transcription";
- public static final String DEFAULT_TRANSCRIPTION_MODEL = OpenAiAudioApi.WhisperModel.WHISPER_1.getValue();
+ public static final String DEFAULT_TRANSCRIPTION_MODEL = OpenAiAudioApi.TranscriptionModels.WHISPER_1.getValue();
private static final Double DEFAULT_TEMPERATURE = 0.7;
diff --git a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioSpeechModel.java b/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioSpeechModel.java
index f9fcd006cb6..759eac07e09 100644
--- a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioSpeechModel.java
+++ b/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioSpeechModel.java
@@ -80,7 +80,7 @@ public class OpenAiAudioSpeechModel implements SpeechModel, StreamingSpeechModel
public OpenAiAudioSpeechModel(OpenAiAudioApi audioApi) {
this(audioApi,
OpenAiAudioSpeechOptions.builder()
- .model(OpenAiAudioApi.TtsModel.TTS_1.getValue())
+ .model(OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.getValue())
.responseFormat(AudioResponseFormat.MP3)
.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue())
.speed(SPEED)
diff --git a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioTranscriptionModel.java b/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioTranscriptionModel.java
index 8fbd75d4d39..365b25cffb8 100644
--- a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioTranscriptionModel.java
+++ b/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioTranscriptionModel.java
@@ -63,7 +63,7 @@ public class OpenAiAudioTranscriptionModel implements TranscriptionModel {
public OpenAiAudioTranscriptionModel(OpenAiAudioApi audioApi) {
this(audioApi,
OpenAiAudioTranscriptionOptions.builder()
- .model(OpenAiAudioApi.WhisperModel.WHISPER_1.getValue())
+ .model(OpenAiAudioApi.TranscriptionModels.WHISPER_1.getValue())
.responseFormat(OpenAiAudioApi.TranscriptResponseFormat.JSON)
.temperature(0.7f)
.build());
diff --git a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/api/OpenAiAudioApi.java b/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/api/OpenAiAudioApi.java
index 9177c365a8d..78f48344444 100644
--- a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/api/OpenAiAudioApi.java
+++ b/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/api/OpenAiAudioApi.java
@@ -27,6 +27,7 @@
import reactor.core.publisher.Mono;
import org.springframework.ai.model.ApiKey;
+import org.springframework.ai.model.ChatModelDescription;
import org.springframework.ai.model.NoopApiKey;
import org.springframework.ai.model.SimpleApiKey;
import org.springframework.ai.openai.api.common.OpenAiApiConstants;
@@ -50,6 +51,7 @@
* @author Ilayaperumal Gopinathan
* @author Jonghoon Park
* @author Filip Hrisafov
+ * @author Alexandros Pappas
* @since 0.8.1
*/
public class OpenAiAudioApi {
@@ -224,18 +226,18 @@ public String getFilename() {
* different model variates, tts-1 is optimized for real time text to speech use cases
* and tts-1-hd is optimized for quality. These models can be used with the Speech
* endpoint in the Audio API. Reference:
- * <a href="https://platform.openai.com/docs/models/tts">TTS</a>
+ * <a href="https://platform.openai.com/docs/models#tts">TTS</a>
*/
public enum TtsModel {
// @formatter:off
/**
- * The latest text to speech model, optimized for speed.
+ * Text-to-speech model optimized for speed.
*/
@JsonProperty("tts-1")
TTS_1("tts-1"),
/**
- * The latest text to speech model, optimized for quality.
+ * Text-to-speech model optimized for quality.
*/
@JsonProperty("tts-1-hd")
TTS_1_HD("tts-1-hd"),
@@ -266,6 +268,7 @@ public String getValue() {
* v2-large model is currently available through our API with the whisper-1 model
* name.
*/
+ @Deprecated
public enum WhisperModel {
// @formatter:off
@@ -285,6 +288,45 @@ public String getValue() {
}
+ /**
+ * The available models for the transcriptions API. Reference:
+ * <a href="https://platform.openai.com/docs/models#transcription">Transcription models</a>
+ */
+ public enum TranscriptionModels implements ChatModelDescription {
+
+ /**
+ * Speech-to-text model powered by GPT-4o.
+ */
+ @JsonProperty("gpt-4o-transcribe")
+ GPT_4O_TRANSCRIBE("gpt-4o-transcribe"),
+ /**
+ * Speech-to-text model powered by GPT-4o mini.
+ */
+ @JsonProperty("gpt-4o-mini-transcribe")
+ GPT_4O_MINI_TRANSCRIBE("gpt-4o-mini-transcribe"),
+ /**
+ * General-purpose speech recognition model.
+ */
+ @JsonProperty("whisper-1")
+ WHISPER_1("whisper-1");
+
+ public final String value;
+
+ TranscriptionModels(String value) {
+ this.value = value;
+ }
+
+ public String getValue() {
+ return this.value;
+ }
+
+ @Override
+ public String getName() {
+ return this.value;
+ }
+
+ }
+
/**
* The format of the transcript and translation outputs, in one of these options:
* json, text, srt, verbose_json, or vtt. Defaults to json.
@@ -437,7 +479,7 @@ public String getValue() {
*/
public static class Builder {
- private String model = TtsModel.TTS_1.getValue();
+ private String model = TtsModel.GPT_4_O_MINI_TTS.getValue();
private String input;
@@ -556,7 +598,7 @@ public static class Builder {
private String fileName;
- private String model = WhisperModel.WHISPER_1.getValue();
+ private String model = TranscriptionModels.WHISPER_1.getValue();
private String language;
@@ -659,7 +701,7 @@ public static class Builder {
private String fileName;
- private String model = WhisperModel.WHISPER_1.getValue();
+ private String model = TranscriptionModels.WHISPER_1.getValue();
private String prompt;
diff --git a/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/api/OpenAiAudioApiIT.java b/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/api/OpenAiAudioApiIT.java
index 6533d15de56..0d67d20aa45 100644
--- a/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/api/OpenAiAudioApiIT.java
+++ b/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/api/OpenAiAudioApiIT.java
@@ -27,10 +27,10 @@
import org.springframework.ai.openai.api.OpenAiAudioApi.SpeechRequest;
import org.springframework.ai.openai.api.OpenAiAudioApi.SpeechRequest.Voice;
import org.springframework.ai.openai.api.OpenAiAudioApi.StructuredResponse;
+import org.springframework.ai.openai.api.OpenAiAudioApi.TranscriptionModels;
import org.springframework.ai.openai.api.OpenAiAudioApi.TranscriptionRequest;
import org.springframework.ai.openai.api.OpenAiAudioApi.TranslationRequest;
import org.springframework.ai.openai.api.OpenAiAudioApi.TtsModel;
-import org.springframework.ai.openai.api.OpenAiAudioApi.WhisperModel;
import org.springframework.util.FileCopyUtils;
import static org.assertj.core.api.Assertions.assertThat;
@@ -52,7 +52,7 @@ void speechTranscriptionAndTranslation() throws IOException {
byte[] speech = this.audioApi
.createSpeech(SpeechRequest.builder()
- .model(TtsModel.TTS_1_HD.getValue())
+ .model(TtsModel.GPT_4_O_MINI_TTS.getValue())
.input("Hello, my name is Chris and I love Spring A.I.")
.voice(Voice.ONYX.getValue())
.build())
@@ -64,7 +64,7 @@ void speechTranscriptionAndTranslation() throws IOException {
StructuredResponse translation = this.audioApi
.createTranslation(TranslationRequest.builder()
- .model(WhisperModel.WHISPER_1.getValue())
+ .model(TranscriptionModels.WHISPER_1.getValue())
.file(speech)
.fileName("speech.mp3")
.build(), StructuredResponse.class)
@@ -74,7 +74,7 @@ void speechTranscriptionAndTranslation() throws IOException {
StructuredResponse transcriptionEnglish = this.audioApi
.createTranscription(TranscriptionRequest.builder()
- .model(WhisperModel.WHISPER_1.getValue())
+ .model(TranscriptionModels.WHISPER_1.getValue())
.file(speech)
.fileName("speech.mp3")
.build(), StructuredResponse.class)
diff --git a/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/api/OpenAiAudioModelNoOpApiKeysIT.java b/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/api/OpenAiAudioModelNoOpApiKeysIT.java
index 996035d09b7..dd1a1c3b9e5 100644
--- a/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/api/OpenAiAudioModelNoOpApiKeysIT.java
+++ b/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/api/OpenAiAudioModelNoOpApiKeysIT.java
@@ -44,7 +44,7 @@ public class OpenAiAudioModelNoOpApiKeysIT {
void checkNoOpKey() {
assertThatThrownBy(() -> this.audioApi
.createSpeech(OpenAiAudioApi.SpeechRequest.builder()
- .model(OpenAiAudioApi.TtsModel.TTS_1_HD.getValue())
+ .model(OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.getValue())
.input("Hello, my name is Chris and I love Spring A.I.")
.voice(OpenAiAudioApi.SpeechRequest.Voice.ONYX.getValue())
.build())
diff --git a/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/speech/OpenAiSpeechModelIT.java b/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/speech/OpenAiSpeechModelIT.java
index 7f4a81d3e3c..67310709492 100644
--- a/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/speech/OpenAiSpeechModelIT.java
+++ b/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/speech/OpenAiSpeechModelIT.java
@@ -64,7 +64,7 @@ void shouldGenerateNonEmptyMp3AudioFromSpeechPrompt() {
.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue())
.speed(SPEED)
.responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3)
- .model(OpenAiAudioApi.TtsModel.TTS_1.value)
+ .model(OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.value)
.build();
SpeechPrompt speechPrompt = new SpeechPrompt("Today is a wonderful day to build something people love!",
speechOptions);
@@ -100,7 +100,7 @@ void speechRateLimitTest() {
.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue())
.speed(SPEED)
.responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3)
- .model(OpenAiAudioApi.TtsModel.TTS_1.value)
+ .model(OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.value)
.build();
SpeechPrompt speechPrompt = new SpeechPrompt("Today is a wonderful day to build something people love!",
speechOptions);
@@ -120,7 +120,7 @@ void shouldStreamNonEmptyResponsesForValidSpeechPrompts() {
.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue())
.speed(SPEED)
.responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3)
- .model(OpenAiAudioApi.TtsModel.TTS_1.value)
+ .model(OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.value)
.build();
SpeechPrompt speechPrompt = new SpeechPrompt("Today is a wonderful day to build something people love!",
@@ -142,7 +142,7 @@ void speechVoicesTest(String voice) {
.voice(voice)
.speed(SPEED)
.responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3)
- .model(OpenAiAudioApi.TtsModel.TTS_1.value)
+ .model(OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.value)
.build();
SpeechPrompt speechPrompt = new SpeechPrompt("Today is a wonderful day to build something people love!",
speechOptions);
diff --git a/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/speech/OpenAiSpeechModelWithSpeechResponseMetadataTests.java b/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/speech/OpenAiSpeechModelWithSpeechResponseMetadataTests.java
index 9d98b5d9b58..51df242073d 100644
--- a/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/speech/OpenAiSpeechModelWithSpeechResponseMetadataTests.java
+++ b/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/speech/OpenAiSpeechModelWithSpeechResponseMetadataTests.java
@@ -75,7 +75,7 @@ void aiResponseContainsImageResponseMetadata() {
.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue())
.speed(SPEED)
.responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3)
- .model(OpenAiAudioApi.TtsModel.TTS_1.value)
+ .model(OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.value)
.build();
SpeechPrompt speechPrompt = new SpeechPrompt("Today is a wonderful day to build something people love!",
diff --git a/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/audio/speech/openai-speech.adoc b/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/audio/speech/openai-speech.adoc
index 6b6060c8f56..0c2758fd556 100644
--- a/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/audio/speech/openai-speech.adoc
+++ b/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/audio/speech/openai-speech.adoc
@@ -85,7 +85,7 @@ The prefix `spring.ai.openai.audio.speech` is used as the property prefix that l
| spring.ai.openai.audio.speech.api-key | The API Key | -
| spring.ai.openai.audio.speech.organization-id | Optionally you can specify which organization used for an API request. | -
| spring.ai.openai.audio.speech.project-id | Optionally, you can specify which project is used for an API request. | -
-| spring.ai.openai.audio.speech.options.model | ID of the model to use for generating the audio. For OpenAI's TTS API, use one of the available models: tts-1 or tts-1-hd. | tts-1
+| spring.ai.openai.audio.speech.options.model | ID of the model to use for generating the audio. Available models: `gpt-4o-mini-tts` (default, optimized for speed and cost), `tts-1` (legacy, optimized for speed), or `tts-1-hd` (legacy, optimized for quality). | gpt-4o-mini-tts
| spring.ai.openai.audio.speech.options.voice | The voice to use for synthesis. For OpenAI's TTS API, One of the available voices for the chosen model: alloy, echo, fable, onyx, nova, and shimmer. | alloy
| spring.ai.openai.audio.speech.options.response-format | The format of the audio output. Supported formats are mp3, opus, aac, flac, wav, and pcm. | mp3
| spring.ai.openai.audio.speech.options.speed | The speed of the voice synthesis. The acceptable range is from 0.25 (slowest) to 4.0 (fastest). | 1.0
@@ -107,7 +107,7 @@ For example:
[source,java]
----
OpenAiAudioSpeechOptions speechOptions = OpenAiAudioSpeechOptions.builder()
- .model("tts-1")
+ .model("gpt-4o-mini-tts")
.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY)
.responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3)
.speed(1.0f)
@@ -153,7 +153,7 @@ var openAiAudioSpeechModel = new OpenAiAudioSpeechModel(openAiAudioApi);
var speechOptions = OpenAiAudioSpeechOptions.builder()
.responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3)
.speed(1.0f)
- .model(OpenAiAudioApi.TtsModel.TTS_1.value)
+ .model(OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.value)
.build();
var speechPrompt = new SpeechPrompt("Hello, this is a text-to-speech example.", speechOptions);
@@ -181,7 +181,7 @@ OpenAiAudioSpeechOptions speechOptions = OpenAiAudioSpeechOptions.builder()
.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY)
.speed(1.0f)
.responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3)
- .model(OpenAiAudioApi.TtsModel.TTS_1.value)
+ .model(OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.value)
.build();
SpeechPrompt speechPrompt = new SpeechPrompt("Today is a wonderful day to build something people love!", speechOptions);
diff --git a/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/audio/transcriptions/openai-transcriptions.adoc b/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/audio/transcriptions/openai-transcriptions.adoc
index f993e499de0..139794166c7 100644
--- a/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/audio/transcriptions/openai-transcriptions.adoc
+++ b/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/audio/transcriptions/openai-transcriptions.adoc
@@ -83,7 +83,7 @@ The prefix `spring.ai.openai.audio.transcription` is used as the property prefix
| spring.ai.openai.audio.transcription.api-key | The API Key | -
| spring.ai.openai.audio.transcription.organization-id | Optionally you can specify which organization used for an API request. | -
| spring.ai.openai.audio.transcription.project-id | Optionally, you can specify which project is used for an API request. | -
-| spring.ai.openai.audio.transcription.options.model | ID of the model to use. Only whisper-1 (which is powered by our open source Whisper V2 model) is currently available. | whisper-1
+| spring.ai.openai.audio.transcription.options.model | ID of the model to use for transcription. Available models: `gpt-4o-transcribe` (speech-to-text powered by GPT-4o), `gpt-4o-mini-transcribe` (speech-to-text powered by GPT-4o mini), or `whisper-1` (general-purpose speech recognition model, default). | whisper-1
| spring.ai.openai.audio.transcription.options.response-format | The format of the transcript output, in one of these options: json, text, srt, verbose_json, or vtt. | json
| spring.ai.openai.audio.transcription.options.prompt | An optional text to guide the model's style or continue a previous audio segment. The prompt should match the audio language. |
| spring.ai.openai.audio.transcription.options.language | The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency. |