diff --git a/auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiAudioSpeechProperties.java b/auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiAudioSpeechProperties.java index 1585684775b..e1b6b171875 100644 --- a/auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiAudioSpeechProperties.java +++ b/auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiAudioSpeechProperties.java @@ -36,7 +36,7 @@ public class OpenAiAudioSpeechProperties extends OpenAiParentProperties { public static final String CONFIG_PREFIX = "spring.ai.openai.audio.speech"; - public static final String DEFAULT_SPEECH_MODEL = OpenAiAudioApi.TtsModel.TTS_1.getValue(); + public static final String DEFAULT_SPEECH_MODEL = OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.getValue(); private static final Float SPEED = 1.0f; diff --git a/auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiAudioTranscriptionProperties.java b/auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiAudioTranscriptionProperties.java index 19b11f41578..68b9a92d53b 100644 --- a/auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiAudioTranscriptionProperties.java +++ b/auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiAudioTranscriptionProperties.java @@ -26,7 +26,7 @@ public class OpenAiAudioTranscriptionProperties extends OpenAiParentProperties { public static final String CONFIG_PREFIX = 
"spring.ai.openai.audio.transcription"; - public static final String DEFAULT_TRANSCRIPTION_MODEL = OpenAiAudioApi.WhisperModel.WHISPER_1.getValue(); + public static final String DEFAULT_TRANSCRIPTION_MODEL = OpenAiAudioApi.TranscriptionModels.WHISPER_1.getValue(); private static final Double DEFAULT_TEMPERATURE = 0.7; diff --git a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioSpeechModel.java b/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioSpeechModel.java index f9fcd006cb6..759eac07e09 100644 --- a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioSpeechModel.java +++ b/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioSpeechModel.java @@ -80,7 +80,7 @@ public class OpenAiAudioSpeechModel implements SpeechModel, StreamingSpeechModel public OpenAiAudioSpeechModel(OpenAiAudioApi audioApi) { this(audioApi, OpenAiAudioSpeechOptions.builder() - .model(OpenAiAudioApi.TtsModel.TTS_1.getValue()) + .model(OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.getValue()) .responseFormat(AudioResponseFormat.MP3) .voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue()) .speed(SPEED) diff --git a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioTranscriptionModel.java b/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioTranscriptionModel.java index 8fbd75d4d39..365b25cffb8 100644 --- a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioTranscriptionModel.java +++ b/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioTranscriptionModel.java @@ -63,7 +63,7 @@ public class OpenAiAudioTranscriptionModel implements TranscriptionModel { public OpenAiAudioTranscriptionModel(OpenAiAudioApi audioApi) { this(audioApi, OpenAiAudioTranscriptionOptions.builder() - .model(OpenAiAudioApi.WhisperModel.WHISPER_1.getValue()) + 
.model(OpenAiAudioApi.TranscriptionModels.WHISPER_1.getValue()) .responseFormat(OpenAiAudioApi.TranscriptResponseFormat.JSON) .temperature(0.7f) .build()); diff --git a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/api/OpenAiAudioApi.java b/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/api/OpenAiAudioApi.java index 9177c365a8d..78f48344444 100644 --- a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/api/OpenAiAudioApi.java +++ b/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/api/OpenAiAudioApi.java @@ -27,6 +27,7 @@ import reactor.core.publisher.Mono; import org.springframework.ai.model.ApiKey; +import org.springframework.ai.model.ChatModelDescription; import org.springframework.ai.model.NoopApiKey; import org.springframework.ai.model.SimpleApiKey; import org.springframework.ai.openai.api.common.OpenAiApiConstants; @@ -50,6 +51,7 @@ * @author Ilayaperumal Gopinathan * @author Jonghoon Park * @author Filip Hrisafov + * @author Alexandros Pappas * @since 0.8.1 */ public class OpenAiAudioApi { @@ -224,18 +226,18 @@ public String getFilename() { * different model variates, tts-1 is optimized for real time text to speech use cases * and tts-1-hd is optimized for quality. These models can be used with the Speech * endpoint in the Audio API. Reference: - * TTS + * TTS */ public enum TtsModel { // @formatter:off /** - * The latest text to speech model, optimized for speed. + * Text-to-speech model optimized for speed */ @JsonProperty("tts-1") TTS_1("tts-1"), /** - * The latest text to speech model, optimized for quality. + * Text-to-speech model optimized for quality. */ @JsonProperty("tts-1-hd") TTS_1_HD("tts-1-hd"), @@ -266,6 +268,7 @@ public String getValue() { * v2-large model is currently available through our API with the whisper-1 model * name. 
*/ + @Deprecated public enum WhisperModel { // @formatter:off @@ -285,6 +288,45 @@ public String getValue() { } + /** + * The available models for the transcriptions API. Reference: + * <a href="https://platform.openai.com/docs/models#transcription">Transcription models</a> + */ + public enum TranscriptionModels implements ChatModelDescription { + + /** + * Speech-to-text model powered by GPT-4o. + */ + @JsonProperty("gpt-4o-transcribe") + GPT_4O_TRANSCRIBE("gpt-4o-transcribe"), + /** + * Speech-to-text model powered by GPT-4o mini. + */ + @JsonProperty("gpt-4o-mini-transcribe") + GPT_4O_MINI_TRANSCRIBE("gpt-4o-mini-transcribe"), + /** + * General-purpose speech recognition model. + */ + @JsonProperty("whisper-1") + WHISPER_1("whisper-1"); + + public final String value; + + TranscriptionModels(String value) { + this.value = value; + } + + public String getValue() { + return this.value; + } + + @Override + public String getName() { + return this.value; + } + + } + /** + * The format of the transcript and translation outputs, in one of these options: + * json, text, srt, verbose_json, or vtt. Defaults to json. 
@@ -437,7 +479,7 @@ public String getValue() { */ public static class Builder { - private String model = TtsModel.TTS_1.getValue(); + private String model = TtsModel.GPT_4_O_MINI_TTS.getValue(); private String input; @@ -556,7 +598,7 @@ public static class Builder { private String fileName; - private String model = WhisperModel.WHISPER_1.getValue(); + private String model = TranscriptionModels.WHISPER_1.getValue(); private String language; @@ -659,7 +701,7 @@ public static class Builder { private String fileName; - private String model = WhisperModel.WHISPER_1.getValue(); + private String model = TranscriptionModels.WHISPER_1.getValue(); private String prompt; diff --git a/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/api/OpenAiAudioApiIT.java b/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/api/OpenAiAudioApiIT.java index 6533d15de56..0d67d20aa45 100644 --- a/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/api/OpenAiAudioApiIT.java +++ b/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/api/OpenAiAudioApiIT.java @@ -27,10 +27,10 @@ import org.springframework.ai.openai.api.OpenAiAudioApi.SpeechRequest; import org.springframework.ai.openai.api.OpenAiAudioApi.SpeechRequest.Voice; import org.springframework.ai.openai.api.OpenAiAudioApi.StructuredResponse; +import org.springframework.ai.openai.api.OpenAiAudioApi.TranscriptionModels; import org.springframework.ai.openai.api.OpenAiAudioApi.TranscriptionRequest; import org.springframework.ai.openai.api.OpenAiAudioApi.TranslationRequest; import org.springframework.ai.openai.api.OpenAiAudioApi.TtsModel; -import org.springframework.ai.openai.api.OpenAiAudioApi.WhisperModel; import org.springframework.util.FileCopyUtils; import static org.assertj.core.api.Assertions.assertThat; @@ -52,7 +52,7 @@ void speechTranscriptionAndTranslation() throws IOException { byte[] speech = this.audioApi .createSpeech(SpeechRequest.builder() 
- .model(TtsModel.TTS_1_HD.getValue()) + .model(TtsModel.GPT_4_O_MINI_TTS.getValue()) .input("Hello, my name is Chris and I love Spring A.I.") .voice(Voice.ONYX.getValue()) .build()) @@ -64,7 +64,7 @@ void speechTranscriptionAndTranslation() throws IOException { StructuredResponse translation = this.audioApi .createTranslation(TranslationRequest.builder() - .model(WhisperModel.WHISPER_1.getValue()) + .model(TranscriptionModels.WHISPER_1.getValue()) .file(speech) .fileName("speech.mp3") .build(), StructuredResponse.class) @@ -74,7 +74,7 @@ void speechTranscriptionAndTranslation() throws IOException { StructuredResponse transcriptionEnglish = this.audioApi .createTranscription(TranscriptionRequest.builder() - .model(WhisperModel.WHISPER_1.getValue()) + .model(TranscriptionModels.WHISPER_1.getValue()) .file(speech) .fileName("speech.mp3") .build(), StructuredResponse.class) diff --git a/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/api/OpenAiAudioModelNoOpApiKeysIT.java b/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/api/OpenAiAudioModelNoOpApiKeysIT.java index 996035d09b7..dd1a1c3b9e5 100644 --- a/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/api/OpenAiAudioModelNoOpApiKeysIT.java +++ b/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/api/OpenAiAudioModelNoOpApiKeysIT.java @@ -44,7 +44,7 @@ public class OpenAiAudioModelNoOpApiKeysIT { void checkNoOpKey() { assertThatThrownBy(() -> this.audioApi .createSpeech(OpenAiAudioApi.SpeechRequest.builder() - .model(OpenAiAudioApi.TtsModel.TTS_1_HD.getValue()) + .model(OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.getValue()) .input("Hello, my name is Chris and I love Spring A.I.") .voice(OpenAiAudioApi.SpeechRequest.Voice.ONYX.getValue()) .build()) diff --git a/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/speech/OpenAiSpeechModelIT.java 
b/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/speech/OpenAiSpeechModelIT.java index 7f4a81d3e3c..67310709492 100644 --- a/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/speech/OpenAiSpeechModelIT.java +++ b/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/speech/OpenAiSpeechModelIT.java @@ -64,7 +64,7 @@ void shouldGenerateNonEmptyMp3AudioFromSpeechPrompt() { .voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue()) .speed(SPEED) .responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3) - .model(OpenAiAudioApi.TtsModel.TTS_1.value) + .model(OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.value) .build(); SpeechPrompt speechPrompt = new SpeechPrompt("Today is a wonderful day to build something people love!", speechOptions); @@ -100,7 +100,7 @@ void speechRateLimitTest() { .voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue()) .speed(SPEED) .responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3) - .model(OpenAiAudioApi.TtsModel.TTS_1.value) + .model(OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.value) .build(); SpeechPrompt speechPrompt = new SpeechPrompt("Today is a wonderful day to build something people love!", speechOptions); @@ -120,7 +120,7 @@ void shouldStreamNonEmptyResponsesForValidSpeechPrompts() { .voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue()) .speed(SPEED) .responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3) - .model(OpenAiAudioApi.TtsModel.TTS_1.value) + .model(OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.value) .build(); SpeechPrompt speechPrompt = new SpeechPrompt("Today is a wonderful day to build something people love!", @@ -142,7 +142,7 @@ void speechVoicesTest(String voice) { .voice(voice) .speed(SPEED) .responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3) - .model(OpenAiAudioApi.TtsModel.TTS_1.value) + .model(OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.value) .build(); SpeechPrompt speechPrompt = new 
SpeechPrompt("Today is a wonderful day to build something people love!", speechOptions); diff --git a/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/speech/OpenAiSpeechModelWithSpeechResponseMetadataTests.java b/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/speech/OpenAiSpeechModelWithSpeechResponseMetadataTests.java index 9d98b5d9b58..51df242073d 100644 --- a/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/speech/OpenAiSpeechModelWithSpeechResponseMetadataTests.java +++ b/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/speech/OpenAiSpeechModelWithSpeechResponseMetadataTests.java @@ -75,7 +75,7 @@ void aiResponseContainsImageResponseMetadata() { .voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue()) .speed(SPEED) .responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3) - .model(OpenAiAudioApi.TtsModel.TTS_1.value) + .model(OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.value) .build(); SpeechPrompt speechPrompt = new SpeechPrompt("Today is a wonderful day to build something people love!", diff --git a/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/audio/speech/openai-speech.adoc b/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/audio/speech/openai-speech.adoc index 6b6060c8f56..0c2758fd556 100644 --- a/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/audio/speech/openai-speech.adoc +++ b/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/audio/speech/openai-speech.adoc @@ -85,7 +85,7 @@ The prefix `spring.ai.openai.audio.speech` is used as the property prefix that l | spring.ai.openai.audio.speech.api-key | The API Key | - | spring.ai.openai.audio.speech.organization-id | Optionally you can specify which organization used for an API request. | - | spring.ai.openai.audio.speech.project-id | Optionally, you can specify which project is used for an API request. 
| - -| spring.ai.openai.audio.speech.options.model | ID of the model to use for generating the audio. For OpenAI's TTS API, use one of the available models: tts-1 or tts-1-hd. | tts-1 +| spring.ai.openai.audio.speech.options.model | ID of the model to use for generating the audio. Available models: `gpt-4o-mini-tts` (default, optimized for quality, speed, and cost), `tts-1` (legacy, optimized for speed), or `tts-1-hd` (legacy, optimized for quality). | gpt-4o-mini-tts | spring.ai.openai.audio.speech.options.voice | The voice to use for synthesis. For OpenAI's TTS API, One of the available voices for the chosen model: alloy, echo, fable, onyx, nova, and shimmer. | alloy | spring.ai.openai.audio.speech.options.response-format | The format of the audio output. Supported formats are mp3, opus, aac, flac, wav, and pcm. | mp3 | spring.ai.openai.audio.speech.options.speed | The speed of the voice synthesis. The acceptable range is from 0.25 (slowest) to 4.0 (fastest). | 1.0 @@ -107,7 +107,7 @@ For example: [source,java] ---- OpenAiAudioSpeechOptions speechOptions = OpenAiAudioSpeechOptions.builder() - .model("tts-1") + .model("gpt-4o-mini-tts") .voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY) .responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3) .speed(1.0f) @@ -153,7 +153,7 @@ var openAiAudioSpeechModel = new OpenAiAudioSpeechModel(openAiAudioApi); var speechOptions = OpenAiAudioSpeechOptions.builder() .responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3) .speed(1.0f) - .model(OpenAiAudioApi.TtsModel.TTS_1.value) + .model(OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.value) .build(); var speechPrompt = new SpeechPrompt("Hello, this is a text-to-speech example.", speechOptions); @@ -181,7 +181,7 @@ OpenAiAudioSpeechOptions speechOptions = OpenAiAudioSpeechOptions.builder() .voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY) .speed(1.0f) .responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3) - 
.model(OpenAiAudioApi.TtsModel.TTS_1.value) + .model(OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.value) .build(); SpeechPrompt speechPrompt = new SpeechPrompt("Today is a wonderful day to build something people love!", speechOptions); diff --git a/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/audio/transcriptions/openai-transcriptions.adoc b/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/audio/transcriptions/openai-transcriptions.adoc index f993e499de0..139794166c7 100644 --- a/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/audio/transcriptions/openai-transcriptions.adoc +++ b/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/audio/transcriptions/openai-transcriptions.adoc @@ -83,7 +83,7 @@ The prefix `spring.ai.openai.audio.transcription` is used as the property prefix | spring.ai.openai.audio.transcription.api-key | The API Key | - | spring.ai.openai.audio.transcription.organization-id | Optionally you can specify which organization used for an API request. | - | spring.ai.openai.audio.transcription.project-id | Optionally, you can specify which project is used for an API request. | - -| spring.ai.openai.audio.transcription.options.model | ID of the model to use. Only whisper-1 (which is powered by our open source Whisper V2 model) is currently available. | whisper-1 +| spring.ai.openai.audio.transcription.options.model | ID of the model to use for transcription. Available models: `gpt-4o-transcribe` (speech-to-text powered by GPT-4o), `gpt-4o-mini-transcribe` (speech-to-text powered by GPT-4o mini), or `whisper-1` (general-purpose speech recognition model, default). | whisper-1 | spring.ai.openai.audio.transcription.options.response-format | The format of the transcript output, in one of these options: json, text, srt, verbose_json, or vtt. | json | spring.ai.openai.audio.transcription.options.prompt | An optional text to guide the model's style or continue a previous audio segment. The prompt should match the audio language. 
| | spring.ai.openai.audio.transcription.options.language | The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency. |