Skip to content

Commit bdb7ea5

Browse files
apappascs and ericbottard
authored and committed
Add support for more OpenAI transcribe and TTS models
Reference: https://openai.com/index/introducing-our-next-generation-audio-models/
Signed-off-by: Alexandros Pappas <[email protected]>
Signed-off-by: Eric Bottard <[email protected]>
1 parent e0a7d74 commit bdb7ea5

File tree

11 files changed

+71
-25
lines changed

11 files changed

+71
-25
lines changed

auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiAudioSpeechProperties.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ public class OpenAiAudioSpeechProperties extends OpenAiParentProperties {
3636

3737
public static final String CONFIG_PREFIX = "spring.ai.openai.audio.speech";
3838

39-
public static final String DEFAULT_SPEECH_MODEL = OpenAiAudioApi.TtsModel.TTS_1.getValue();
39+
public static final String DEFAULT_SPEECH_MODEL = OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.getValue();
4040

4141
private static final Float SPEED = 1.0f;
4242

auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiAudioTranscriptionProperties.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ public class OpenAiAudioTranscriptionProperties extends OpenAiParentProperties {
2626

2727
public static final String CONFIG_PREFIX = "spring.ai.openai.audio.transcription";
2828

29-
public static final String DEFAULT_TRANSCRIPTION_MODEL = OpenAiAudioApi.WhisperModel.WHISPER_1.getValue();
29+
public static final String DEFAULT_TRANSCRIPTION_MODEL = OpenAiAudioApi.TranscriptionModels.WHISPER_1.getValue();
3030

3131
private static final Double DEFAULT_TEMPERATURE = 0.7;
3232

models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioSpeechModel.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ public class OpenAiAudioSpeechModel implements SpeechModel, StreamingSpeechModel
8080
public OpenAiAudioSpeechModel(OpenAiAudioApi audioApi) {
8181
this(audioApi,
8282
OpenAiAudioSpeechOptions.builder()
83-
.model(OpenAiAudioApi.TtsModel.TTS_1.getValue())
83+
.model(OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.getValue())
8484
.responseFormat(AudioResponseFormat.MP3)
8585
.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue())
8686
.speed(SPEED)

models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioTranscriptionModel.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ public class OpenAiAudioTranscriptionModel implements TranscriptionModel {
6363
public OpenAiAudioTranscriptionModel(OpenAiAudioApi audioApi) {
6464
this(audioApi,
6565
OpenAiAudioTranscriptionOptions.builder()
66-
.model(OpenAiAudioApi.WhisperModel.WHISPER_1.getValue())
66+
.model(OpenAiAudioApi.TranscriptionModels.WHISPER_1.getValue())
6767
.responseFormat(OpenAiAudioApi.TranscriptResponseFormat.JSON)
6868
.temperature(0.7f)
6969
.build());

models/spring-ai-openai/src/main/java/org/springframework/ai/openai/api/OpenAiAudioApi.java

Lines changed: 52 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
import reactor.core.publisher.Mono;
2828

2929
import org.springframework.ai.model.ApiKey;
30+
import org.springframework.ai.model.ChatModelDescription;
3031
import org.springframework.ai.model.NoopApiKey;
3132
import org.springframework.ai.model.SimpleApiKey;
3233
import org.springframework.ai.openai.api.common.OpenAiApiConstants;
@@ -50,6 +51,7 @@
5051
* @author Ilayaperumal Gopinathan
5152
* @author Jonghoon Park
5253
* @author Filip Hrisafov
54+
* @author Alexandros Pappas
5355
* @since 0.8.1
5456
*/
5557
public class OpenAiAudioApi {
@@ -224,18 +226,18 @@ public String getFilename() {
224226
* different model variants, tts-1 is optimized for real time text to speech use cases
225227
* and tts-1-hd is optimized for quality. These models can be used with the Speech
226228
* endpoint in the Audio API. Reference:
227-
* <a href="https://platform.openai.com/docs/models/tts">TTS</a>
229+
* <a href="https://platform.openai.com/docs/models#tts">TTS</a>
228230
*/
229231
public enum TtsModel {
230232

231233
// @formatter:off
232234
/**
233-
* The latest text to speech model, optimized for speed.
235+
* Text-to-speech model optimized for speed
234236
*/
235237
@JsonProperty("tts-1")
236238
TTS_1("tts-1"),
237239
/**
238-
* The latest text to speech model, optimized for quality.
240+
* Text-to-speech model optimized for quality.
239241
*/
240242
@JsonProperty("tts-1-hd")
241243
TTS_1_HD("tts-1-hd"),
@@ -265,7 +267,10 @@ public String getValue() {
265267
* recognition as well as speech translation and language identification. The Whisper
266268
* v2-large model is currently available through our API with the whisper-1 model
267269
* name.
270+
*
271+
* @deprecated See {@link TranscriptionModels#WHISPER_1}
268272
*/
273+
@Deprecated
269274
public enum WhisperModel {
270275

271276
// @formatter:off
@@ -285,6 +290,47 @@ public String getValue() {
285290

286291
}
287292

293+
/**
294+
* The available models for the transcriptions API. Reference:
295+
* <a href="https://platform.openai.com/docs/models#transcription">Transcription</a>
296+
*/
297+
public enum TranscriptionModels implements ChatModelDescription {
298+
299+
/**
300+
* Speech-to-text model powered by GPT-4o
301+
*/
302+
@JsonProperty("gpt-4o-transcribe")
303+
GPT_4O_TRANSCRIBE("gpt-4o-transcribe"),
304+
305+
/**
306+
* Speech-to-text model powered by GPT-4o mini
307+
*/
308+
@JsonProperty("gpt-4o-mini-transcribe")
309+
GPT_4O_MINI_TRANSCRIBE("gpt-4o-mini-transcribe"),
310+
311+
/**
312+
* General-purpose speech recognition model
313+
*/
314+
@JsonProperty("whisper-1")
315+
WHISPER_1("whisper-1");
316+
317+
public final String value;
318+
319+
TranscriptionModels(String value) {
320+
this.value = value;
321+
}
322+
323+
public String getValue() {
324+
return this.value;
325+
}
326+
327+
@Override
328+
public String getName() {
329+
return this.value;
330+
}
331+
332+
}
333+
288334
/**
289335
* The format of the transcript and translation outputs, in one of these options:
290336
* json, text, srt, verbose_json, or vtt. Defaults to json.
@@ -437,7 +483,7 @@ public String getValue() {
437483
*/
438484
public static final class Builder {
439485

440-
private String model = TtsModel.TTS_1.getValue();
486+
private String model = TtsModel.GPT_4_O_MINI_TTS.getValue();
441487

442488
private String input;
443489

@@ -556,7 +602,7 @@ public static final class Builder {
556602

557603
private String fileName;
558604

559-
private String model = WhisperModel.WHISPER_1.getValue();
605+
private String model = TranscriptionModels.WHISPER_1.getValue();
560606

561607
private String language;
562608

@@ -659,7 +705,7 @@ public static final class Builder {
659705

660706
private String fileName;
661707

662-
private String model = WhisperModel.WHISPER_1.getValue();
708+
private String model = TranscriptionModels.WHISPER_1.getValue();
663709

664710
private String prompt;
665711

models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/api/OpenAiAudioApiIT.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,10 +27,10 @@
2727
import org.springframework.ai.openai.api.OpenAiAudioApi.SpeechRequest;
2828
import org.springframework.ai.openai.api.OpenAiAudioApi.SpeechRequest.Voice;
2929
import org.springframework.ai.openai.api.OpenAiAudioApi.StructuredResponse;
30+
import org.springframework.ai.openai.api.OpenAiAudioApi.TranscriptionModels;
3031
import org.springframework.ai.openai.api.OpenAiAudioApi.TranscriptionRequest;
3132
import org.springframework.ai.openai.api.OpenAiAudioApi.TranslationRequest;
3233
import org.springframework.ai.openai.api.OpenAiAudioApi.TtsModel;
33-
import org.springframework.ai.openai.api.OpenAiAudioApi.WhisperModel;
3434
import org.springframework.util.FileCopyUtils;
3535

3636
import static org.assertj.core.api.Assertions.assertThat;
@@ -52,7 +52,7 @@ void speechTranscriptionAndTranslation() throws IOException {
5252

5353
byte[] speech = this.audioApi
5454
.createSpeech(SpeechRequest.builder()
55-
.model(TtsModel.TTS_1_HD.getValue())
55+
.model(TtsModel.GPT_4_O_MINI_TTS.getValue())
5656
.input("Hello, my name is Chris and I love Spring A.I.")
5757
.voice(Voice.ONYX.getValue())
5858
.build())
@@ -64,7 +64,7 @@ void speechTranscriptionAndTranslation() throws IOException {
6464

6565
StructuredResponse translation = this.audioApi
6666
.createTranslation(TranslationRequest.builder()
67-
.model(WhisperModel.WHISPER_1.getValue())
67+
.model(TranscriptionModels.WHISPER_1.getValue())
6868
.file(speech)
6969
.fileName("speech.mp3")
7070
.build(), StructuredResponse.class)
@@ -74,7 +74,7 @@ void speechTranscriptionAndTranslation() throws IOException {
7474

7575
StructuredResponse transcriptionEnglish = this.audioApi
7676
.createTranscription(TranscriptionRequest.builder()
77-
.model(WhisperModel.WHISPER_1.getValue())
77+
.model(TranscriptionModels.WHISPER_1.getValue())
7878
.file(speech)
7979
.fileName("speech.mp3")
8080
.build(), StructuredResponse.class)

models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/api/OpenAiAudioModelNoOpApiKeysIT.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ public class OpenAiAudioModelNoOpApiKeysIT {
4444
void checkNoOpKey() {
4545
assertThatThrownBy(() -> this.audioApi
4646
.createSpeech(OpenAiAudioApi.SpeechRequest.builder()
47-
.model(OpenAiAudioApi.TtsModel.TTS_1_HD.getValue())
47+
.model(OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.getValue())
4848
.input("Hello, my name is Chris and I love Spring A.I.")
4949
.voice(OpenAiAudioApi.SpeechRequest.Voice.ONYX.getValue())
5050
.build())

models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/speech/OpenAiSpeechModelIT.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ void shouldGenerateNonEmptyMp3AudioFromSpeechPrompt() {
6464
.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue())
6565
.speed(SPEED)
6666
.responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3)
67-
.model(OpenAiAudioApi.TtsModel.TTS_1.value)
67+
.model(OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.value)
6868
.build();
6969
SpeechPrompt speechPrompt = new SpeechPrompt("Today is a wonderful day to build something people love!",
7070
speechOptions);
@@ -100,7 +100,7 @@ void speechRateLimitTest() {
100100
.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue())
101101
.speed(SPEED)
102102
.responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3)
103-
.model(OpenAiAudioApi.TtsModel.TTS_1.value)
103+
.model(OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.value)
104104
.build();
105105
SpeechPrompt speechPrompt = new SpeechPrompt("Today is a wonderful day to build something people love!",
106106
speechOptions);
@@ -120,7 +120,7 @@ void shouldStreamNonEmptyResponsesForValidSpeechPrompts() {
120120
.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue())
121121
.speed(SPEED)
122122
.responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3)
123-
.model(OpenAiAudioApi.TtsModel.TTS_1.value)
123+
.model(OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.value)
124124
.build();
125125

126126
SpeechPrompt speechPrompt = new SpeechPrompt("Today is a wonderful day to build something people love!",
@@ -142,7 +142,7 @@ void speechVoicesTest(String voice) {
142142
.voice(voice)
143143
.speed(SPEED)
144144
.responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3)
145-
.model(OpenAiAudioApi.TtsModel.TTS_1.value)
145+
.model(OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.value)
146146
.build();
147147
SpeechPrompt speechPrompt = new SpeechPrompt("Today is a wonderful day to build something people love!",
148148
speechOptions);

models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/speech/OpenAiSpeechModelWithSpeechResponseMetadataTests.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ void aiResponseContainsImageResponseMetadata() {
7575
.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue())
7676
.speed(SPEED)
7777
.responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3)
78-
.model(OpenAiAudioApi.TtsModel.TTS_1.value)
78+
.model(OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.value)
7979
.build();
8080

8181
SpeechPrompt speechPrompt = new SpeechPrompt("Today is a wonderful day to build something people love!",

spring-ai-docs/src/main/antora/modules/ROOT/pages/api/audio/speech/openai-speech.adoc

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ The prefix `spring.ai.openai.audio.speech` is used as the property prefix that l
8585
| spring.ai.openai.audio.speech.api-key | The API Key | -
8686
| spring.ai.openai.audio.speech.organization-id | Optionally, you can specify which organization is used for an API request. | -
8787
| spring.ai.openai.audio.speech.project-id | Optionally, you can specify which project is used for an API request. | -
88-
| spring.ai.openai.audio.speech.options.model | ID of the model to use for generating the audio. For OpenAI's TTS API, use one of the available models: tts-1 or tts-1-hd. | tts-1
88+
| spring.ai.openai.audio.speech.options.model | ID of the model to use for generating the audio. Available models: `gpt-4o-mini-tts` (default, optimized for speed and cost), `gpt-4o-tts` (higher quality), `tts-1` (legacy, optimized for speed), or `tts-1-hd` (legacy, optimized for quality). | gpt-4o-mini-tts
8989
| spring.ai.openai.audio.speech.options.voice | The voice to use for synthesis. For OpenAI's TTS API, use one of the available voices for the chosen model: alloy, echo, fable, onyx, nova, and shimmer. | alloy
9090
| spring.ai.openai.audio.speech.options.response-format | The format of the audio output. Supported formats are mp3, opus, aac, flac, wav, and pcm. | mp3
9191
| spring.ai.openai.audio.speech.options.speed | The speed of the voice synthesis. The acceptable range is from 0.25 (slowest) to 4.0 (fastest). | 1.0
@@ -107,7 +107,7 @@ For example:
107107
[source,java]
108108
----
109109
OpenAiAudioSpeechOptions speechOptions = OpenAiAudioSpeechOptions.builder()
110-
.model("tts-1")
110+
.model("gpt-4o-mini-tts")
111111
.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY)
112112
.responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3)
113113
.speed(1.0f)
@@ -153,7 +153,7 @@ var openAiAudioSpeechModel = new OpenAiAudioSpeechModel(openAiAudioApi);
153153
var speechOptions = OpenAiAudioSpeechOptions.builder()
154154
.responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3)
155155
.speed(1.0f)
156-
.model(OpenAiAudioApi.TtsModel.TTS_1.value)
156+
.model(OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.value)
157157
.build();
158158
159159
var speechPrompt = new SpeechPrompt("Hello, this is a text-to-speech example.", speechOptions);
@@ -181,7 +181,7 @@ OpenAiAudioSpeechOptions speechOptions = OpenAiAudioSpeechOptions.builder()
181181
.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY)
182182
.speed(1.0f)
183183
.responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3)
184-
.model(OpenAiAudioApi.TtsModel.TTS_1.value)
184+
.model(OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.value)
185185
.build();
186186
187187
SpeechPrompt speechPrompt = new SpeechPrompt("Today is a wonderful day to build something people love!", speechOptions);

0 commit comments

Comments
 (0)