Skip to content

Commit beb8574

Browse files
committed
feat: add OpenAI transcribe and TTS models
Signed-off-by: Alexandros Pappas <[email protected]>
1 parent 53a7af5 commit beb8574

File tree

9 files changed

+67
-23
lines changed

9 files changed

+67
-23
lines changed

auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiAudioSpeechProperties.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ public class OpenAiAudioSpeechProperties extends OpenAiParentProperties {
3636

3737
public static final String CONFIG_PREFIX = "spring.ai.openai.audio.speech";
3838

39-
public static final String DEFAULT_SPEECH_MODEL = OpenAiAudioApi.TtsModel.TTS_1.getValue();
39+
public static final String DEFAULT_SPEECH_MODEL = OpenAiAudioApi.TtsModel.GPT_4O_MINI_TTS.getValue();
4040

4141
private static final Float SPEED = 1.0f;
4242

auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiAudioTranscriptionProperties.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ public class OpenAiAudioTranscriptionProperties extends OpenAiParentProperties {
2626

2727
public static final String CONFIG_PREFIX = "spring.ai.openai.audio.transcription";
2828

29-
public static final String DEFAULT_TRANSCRIPTION_MODEL = OpenAiAudioApi.WhisperModel.WHISPER_1.getValue();
29+
public static final String DEFAULT_TRANSCRIPTION_MODEL = OpenAiAudioApi.TranscriptionModels.WHISPER_1.getValue();
3030

3131
private static final Double DEFAULT_TEMPERATURE = 0.7;
3232

models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioSpeechModel.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ public class OpenAiAudioSpeechModel implements SpeechModel, StreamingSpeechModel
8080
public OpenAiAudioSpeechModel(OpenAiAudioApi audioApi) {
8181
this(audioApi,
8282
OpenAiAudioSpeechOptions.builder()
83-
.model(OpenAiAudioApi.TtsModel.TTS_1.getValue())
83+
.model(OpenAiAudioApi.TtsModel.GPT_4O_MINI_TTS.getValue())
8484
.responseFormat(AudioResponseFormat.MP3)
8585
.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue())
8686
.speed(SPEED)

models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioTranscriptionModel.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ public class OpenAiAudioTranscriptionModel implements Model<AudioTranscriptionPr
6363
public OpenAiAudioTranscriptionModel(OpenAiAudioApi audioApi) {
6464
this(audioApi,
6565
OpenAiAudioTranscriptionOptions.builder()
66-
.model(OpenAiAudioApi.WhisperModel.WHISPER_1.getValue())
66+
.model(OpenAiAudioApi.TranscriptionModels.WHISPER_1.getValue())
6767
.responseFormat(OpenAiAudioApi.TranscriptResponseFormat.JSON)
6868
.temperature(0.7f)
6969
.build());

models/spring-ai-openai/src/main/java/org/springframework/ai/openai/api/OpenAiAudioApi.java

Lines changed: 53 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
import reactor.core.publisher.Mono;
2727

2828
import org.springframework.ai.model.ApiKey;
29+
import org.springframework.ai.model.ChatModelDescription;
2930
import org.springframework.ai.model.NoopApiKey;
3031
import org.springframework.ai.model.SimpleApiKey;
3132
import org.springframework.ai.openai.api.common.OpenAiApiConstants;
@@ -213,22 +214,25 @@ public String getFilename() {
213214
* different model variants, tts-1 is optimized for real time text to speech use cases
214215
* and tts-1-hd is optimized for quality. These models can be used with the Speech
215216
* endpoint in the Audio API. Reference:
216-
* <a href="https://platform.openai.com/docs/models/tts">TTS</a>
217+
* <a href="https://platform.openai.com/docs/models#tts">TTS</a>
217218
*/
218219
public enum TtsModel {
219220

220-
// @formatter:off
221221
/**
222-
* The latest text to speech model, optimized for speed.
222+
* Text-to-speech model optimized for speed.
223223
*/
224224
@JsonProperty("tts-1")
225225
TTS_1("tts-1"),
226226
/**
227-
* The latest text to speech model, optimized for quality.
227+
* Text-to-speech model optimized for quality.
228228
*/
229229
@JsonProperty("tts-1-hd")
230-
TTS_1_HD("tts-1-hd");
231-
// @formatter:on
230+
TTS_1_HD("tts-1-hd"),
231+
/**
232+
* Text-to-speech model powered by GPT-4o mini.
233+
*/
234+
@JsonProperty("gpt-4o-mini-tts")
235+
GPT_4O_MINI_TTS("gpt-4o-mini-tts");
232236

233237
public final String value;
234238

@@ -250,6 +254,7 @@ public String getValue() {
250254
* v2-large model is currently available through our API with the whisper-1 model
251255
* name.
*
* @deprecated use {@link TranscriptionModels} instead.
252256
*/
257+
@Deprecated
253258
public enum WhisperModel {
254259

255260
// @formatter:off
@@ -269,6 +274,45 @@ public String getValue() {
269274

270275
}
271276

277+
/**
278+
* The available models for the transcriptions API. Reference:
279+
* <a href="https://platform.openai.com/docs/models#transcription">Transcription</a>
280+
*/
281+
public enum TranscriptionModels implements ChatModelDescription {
282+
283+
/**
284+
* Speech-to-text model powered by GPT-4o.
285+
*/
286+
@JsonProperty("gpt-4o-transcribe")
287+
GPT_4O_TRANSCRIBE("gpt-4o-transcribe"),
288+
/**
289+
* Speech-to-text model powered by GPT-4o mini.
290+
*/
291+
@JsonProperty("gpt-4o-mini-transcribe")
292+
GPT_4O_MINI_TRANSCRIBE("gpt-4o-mini-transcribe"),
293+
/**
294+
* General-purpose speech recognition model.
295+
*/
296+
@JsonProperty("whisper-1")
297+
WHISPER_1("whisper-1");
298+
299+
public final String value;
300+
301+
TranscriptionModels(String value) {
302+
this.value = value;
303+
}
304+
305+
public String getValue() {
306+
return this.value;
307+
}
308+
309+
@Override
310+
public String getName() {
311+
return this.value;
312+
}
313+
314+
}
315+
272316
/**
273317
* The format of the transcript and translation outputs, in one of these options:
274318
* json, text, srt, verbose_json, or vtt. Defaults to json.
@@ -416,7 +460,7 @@ public String getValue() {
416460
*/
417461
public static class Builder {
418462

419-
private String model = TtsModel.TTS_1.getValue();
463+
private String model = TtsModel.GPT_4O_MINI_TTS.getValue();
420464

421465
private String input;
422466

@@ -531,7 +575,7 @@ public static class Builder {
531575

532576
private byte[] file;
533577

534-
private String model = WhisperModel.WHISPER_1.getValue();
578+
private String model = TranscriptionModels.WHISPER_1.getValue();
535579

536580
private String language;
537581

@@ -624,7 +668,7 @@ public static class Builder {
624668

625669
private byte[] file;
626670

627-
private String model = WhisperModel.WHISPER_1.getValue();
671+
private String model = TranscriptionModels.WHISPER_1.getValue();
628672

629673
private String prompt;
630674

models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/api/OpenAiAudioApiIT.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,8 @@
2929
import org.springframework.ai.openai.api.OpenAiAudioApi.StructuredResponse;
3030
import org.springframework.ai.openai.api.OpenAiAudioApi.TranscriptionRequest;
3131
import org.springframework.ai.openai.api.OpenAiAudioApi.TranslationRequest;
32+
import org.springframework.ai.openai.api.OpenAiAudioApi.TranscriptionModels;
3233
import org.springframework.ai.openai.api.OpenAiAudioApi.TtsModel;
33-
import org.springframework.ai.openai.api.OpenAiAudioApi.WhisperModel;
3434
import org.springframework.util.FileCopyUtils;
3535

3636
import static org.assertj.core.api.Assertions.assertThat;
@@ -52,7 +52,7 @@ void speechTranscriptionAndTranslation() throws IOException {
5252

5353
byte[] speech = this.audioApi
5454
.createSpeech(SpeechRequest.builder()
55-
.model(TtsModel.TTS_1_HD.getValue())
55+
.model(TtsModel.GPT_4O_MINI_TTS.getValue())
5656
.input("Hello, my name is Chris and I love Spring A.I.")
5757
.voice(Voice.ONYX.getValue())
5858
.build())
@@ -64,15 +64,15 @@ void speechTranscriptionAndTranslation() throws IOException {
6464

6565
StructuredResponse translation = this.audioApi
6666
.createTranslation(
67-
TranslationRequest.builder().model(WhisperModel.WHISPER_1.getValue()).file(speech).build(),
67+
TranslationRequest.builder().model(TranscriptionModels.WHISPER_1.getValue()).file(speech).build(),
6868
StructuredResponse.class)
6969
.getBody();
7070

7171
assertThat(translation.text().replaceAll(",", "")).isEqualTo("Hello my name is Chris and I love Spring AI.");
7272

7373
StructuredResponse transcriptionEnglish = this.audioApi
7474
.createTranscription(
75-
TranscriptionRequest.builder().model(WhisperModel.WHISPER_1.getValue()).file(speech).build(),
75+
TranscriptionRequest.builder().model(TranscriptionModels.WHISPER_1.getValue()).file(speech).build(),
7676
StructuredResponse.class)
7777
.getBody();
7878

models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/api/OpenAiAudioModelNoOpApiKeysIT.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ public class OpenAiAudioModelNoOpApiKeysIT {
4444
void checkNoOpKey() {
4545
assertThatThrownBy(() -> this.audioApi
4646
.createSpeech(OpenAiAudioApi.SpeechRequest.builder()
47-
.model(OpenAiAudioApi.TtsModel.TTS_1_HD.getValue())
47+
.model(OpenAiAudioApi.TtsModel.GPT_4O_MINI_TTS.getValue())
4848
.input("Hello, my name is Chris and I love Spring A.I.")
4949
.voice(OpenAiAudioApi.SpeechRequest.Voice.ONYX.getValue())
5050
.build())

models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/speech/OpenAiSpeechModelIT.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ void shouldGenerateNonEmptyMp3AudioFromSpeechPrompt() {
6464
.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue())
6565
.speed(SPEED)
6666
.responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3)
67-
.model(OpenAiAudioApi.TtsModel.TTS_1.value)
67+
.model(OpenAiAudioApi.TtsModel.GPT_4O_MINI_TTS.value)
6868
.build();
6969
SpeechPrompt speechPrompt = new SpeechPrompt("Today is a wonderful day to build something people love!",
7070
speechOptions);
@@ -100,7 +100,7 @@ void speechRateLimitTest() {
100100
.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue())
101101
.speed(SPEED)
102102
.responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3)
103-
.model(OpenAiAudioApi.TtsModel.TTS_1.value)
103+
.model(OpenAiAudioApi.TtsModel.GPT_4O_MINI_TTS.value)
104104
.build();
105105
SpeechPrompt speechPrompt = new SpeechPrompt("Today is a wonderful day to build something people love!",
106106
speechOptions);
@@ -120,7 +120,7 @@ void shouldStreamNonEmptyResponsesForValidSpeechPrompts() {
120120
.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue())
121121
.speed(SPEED)
122122
.responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3)
123-
.model(OpenAiAudioApi.TtsModel.TTS_1.value)
123+
.model(OpenAiAudioApi.TtsModel.GPT_4O_MINI_TTS.value)
124124
.build();
125125

126126
SpeechPrompt speechPrompt = new SpeechPrompt("Today is a wonderful day to build something people love!",
@@ -142,7 +142,7 @@ void speechVoicesTest(String voice) {
142142
.voice(voice)
143143
.speed(SPEED)
144144
.responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3)
145-
.model(OpenAiAudioApi.TtsModel.TTS_1.value)
145+
.model(OpenAiAudioApi.TtsModel.GPT_4O_MINI_TTS.value)
146146
.build();
147147
SpeechPrompt speechPrompt = new SpeechPrompt("Today is a wonderful day to build something people love!",
148148
speechOptions);

models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/speech/OpenAiSpeechModelWithSpeechResponseMetadataTests.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ void aiResponseContainsImageResponseMetadata() {
7575
.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue())
7676
.speed(SPEED)
7777
.responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3)
78-
.model(OpenAiAudioApi.TtsModel.TTS_1.value)
78+
.model(OpenAiAudioApi.TtsModel.GPT_4O_MINI_TTS.value)
7979
.build();
8080

8181
SpeechPrompt speechPrompt = new SpeechPrompt("Today is a wonderful day to build something people love!",

0 commit comments

Comments
 (0)