
Commit df07fca

feat(openai) - Support for audio output in OpenAI chat model
- Introduced new options for audio output modalities in ChatCompletionRequest
- Added AudioParameters configuration for voice and audio format selection
- Enhanced OpenAiChatModel to handle audio generation and embedding
- Updated AssistantMessage and Media classes to support audio media
- Added integration tests for audio output functionality
- Implemented support for text and audio multi-modal responses
- Updated Spring AI's chat model comparison table to clarify OpenAI's input/output modalities
- Added new configuration properties for audio output:
  * spring.ai.openai.chat.options.modalities
  * spring.ai.openai.chat.options.audio-parameters
- Extended documentation to explain audio output generation with the gpt-4o-audio-preview model
- Updated Spring Boot configuration metadata to support new audio-related properties
- Included auto-configuration integration test for chat model with audio response generation

Resolves #1841
1 parent 20ccbca commit df07fca
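
The new options surface through OpenAiChatOptions. The sketch below is a minimal usage example built from pieces added in this commit (withModalities, withAudio, AudioParameters with its Voice and AudioResponseFormat enums, GPT_4_O_AUDIO_PREVIEW); the surrounding calls and import locations (OpenAiChatOptions.builder(), withModel(), Prompt, ChatResponse, the openAiChatModel instance) are assumptions drawn from the existing Spring AI API and may differ slightly in your version.

import java.util.List;

import org.springframework.ai.chat.model.ChatResponse;
import org.springframework.ai.chat.prompt.Prompt;
import org.springframework.ai.openai.OpenAiChatOptions;
import org.springframework.ai.openai.api.OpenAiApi;
import org.springframework.ai.openai.api.OpenAiApi.ChatCompletionRequest.AudioParameters;

// Request both text and audio output from the gpt-4o-audio-preview model.
// Note: audio output is rejected for streaming requests (see OpenAiChatModel.stream below).
OpenAiChatOptions options = OpenAiChatOptions.builder()
		.withModel(OpenAiApi.ChatModel.GPT_4_O_AUDIO_PREVIEW.getValue())
		.withModalities(List.of("text", "audio"))
		.withAudio(new AudioParameters(AudioParameters.Voice.NOVA,
				AudioParameters.AudioResponseFormat.MP3))
		.build();

// openAiChatModel is an existing OpenAiChatModel bean/instance.
ChatResponse response = openAiChatModel.call(new Prompt("Tell me a short story.", options));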

14 files changed: +407 additions, −47 deletions

models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiChatModel.java

Lines changed: 44 additions & 7 deletions
@@ -64,6 +64,7 @@
 import org.springframework.ai.openai.api.OpenAiApi.ChatCompletion;
 import org.springframework.ai.openai.api.OpenAiApi.ChatCompletion.Choice;
 import org.springframework.ai.openai.api.OpenAiApi.ChatCompletionMessage;
+import org.springframework.ai.openai.api.OpenAiApi.ChatCompletionMessage.AudioOutput;
 import org.springframework.ai.openai.api.OpenAiApi.ChatCompletionMessage.ChatCompletionFunction;
 import org.springframework.ai.openai.api.OpenAiApi.ChatCompletionMessage.MediaContent;
 import org.springframework.ai.openai.api.OpenAiApi.ChatCompletionMessage.ToolCall;
@@ -72,6 +73,8 @@
 import org.springframework.ai.openai.metadata.OpenAiUsage;
 import org.springframework.ai.openai.metadata.support.OpenAiResponseHeaderExtractor;
 import org.springframework.ai.retry.RetryUtils;
+import org.springframework.core.io.ByteArrayResource;
+import org.springframework.core.io.Resource;
 import org.springframework.http.ResponseEntity;
 import org.springframework.retry.support.RetryTemplate;
 import org.springframework.util.Assert;
@@ -251,7 +254,7 @@ public ChatResponse call(Prompt prompt) {
 				"finishReason", choice.finishReason() != null ? choice.finishReason().name() : "",
 				"refusal", StringUtils.hasText(choice.message().refusal()) ? choice.message().refusal() : "");
 			// @formatter:on
-			return buildGeneration(choice, metadata);
+			return buildGeneration(choice, metadata, request);
 		}).toList();

 		// Non function calling.
@@ -282,6 +285,17 @@ public Flux<ChatResponse> stream(Prompt prompt) {
 		return Flux.deferContextual(contextView -> {
 			ChatCompletionRequest request = createRequest(prompt, true);

+			if (request.outputModalities() != null) {
+				if (request.outputModalities().stream().anyMatch(m -> m.equals("audio"))) {
+					logger.warn("Audio output is not supported for streaming requests. Removing audio output.");
+					throw new IllegalArgumentException("Audio output is not supported for streaming requests.");
+				}
+			}
+			if (request.audioParameters() != null) {
+				logger.warn("Audio parameters are not supported for streaming requests. Removing audio parameters.");
+				throw new IllegalArgumentException("Audio parameters are not supported for streaming requests.");
+			}
+
 			Flux<OpenAiApi.ChatCompletionChunk> completionChunks = this.openAiApi.chatCompletionStream(request,
 					getAdditionalHttpHeaders(prompt));

@@ -320,7 +334,7 @@ public Flux<ChatResponse> stream(Prompt prompt) {
 				"finishReason", choice.finishReason() != null ? choice.finishReason().name() : "",
 				"refusal", StringUtils.hasText(choice.message().refusal()) ? choice.message().refusal() : "");

-				return buildGeneration(choice, metadata);
+				return buildGeneration(choice, metadata, request);
 			}).toList();
 			// @formatter:on

@@ -367,7 +381,7 @@ private MultiValueMap<String, String> getAdditionalHttpHeaders(Prompt prompt) {
 			headers.entrySet().stream().collect(Collectors.toMap(Map.Entry::getKey, e -> List.of(e.getValue()))));
 	}

-	private Generation buildGeneration(Choice choice, Map<String, Object> metadata) {
+	private Generation buildGeneration(Choice choice, Map<String, Object> metadata, ChatCompletionRequest request) {
 		List<AssistantMessage.ToolCall> toolCalls = choice.message().toolCalls() == null ? List.of()
 				: choice.message()
 					.toolCalls()
@@ -376,10 +390,26 @@ private Generation buildGeneration(Choice choice, Map<String, Object> metadata)
 							toolCall.function().name(), toolCall.function().arguments()))
 					.toList();

-		var assistantMessage = new AssistantMessage(choice.message().content(), metadata, toolCalls);
 		String finishReason = (choice.finishReason() != null ? choice.finishReason().name() : "");
-		var generationMetadata = ChatGenerationMetadata.builder().finishReason(finishReason).build();
-		return new Generation(assistantMessage, generationMetadata);
+		var generationMetadataBuilder = ChatGenerationMetadata.builder().finishReason(finishReason);
+
+		List<Media> media = new ArrayList<>();
+		String textContent = choice.message().content();
+		var audioOutput = choice.message().audioOutput();
+		if (audioOutput != null) {
+			String mimeType = String.format("audio/%s", request.audioParameters().format().name().toLowerCase());
+			byte[] audioData = Base64.getDecoder().decode(audioOutput.data());
+			Resource resource = new ByteArrayResource(audioData);
+			media.add(new Media(MimeTypeUtils.parseMimeType(mimeType), resource, audioOutput.id()));
+			if (!StringUtils.hasText(textContent)) {
+				textContent = audioOutput.transcript();
+			}
+			generationMetadataBuilder.metadata("audioId", audioOutput.id());
+			generationMetadataBuilder.metadata("audioExpiresAt", audioOutput.expiresAt());
+		}
+
+		var assistantMessage = new AssistantMessage(textContent, metadata, toolCalls, media);
+		return new Generation(assistantMessage, generationMetadataBuilder.build());
 	}

 	private ChatResponseMetadata from(OpenAiApi.ChatCompletion result, RateLimit rateLimit) {
@@ -443,8 +473,15 @@ else if (message.getMessageType() == MessageType.ASSISTANT) {
 					return new ToolCall(toolCall.id(), toolCall.type(), function);
 				}).toList();
 			}
+			AudioOutput audioOutput = null;
+			if (!CollectionUtils.isEmpty(assistantMessage.getMedia())) {
+				Assert.isTrue(assistantMessage.getMedia().size() == 1,
+						"Only one media content is supported for assistant messages");
+				audioOutput = new AudioOutput(assistantMessage.getMedia().get(0).getId(), null, null, null);
+
+			}
 			return List.of(new ChatCompletionMessage(assistantMessage.getContent(),
-					ChatCompletionMessage.Role.ASSISTANT, null, null, toolCalls, null, null));
+					ChatCompletionMessage.Role.ASSISTANT, null, null, toolCalls, null, audioOutput));
 		}
 		else if (message.getMessageType() == MessageType.TOOL) {
 			ToolResponseMessage toolMessage = (ToolResponseMessage) message;
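
For reference, a sketch of how a caller might consume the audio that buildGeneration() now attaches to the AssistantMessage, continuing the usage sketch above. getResult().getOutput(), getContent(), and getMedia() are existing message API; the getDataAsByteArray() accessor on Media and the import locations are assumptions, so substitute whatever accessor your Media version exposes for the raw bytes. The "audioId" and "audioExpiresAt" metadata keys come straight from this diff.

import java.nio.file.Files;
import java.nio.file.Path;

import org.springframework.ai.chat.messages.AssistantMessage;
import org.springframework.ai.model.Media;

AssistantMessage output = response.getResult().getOutput();

// When only audio was requested, the text content falls back to the audio transcript (see buildGeneration).
String text = output.getContent();

if (!output.getMedia().isEmpty()) {
	Media audio = output.getMedia().get(0);          // MIME type follows the requested format, e.g. audio/mp3
	byte[] audioBytes = audio.getDataAsByteArray();  // assumed accessor for the decoded audio bytes
	Files.write(Path.of("story.mp3"), audioBytes);
}

// The generation metadata also carries "audioId" and "audioExpiresAt" for multi-turn follow-ups.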

models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiChatOptions.java

Lines changed: 53 additions & 2 deletions
@@ -33,6 +33,7 @@
 import org.springframework.ai.model.function.FunctionCallback;
 import org.springframework.ai.model.function.FunctionCallingOptions;
 import org.springframework.ai.openai.api.OpenAiApi;
+import org.springframework.ai.openai.api.OpenAiApi.ChatCompletionRequest.AudioParameters;
 import org.springframework.ai.openai.api.OpenAiApi.ChatCompletionRequest.StreamOptions;
 import org.springframework.ai.openai.api.OpenAiApi.ChatCompletionRequest.ToolChoiceBuilder;
 import org.springframework.ai.openai.api.ResponseFormat;
@@ -92,6 +93,27 @@ public class OpenAiChatOptions implements FunctionCallingOptions {
 	 * on the number of generated tokens across all of the choices. Keep n as 1 to minimize costs.
 	 */
 	private @JsonProperty("n") Integer n;
+
+	/**
+	 * Output types that you would like the model to generate for this request.
+	 * Most models are capable of generating text, which is the default.
+	 * The gpt-4o-audio-preview model can also be used to generate audio.
+	 * To request that this model generate both text and audio responses,
+	 * you can use: ["text", "audio"].
+	 * Note that the audio modality is only available for the gpt-4o-audio-preview model
+	 * and is not supported for streaming completions.
+	 */
+	private @JsonProperty("modalities") List<String> modalities;
+
+	/**
+	 * Audio parameters for the audio generation. Required when audio output is requested with
+	 * modalities: ["audio"]
+	 * Note: that the audio modality is only available for the gpt-4o-audio-preview model
+	 * and is not supported for streaming completions.
+	 */
+	private @JsonProperty("audio") AudioParameters audio;
+
 	/**
 	 * Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they
 	 * appear in the text so far, increasing the model's likelihood to talk about new topics.
@@ -206,6 +228,8 @@ public static OpenAiChatOptions fromOptions(OpenAiChatOptions fromOptions) {
 			.withMaxTokens(fromOptions.getMaxTokens())
 			.withMaxCompletionTokens(fromOptions.getMaxCompletionTokens())
 			.withN(fromOptions.getN())
+			.withModalities(fromOptions.getModalities())
+			.withAudio(fromOptions.getAudio())
 			.withPresencePenalty(fromOptions.getPresencePenalty())
 			.withResponseFormat(fromOptions.getResponseFormat())
 			.withStreamUsage(fromOptions.getStreamUsage())
@@ -300,6 +324,22 @@ public void setN(Integer n) {
 		this.n = n;
 	}

+	public List<String> getModalities() {
+		return modalities;
+	}
+
+	public void setModalities(List<String> modalities) {
+		this.modalities = modalities;
+	}
+
+	public AudioParameters getAudio() {
+		return audio;
+	}
+
+	public void setAudio(AudioParameters audio) {
+		this.audio = audio;
+	}
+
 	@Override
 	public Double getPresencePenalty() {
 		return this.presencePenalty;
@@ -465,7 +505,7 @@ public int hashCode() {
 				this.maxTokens, this.maxCompletionTokens, this.n, this.presencePenalty, this.responseFormat,
 				this.streamOptions, this.seed, this.stop, this.temperature, this.topP, this.tools, this.toolChoice,
 				this.user, this.parallelToolCalls, this.functionCallbacks, this.functions, this.httpHeaders,
-				this.proxyToolCalls, this.toolContext);
+				this.proxyToolCalls, this.toolContext, this.modalities, this.audio);
 	}

 	@Override
@@ -493,7 +533,8 @@ public boolean equals(Object o) {
 				&& Objects.equals(this.functions, other.functions)
 				&& Objects.equals(this.httpHeaders, other.httpHeaders)
 				&& Objects.equals(this.toolContext, other.toolContext)
-				&& Objects.equals(this.proxyToolCalls, other.proxyToolCalls);
+				&& Objects.equals(this.proxyToolCalls, other.proxyToolCalls)
+				&& Objects.equals(this.modalities, other.modalities) && Objects.equals(this.audio, other.audio);
 	}

 	@Override
@@ -558,6 +599,16 @@ public Builder withN(Integer n) {
 			return this;
 		}

+		public Builder withModalities(List<String> modalities) {
+			this.options.modalities = modalities;
+			return this;
+		}
+
+		public Builder withAudio(AudioParameters audio) {
+			this.options.audio = audio;
+			return this;
+		}
+
 		public Builder withPresencePenalty(Double presencePenalty) {
 			this.options.presencePenalty = presencePenalty;
 			return this;

models/spring-ai-openai/src/main/java/org/springframework/ai/openai/api/OpenAiApi.java

Lines changed: 28 additions & 28 deletions
@@ -839,10 +839,10 @@ public ChatCompletionRequest(List<ChatCompletionMessage> messages, String model,
 	 * @param model ID of the model to use.
 	 * @param audio Parameters for audio output. Required when audio output is requested with outputModalities: ["audio"].
 	 */
-	public ChatCompletionRequest(List<ChatCompletionMessage> messages, String model, AudioParameters audio) {
+	public ChatCompletionRequest(List<ChatCompletionMessage> messages, String model, AudioParameters audio, boolean stream) {
 		this(messages, model, null, null, null, null, null, null,
 				null, null, null, List.of(OutputModality.AUDIO, OutputModality.TEXT), audio, null, null,
-				null, null, null, false, null, null, null,
+				null, null, null, stream, null, null, null,
 				null, null, null, null);
 	}

@@ -938,34 +938,34 @@ public record AudioParameters(
 		 * Specifies the voice type.
 		 */
 		public enum Voice {
-			@JsonProperty("alloy")
-			ALLOY,
-			@JsonProperty("echo")
-			ECHO,
-			@JsonProperty("fable")
-			FABLE,
-			@JsonProperty("onyx")
-			ONYX,
-			@JsonProperty("nova")
-			NOVA,
-			@JsonProperty("shimmer")
-			SHIMMER
+			/** Alloy voice */
+			@JsonProperty("alloy") ALLOY,
+			/** Echo voice */
+			@JsonProperty("echo") ECHO,
+			/** Fable voice */
+			@JsonProperty("fable") FABLE,
+			/** Onyx voice */
+			@JsonProperty("onyx") ONYX,
+			/** Nova voice */
+			@JsonProperty("nova") NOVA,
+			/** Shimmer voice */
+			@JsonProperty("shimmer") SHIMMER
 		}

 		/**
 		 * Specifies the output audio format.
 		 */
 		public enum AudioResponseFormat {
-			@JsonProperty("mp3")
-			MP3,
-			@JsonProperty("flac")
-			FLAC,
-			@JsonProperty("opus")
-			OPUS,
-			@JsonProperty("pcm16")
-			PCM16,
-			@JsonProperty("wav")
-			WAV
+			/** MP3 format */
+			@JsonProperty("mp3") MP3,
+			/** FLAC format */
+			@JsonProperty("flac") FLAC,
+			/** OPUS format */
+			@JsonProperty("opus") OPUS,
+			/** PCM16 format */
+			@JsonProperty("pcm16") PCM16,
+			/** WAV format */
+			@JsonProperty("wav") WAV
 		}
 	}

@@ -1119,10 +1119,10 @@ public record InputAudio(// @formatter:off
 		@JsonProperty("format") Format format) {

 		public enum Format {
-			@JsonProperty("mp3")
-			MP3,
-			@JsonProperty("wav")
-			WAV
+			/** MP3 audio format */
+			@JsonProperty("mp3") MP3,
+			/** WAV audio format */
+			@JsonProperty("wav") WAV
 		} // @formatter:on
 	}


models/spring-ai-openai/src/test/java/org/springframework/ai/openai/api/OpenAiApiIT.java

Lines changed: 21 additions & 1 deletion
@@ -35,6 +35,7 @@
 import org.springframework.http.ResponseEntity;

 import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.assertThatThrownBy;

 /**
  * @author Christian Tzolov
@@ -105,7 +106,7 @@ void outputAudio() {
 				ChatCompletionRequest.AudioParameters.Voice.NOVA,
 				ChatCompletionRequest.AudioParameters.AudioResponseFormat.MP3);
 		ChatCompletionRequest chatCompletionRequest = new ChatCompletionRequest(List.of(chatCompletionMessage),
-				OpenAiApi.ChatModel.GPT_4_O_AUDIO_PREVIEW.getValue(), audioParameters);
+				OpenAiApi.ChatModel.GPT_4_O_AUDIO_PREVIEW.getValue(), audioParameters, false);
 		ResponseEntity<ChatCompletion> response = this.openAiApi.chatCompletionEntity(chatCompletionRequest);

 		assertThat(response).isNotNull();
@@ -119,4 +120,23 @@ void outputAudio() {
 			.containsIgnoringCase("leviosa");
 	}

+	@Test
+	void streamOutputAudio() {
+		ChatCompletionMessage chatCompletionMessage = new ChatCompletionMessage(
+				"What is the magic spell to make objects fly?", Role.USER);
+		ChatCompletionRequest.AudioParameters audioParameters = new ChatCompletionRequest.AudioParameters(
+				ChatCompletionRequest.AudioParameters.Voice.NOVA,
+				ChatCompletionRequest.AudioParameters.AudioResponseFormat.MP3);
+		ChatCompletionRequest chatCompletionRequest = new ChatCompletionRequest(List.of(chatCompletionMessage),
+				OpenAiApi.ChatModel.GPT_4_O_AUDIO_PREVIEW.getValue(), audioParameters, true);
+		// Flux<ChatCompletionChunk> response =
+		// this.openAiApi.chatCompletionStream(chatCompletionRequest);
+
+		// var responseList = response.collectList().block();
+
+		assertThatThrownBy(() -> this.openAiApi.chatCompletionStream(chatCompletionRequest).collectList().block())
+			.isInstanceOf(RuntimeException.class)
+			.hasMessageContaining("400 Bad Request from POST https://api.openai.com/v1/chat/completions");
+	}
+
 }
