@@ -127,7 +127,13 @@ public function getOutputShapeEnumValues(): array {
127127 }
128128
/**
 * Describe the optional output entries this provider can add to its result.
 *
 * Only one optional entry is declared: `audio_id`, a text value labelled
 * "Remote audio ID". Both the label and the description go through the
 * translation service held in `$this->l`.
 *
 * @return array<string, ShapeDescriptor> Shape descriptors indexed by output key
 */
public function getOptionalOutputShape(): array {
	$remoteAudioId = new ShapeDescriptor(
		$this->l->t('Remote audio ID'),
		$this->l->t('The ID of the audio response returned by the remote service'),
		EShapeType::Text
	);

	return ['audio_id' => $remoteAudioId];
}
132138
133139 public function getOptionalOutputShapeEnumValues (): array {
@@ -187,58 +193,21 @@ public function process(?string $userId, array $input, callable $reportProgress)
187193 $ sttModel = $ this ->appConfig ->getValueString (Application::APP_ID , 'default_stt_model_id ' , Application::DEFAULT_MODEL_ID ) ?: Application::DEFAULT_MODEL_ID ;
188194 $ serviceName = $ this ->appConfig ->getValueString (Application::APP_ID , 'service_name ' ) ?: Application::APP_ID ;
189195
190- /////////////// Using the chat API if connected to OpenAI
196+ // Using the chat API if connected to OpenAI
191197 // there is an issue if the history mostly contains text, the model will answer text even if we add the audio modality
192- /*
193198 if ($ this ->openAiAPIService ->isUsingOpenAi ()) {
194199 return $ this ->oneStep ($ userId , $ systemPrompt , $ inputFile , $ history , $ outputVoice , $ sttModel , $ llmModel , $ ttsModel , $ speed , $ serviceName );
195200 }
196- */
197-
198- //////////////// 3 steps: STT -> LLM -> TTS
199- // speech to text
200- try {
201- $ inputTranscription = $ this ->openAiAPIService ->transcribeFile ($ userId , $ inputFile , false , $ sttModel );
202- } catch (Exception $ e ) {
203- $ this ->logger ->warning ($ serviceName . ' transcription failed with: ' . $ e ->getMessage (), ['exception ' => $ e ]);
204- throw new RuntimeException ($ serviceName . ' transcription failed with: ' . $ e ->getMessage ());
205- }
206-
207- // free prompt
208- try {
209- $ completion = $ this ->openAiAPIService ->createChatCompletion ($ userId , $ llmModel , $ inputTranscription , $ systemPrompt , $ history , 1 , 1000 );
210- $ completion = $ completion ['messages ' ];
211- } catch (Exception $ e ) {
212- throw new RuntimeException ($ serviceName . ' chat completion request failed: ' . $ e ->getMessage ());
213- }
214- if (count ($ completion ) === 0 ) {
215- throw new RuntimeException ('No completion in ' . $ serviceName . ' response. ' );
216- }
217- $ llmResult = array_pop ($ completion );
218-
219- // text to speech
220- try {
221- $ apiResponse = $ this ->openAiAPIService ->requestSpeechCreation ($ userId , $ llmResult , $ ttsModel , $ outputVoice , $ speed );
222201
223- if (!isset ($ apiResponse ['body ' ])) {
224- $ this ->logger ->warning ($ serviceName . ' text to speech generation failed: no speech returned ' );
225- throw new RuntimeException ($ serviceName . ' text to speech generation failed: no speech returned ' );
226- }
227- return [
228- 'output ' => $ apiResponse ['body ' ],
229- 'output_transcript ' => $ llmResult ,
230- 'input_transcript ' => $ inputTranscription ,
231- ];
232- } catch (\Exception $ e ) {
233- $ this ->logger ->warning ($ serviceName . ' text to speech generation failed with: ' . $ e ->getMessage (), ['exception ' => $ e ]);
234- throw new RuntimeException ($ serviceName . ' text to speech generation failed with: ' . $ e ->getMessage ());
235- }
202+ // 3 steps: STT -> LLM -> TTS
203+ return $ this ->threeSteps ($ userId , $ systemPrompt , $ inputFile , $ history , $ outputVoice , $ sttModel , $ llmModel , $ ttsModel , $ speed , $ serviceName );
236204 }
237205
238206 private function oneStep (
239207 ?string $ userId , string $ systemPrompt , File $ inputFile , array $ history , string $ outputVoice ,
240- string $ sttModel , string $ llmModel , string $ ttsModel , float $ speed , string $ serviceName
208+ string $ sttModel , string $ llmModel , string $ ttsModel , float $ speed , string $ serviceName,
241209 ): array {
210+ $ result = [];
242211 $ b64Audio = base64_encode ($ inputFile ->getContent ());
243212 $ extraParams = [
244213 'modalities ' => ['text ' , 'audio ' ],
@@ -269,11 +238,12 @@ private function oneStep(
269238 } else {
270239 $ output = base64_decode ($ message ['audio ' ]['data ' ]);
271240 $ textResponse = $ message ['audio ' ]['transcript ' ];
241+ if (isset ($ message ['audio ' ]['id ' ])) {
242+ $ result ['audio_id ' ] = $ message ['audio ' ]['id ' ];
243+ }
272244 }
273- $ result = [
274- 'output ' => $ output ,
275- 'output_transcript ' => $ textResponse ,
276- ];
245+ $ result ['output ' ] = $ output ;
246+ $ result ['output_transcript ' ] = $ textResponse ;
277247
278248 // we still want the input transcription
279249 try {
@@ -286,4 +256,47 @@ private function oneStep(
286256
287257 return $ result ;
288258 }
259+
/**
 * Answer an audio message with three sequential remote calls:
 * speech-to-text, chat completion, then text-to-speech.
 *
 * Every failure surfaces as a RuntimeException whose message is prefixed with
 * $serviceName; the underlying exception (when there is one) is chained as
 * "previous" so the root cause stays visible in stack traces.
 *
 * @param string|null $userId User on whose behalf the three remote calls are made
 * @param string $systemPrompt System prompt forwarded to the chat completion call
 * @param File $inputFile Audio file handed to the transcription call
 * @param array $history Conversation history forwarded to the chat completion call
 * @param string $outputVoice Voice forwarded to the speech creation call
 * @param string $sttModel Model ID used for the transcription
 * @param string $llmModel Model ID used for the chat completion
 * @param string $ttsModel Model ID used for the speech creation
 * @param float $speed Speed forwarded to the speech creation call
 * @param string $serviceName Service label used as prefix in log and error messages
 * @return array Keys: 'output' (body of the speech response), 'output_transcript'
 *               (last chat completion message), 'input_transcript' (transcription of the input)
 * @throws RuntimeException If any of the three steps fails or yields nothing usable
 */
private function threeSteps(
	?string $userId, string $systemPrompt, File $inputFile, array $history, string $outputVoice,
	string $sttModel, string $llmModel, string $ttsModel, float $speed, string $serviceName,
): array {
	// speech to text
	try {
		$inputTranscription = $this->openAiAPIService->transcribeFile($userId, $inputFile, false, $sttModel);
	} catch (Exception $e) {
		$this->logger->warning($serviceName . ' transcription failed with: ' . $e->getMessage(), ['exception' => $e]);
		throw new RuntimeException($serviceName . ' transcription failed with: ' . $e->getMessage(), 0, $e);
	}

	// free prompt
	try {
		$completion = $this->openAiAPIService->createChatCompletion($userId, $llmModel, $inputTranscription, $systemPrompt, $history, 1, 1000);
		// NOTE(review): assumes the service returns its messages under the 'messages' key;
		// a missing key is treated like an empty completion instead of crashing in count().
		$completion = $completion['messages'] ?? [];
	} catch (Exception $e) {
		throw new RuntimeException($serviceName . ' chat completion request failed: ' . $e->getMessage(), 0, $e);
	}
	if (count($completion) === 0) {
		throw new RuntimeException('No completion in ' . $serviceName . ' response.');
	}
	// the last message of the completion is the one that gets spoken
	$llmResult = array_pop($completion);

	// text to speech
	try {
		$apiResponse = $this->openAiAPIService->requestSpeechCreation($userId, $llmResult, $ttsModel, $outputVoice, $speed);
	} catch (\Exception $e) {
		$this->logger->warning($serviceName . ' text to speech generation failed with: ' . $e->getMessage(), ['exception' => $e]);
		throw new RuntimeException($serviceName . ' text to speech generation failed with: ' . $e->getMessage(), 0, $e);
	}

	// Checked outside the try block on purpose: thrown inside it, this exception
	// would be caught by the handler above, logged a second time and re-wrapped.
	if (!isset($apiResponse['body'])) {
		$this->logger->warning($serviceName . ' text to speech generation failed: no speech returned');
		throw new RuntimeException($serviceName . ' text to speech generation failed: no speech returned');
	}

	return [
		'output' => $apiResponse['body'],
		'output_transcript' => $llmResult,
		'input_transcript' => $inputTranscription,
	];
}
289302}
0 commit comments