Skip to content

Commit 720d619

Browse files
committed
try to return the remote audio ID when using the chat endpoint with a multimodal model
Signed-off-by: Julien Veyssier <julien-nc@posteo.net>
1 parent e161242 commit 720d619

File tree

2 files changed

+61
-47
lines changed

2 files changed

+61
-47
lines changed

lib/TaskProcessing/AudioToAudioChatProvider.php

Lines changed: 60 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,13 @@ public function getOutputShapeEnumValues(): array {
127127
}
128128

129129
public function getOptionalOutputShape(): array {
130-
return [];
130+
return [
131+
'audio_id' => new ShapeDescriptor(
132+
$this->l->t('Remote audio ID'),
133+
$this->l->t('The ID of the audio response returned by the remote service'),
134+
EShapeType::Text
135+
),
136+
];
131137
}
132138

133139
public function getOptionalOutputShapeEnumValues(): array {
@@ -187,58 +193,21 @@ public function process(?string $userId, array $input, callable $reportProgress)
187193
$sttModel = $this->appConfig->getValueString(Application::APP_ID, 'default_stt_model_id', Application::DEFAULT_MODEL_ID) ?: Application::DEFAULT_MODEL_ID;
188194
$serviceName = $this->appConfig->getValueString(Application::APP_ID, 'service_name') ?: Application::APP_ID;
189195

190-
/////////////// Using the chat API if connected to OpenAI
196+
// Using the chat API if connected to OpenAI
191197
// Known issue: when the history mostly contains text, the model replies with text only, even if the audio modality is requested
192-
/*
193198
if ($this->openAiAPIService->isUsingOpenAi()) {
194199
return $this->oneStep($userId, $systemPrompt, $inputFile, $history, $outputVoice, $sttModel, $llmModel, $ttsModel, $speed, $serviceName);
195200
}
196-
*/
197-
198-
//////////////// 3 steps: STT -> LLM -> TTS
199-
// speech to text
200-
try {
201-
$inputTranscription = $this->openAiAPIService->transcribeFile($userId, $inputFile, false, $sttModel);
202-
} catch (Exception $e) {
203-
$this->logger->warning($serviceName . ' transcription failed with: ' . $e->getMessage(), ['exception' => $e]);
204-
throw new RuntimeException($serviceName . ' transcription failed with: ' . $e->getMessage());
205-
}
206-
207-
// free prompt
208-
try {
209-
$completion = $this->openAiAPIService->createChatCompletion($userId, $llmModel, $inputTranscription, $systemPrompt, $history, 1, 1000);
210-
$completion = $completion['messages'];
211-
} catch (Exception $e) {
212-
throw new RuntimeException($serviceName . ' chat completion request failed: ' . $e->getMessage());
213-
}
214-
if (count($completion) === 0) {
215-
throw new RuntimeException('No completion in ' . $serviceName . ' response.');
216-
}
217-
$llmResult = array_pop($completion);
218-
219-
// text to speech
220-
try {
221-
$apiResponse = $this->openAiAPIService->requestSpeechCreation($userId, $llmResult, $ttsModel, $outputVoice, $speed);
222201

223-
if (!isset($apiResponse['body'])) {
224-
$this->logger->warning($serviceName . ' text to speech generation failed: no speech returned');
225-
throw new RuntimeException($serviceName . ' text to speech generation failed: no speech returned');
226-
}
227-
return [
228-
'output' => $apiResponse['body'],
229-
'output_transcript' => $llmResult,
230-
'input_transcript' => $inputTranscription,
231-
];
232-
} catch (\Exception $e) {
233-
$this->logger->warning($serviceName . ' text to speech generation failed with: ' . $e->getMessage(), ['exception' => $e]);
234-
throw new RuntimeException($serviceName . ' text to speech generation failed with: ' . $e->getMessage());
235-
}
202+
// 3 steps: STT -> LLM -> TTS
203+
return $this->threeSteps($userId, $systemPrompt, $inputFile, $history, $outputVoice, $sttModel, $llmModel, $ttsModel, $speed, $serviceName);
236204
}
237205

238206
private function oneStep(
239207
?string $userId, string $systemPrompt, File $inputFile, array $history, string $outputVoice,
240-
string $sttModel, string $llmModel, string $ttsModel, float $speed, string $serviceName
208+
string $sttModel, string $llmModel, string $ttsModel, float $speed, string $serviceName,
241209
): array {
210+
$result = [];
242211
$b64Audio = base64_encode($inputFile->getContent());
243212
$extraParams = [
244213
'modalities' => ['text', 'audio'],
@@ -269,11 +238,12 @@ private function oneStep(
269238
} else {
270239
$output = base64_decode($message['audio']['data']);
271240
$textResponse = $message['audio']['transcript'];
241+
if (isset($message['audio']['id'])) {
242+
$result['audio_id'] = $message['audio']['id'];
243+
}
272244
}
273-
$result = [
274-
'output' => $output,
275-
'output_transcript' => $textResponse,
276-
];
245+
$result['output'] = $output;
246+
$result['output_transcript'] = $textResponse;
277247

278248
// we still want the input transcription
279249
try {
@@ -286,4 +256,47 @@ private function oneStep(
286256

287257
return $result;
288258
}
259+
260+
/**
 * Answer an audio prompt by chaining three separate requests:
 * speech to text (STT), chat completion (LLM), then text to speech (TTS).
 *
 * @param string|null $userId forwarded to every service call
 * @param string $systemPrompt forwarded to the chat completion request
 * @param File $inputFile audio file that gets transcribed
 * @param array $history forwarded to the chat completion request
 * @param string $outputVoice forwarded to the speech creation request
 * @param string $sttModel model used for the transcription
 * @param string $llmModel model used for the chat completion
 * @param string $ttsModel model used for the speech creation
 * @param float $speed forwarded to the speech creation request
 * @param string $serviceName prefix used in log and error messages
 * @return array with the keys 'output' (speech response body), 'output_transcript'
 *               (last chat completion message) and 'input_transcript' (transcription of the input file)
 * @throws RuntimeException if any of the three steps fails or yields no result;
 *                          the underlying exception, if any, is attached as "previous"
 */
private function threeSteps(
	?string $userId, string $systemPrompt, File $inputFile, array $history, string $outputVoice,
	string $sttModel, string $llmModel, string $ttsModel, float $speed, string $serviceName,
): array {
	// speech to text
	try {
		$inputTranscription = $this->openAiAPIService->transcribeFile($userId, $inputFile, false, $sttModel);
	} catch (\Exception $e) {
		$this->logger->warning($serviceName . ' transcription failed with: ' . $e->getMessage(), ['exception' => $e]);
		throw new RuntimeException($serviceName . ' transcription failed with: ' . $e->getMessage(), 0, $e);
	}

	// free prompt
	try {
		// NOTE(review): the trailing 1 and 1000 are presumably the number of completions
		// and the max token count — confirm against createChatCompletion
		$completion = $this->openAiAPIService->createChatCompletion($userId, $llmModel, $inputTranscription, $systemPrompt, $history, 1, 1000);
	} catch (\Exception $e) {
		// log this step like the two others so the failure is traceable
		$this->logger->warning($serviceName . ' chat completion request failed: ' . $e->getMessage(), ['exception' => $e]);
		throw new RuntimeException($serviceName . ' chat completion request failed: ' . $e->getMessage(), 0, $e);
	}
	// a missing or malformed 'messages' entry is treated like an empty completion
	// instead of letting count() fail with a TypeError
	$messages = $completion['messages'] ?? [];
	if (!is_array($messages) || $messages === []) {
		throw new RuntimeException('No completion in ' . $serviceName . ' response.');
	}
	$llmResult = array_pop($messages);

	// text to speech
	try {
		$apiResponse = $this->openAiAPIService->requestSpeechCreation($userId, $llmResult, $ttsModel, $outputVoice, $speed);
	} catch (\Exception $e) {
		$this->logger->warning($serviceName . ' text to speech generation failed with: ' . $e->getMessage(), ['exception' => $e]);
		throw new RuntimeException($serviceName . ' text to speech generation failed with: ' . $e->getMessage(), 0, $e);
	}

	// checked outside of the try block so this error is neither logged twice
	// nor wrapped a second time by the catch above
	if (!isset($apiResponse['body'])) {
		$this->logger->warning($serviceName . ' text to speech generation failed: no speech returned');
		throw new RuntimeException($serviceName . ' text to speech generation failed: no speech returned');
	}

	return [
		'output' => $apiResponse['body'],
		'output_transcript' => $llmResult,
		'input_transcript' => $inputTranscription,
	];
}
289302
}

psalm.xml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
<referencedClass name="OCP\TaskProcessing\EShapeType" />
4040
<referencedClass name="OCP\TaskProcessing\TaskTypes\TextToTextProofread" />
4141
<referencedClass name="OCP\TaskProcessing\TaskTypes\TextToTextChatWithTools" />
42+
<referencedClass name="OCP\TaskProcessing\TaskTypes\AudioToAudioChat" />
4243
</errorLevel>
4344
</UndefinedClass>
4445
<UndefinedDocblockClass>

0 commit comments

Comments
 (0)