Skip to content

Commit f3a6a09

Browse files
committed
feat(audio-translation): add audio translation task type and provider, factorize translation logic in a service
use the correct user language for text translations happening in the task Signed-off-by: Julien Veyssier <julien-nc@posteo.net>
1 parent dfd4567 commit f3a6a09

File tree

7 files changed

+432
-119
lines changed

7 files changed

+432
-119
lines changed

composer.lock

Lines changed: 4 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

lib/AppInfo/Application.php

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111
use OCA\OpenAi\Notification\Notifier;
1212
use OCA\OpenAi\OldProcessing\Translation\TranslationProvider as OldTranslationProvider;
1313
use OCA\OpenAi\TaskProcessing\AudioToAudioChatProvider;
14+
use OCA\OpenAi\TaskProcessing\AudioToAudioTranslateProvider;
15+
use OCA\OpenAi\TaskProcessing\AudioToAudioTranslateTaskType;
1416
use OCA\OpenAi\TaskProcessing\AudioToTextProvider;
1517
use OCA\OpenAi\TaskProcessing\ChangeToneProvider;
1618
use OCA\OpenAi\TaskProcessing\ChangeToneTaskType;
@@ -104,6 +106,8 @@ public function register(IRegistrationContext $context): void {
104106
// Task processing
105107
if ($this->appConfig->getValueString(Application::APP_ID, 'translation_provider_enabled', '1') === '1') {
106108
$context->registerTaskProcessingProvider(TranslateProvider::class);
109+
$context->registerTaskProcessingTaskType(AudioToAudioTranslateTaskType::class);
110+
$context->registerTaskProcessingProvider(AudioToAudioTranslateProvider::class);
107111
}
108112
if ($this->appConfig->getValueString(Application::APP_ID, 'stt_provider_enabled', '1') === '1') {
109113
$context->registerTaskProcessingProvider(AudioToTextProvider::class);

lib/Service/TranslationService.php

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
<?php
2+
3+
/**
4+
* SPDX-FileCopyrightText: 2026 Nextcloud GmbH and Nextcloud contributors
5+
* SPDX-License-Identifier: AGPL-3.0-or-later
6+
*/
7+
8+
namespace OCA\OpenAi\Service;
9+
10+
use OCP\ICacheFactory;
11+
use OCP\L10N\IFactory;
12+
use Psr\Log\LoggerInterface;
13+
14+
/**
 * Translates text through the configured completion backend, one chunk at a time,
 * and stores every successfully translated chunk in a distributed cache.
 */
class TranslationService {
	/** Prompt telling the model to answer with a single JSON object holding the translation. */
	public const SYSTEM_PROMPT = 'You are a translations expert that ONLY outputs a valid JSON with the translated text in the following format: { "translation": "<translated text>" } .';
	/** Extra request parameters passed to the chat completion call to request a JSON-schema shaped answer. */
	public const JSON_RESPONSE_FORMAT = [
		'response_format' => [
			'type' => 'json_schema',
			'json_schema' => [
				'name' => 'TranslationResponse',
				'description' => 'A JSON object containing the translated text',
				'strict' => true,
				'schema' => [
					'type' => 'object',
					'properties' => [
						'translation' => [
							'type' => 'string',
							'description' => 'The translated text',
						],
					],
					'required' => [ 'translation' ],
					'additionalProperties' => false,
				],
			],
		],
	];

	public function __construct(
		private OpenAiSettingsService $openAiSettingsService,
		private LoggerInterface $logger,
		private OpenAiAPIService $openAiAPIService,
		private ChunkService $chunkService,
		private ICacheFactory $cacheFactory,
		private IFactory $l10nFactory,
	) {
	}

	/**
	 * Build a "language code => language name" map from the common and other languages
	 * reported by the l10n factory.
	 *
	 * @return array<string, string>
	 */
	private function getCoreLanguagesByCode(): array {
		$coreL = $this->l10nFactory->getLanguages();
		// array_column() returns [] for an empty list; array_reduce() without an initial
		// value would return null and break the declared array return type.
		return array_column(
			array_merge($coreL['commonLanguages'], $coreL['otherLanguages']),
			'name',
			'code',
		);
	}

	/**
	 * Translate a text chunk by chunk and return the concatenated translation.
	 *
	 * Chunks whose response is empty or not the expected JSON are logged and left out of the
	 * result (best effort), so the returned string may be partial or empty.
	 *
	 * @param string $inputText Text to translate
	 * @param string $sourceLanguageCode Source language code, or 'detect_language' to omit the source language from the prompt
	 * @param string $targetLanguageCode Target language code; used verbatim in the prompt when no language name is known for it
	 * @param string $model Model identifier forwarded to the completion API
	 * @param int $maxTokens Forwarded to the chunk splitter and to the completion API
	 * @param string|null $userId User on whose behalf the requests are made
	 * @param callable|null $reportProgress Optional callback receiving the fraction (0..1] of chunks handled so far
	 * @return string The concatenated translated chunks
	 */
	public function translate(
		string $inputText, string $sourceLanguageCode, string $targetLanguageCode, string $model, int $maxTokens,
		?string $userId, ?callable $reportProgress = null,
	): string {
		$chunks = $this->chunkService->chunkSplitPrompt($inputText, true, $maxTokens);
		$chunkCount = count($chunks);
		if ($chunkCount === 0) {
			// Nothing to translate; returning early also avoids a division by zero below.
			return '';
		}

		$coreLanguages = $this->getCoreLanguagesByCode();
		$toLanguage = $coreLanguages[$targetLanguageCode] ?? $targetLanguageCode;
		if ($sourceLanguageCode !== 'detect_language') {
			$fromLanguage = $coreLanguages[$sourceLanguageCode] ?? $sourceLanguageCode;
			$promptStart = 'Translate the following text from ' . $fromLanguage . ' to ' . $toLanguage . ': ';
		} else {
			$promptStart = 'Translate the following text to ' . $toLanguage . ': ';
		}

		// The cache handle does not depend on the chunk, so it is created once.
		$cache = $this->cacheFactory->createDistributed('integration_openai');
		$translation = '';
		$processed = 0;

		foreach ($chunks as $chunk) {
			$processed++;
			// Derived from the counter rather than accumulated, so the last chunk reports exactly 1.0.
			$progress = (float)$processed / $chunkCount;
			$cacheKey = $sourceLanguageCode . '/' . $targetLanguageCode . '/' . md5($chunk);

			$cached = $cache->get($cacheKey);
			if (is_string($cached) && $cached !== '') {
				$this->logger->debug('Using cached translation', ['cached' => $cached, 'cacheKey' => $cacheKey]);
				$translation .= $cached;
				if ($reportProgress !== null) {
					$reportProgress($progress);
				}
				continue;
			}

			$prompt = $promptStart . PHP_EOL . PHP_EOL . $chunk;
			if ($this->openAiAPIService->isUsingOpenAi() || $this->openAiSettingsService->getChatEndpointEnabled()) {
				$completionsObj = $this->openAiAPIService->createChatCompletion(
					$userId, $model, $prompt, self::SYSTEM_PROMPT, null, 1, $maxTokens, self::JSON_RESPONSE_FORMAT
				);
				$completions = $completionsObj['messages'];
			} else {
				// Plain completion call: the system prompt is appended to the prompt text instead.
				$completions = $this->openAiAPIService->createCompletion(
					$userId, $prompt . PHP_EOL . self::SYSTEM_PROMPT . PHP_EOL . PHP_EOL, 1, $model, $maxTokens
				);
			}

			if ($reportProgress !== null) {
				$reportProgress($progress);
			}

			if (count($completions) === 0) {
				$this->logger->error('Empty translation response received for chunk');
				continue;
			}

			$completion = array_pop($completions);
			$decodedCompletion = json_decode($completion, true);
			$translatedChunk = is_array($decodedCompletion) ? ($decodedCompletion['translation'] ?? null) : null;
			// Strict comparison instead of empty(): a translation of "0" is valid output.
			if (!is_string($translatedChunk) || $translatedChunk === '') {
				$this->logger->error('Invalid translation response received for chunk', ['response' => $completion]);
				continue;
			}

			$translation .= $translatedChunk;
			$cache->set($cacheKey, $translatedChunk);
		}

		return $translation;
	}
}
lib/TaskProcessing/AudioToAudioTranslateProvider.php

Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,195 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
/**
6+
* SPDX-FileCopyrightText: 2026 Nextcloud GmbH and Nextcloud contributors
7+
* SPDX-License-Identifier: AGPL-3.0-or-later
8+
*/
9+
10+
namespace OCA\OpenAi\TaskProcessing;
11+
12+
use Exception;
13+
use OCA\OpenAi\AppInfo\Application;
14+
use OCA\OpenAi\Service\OpenAiAPIService;
15+
use OCA\OpenAi\Service\OpenAiSettingsService;
16+
use OCA\OpenAi\Service\TranslationService;
17+
use OCA\OpenAi\Service\WatermarkingService;
18+
use OCP\Files\File;
19+
use OCP\IAppConfig;
20+
use OCP\IL10N;
21+
use OCP\IUserManager;
22+
use OCP\L10N\IFactory;
23+
use OCP\TaskProcessing\Exception\ProcessingException;
24+
use OCP\TaskProcessing\ISynchronousWatermarkingProvider;
25+
use OCP\TaskProcessing\ShapeEnumValue;
26+
use Psr\Log\LoggerInterface;
27+
28+
/**
 * Task processing provider that translates spoken audio in three steps:
 * speech-to-text, text translation, then text-to-speech.
 */
class AudioToAudioTranslateProvider implements ISynchronousWatermarkingProvider {

	public function __construct(
		private OpenAiAPIService $openAiAPIService,
		private TranslationService $translationService,
		private OpenAiSettingsService $openAiSettingsService,
		private WatermarkingService $watermarkingService,
		private LoggerInterface $logger,
		private IFactory $l10nFactory,
		private IL10N $l,
		private IAppConfig $appConfig,
		private IUserManager $userManager,
	) {
	}

	public function getId(): string {
		return Application::APP_ID . '-audio2audio:translate';
	}

	public function getName(): string {
		return $this->openAiAPIService->getServiceName(Application::SERVICE_TYPE_STT);
	}

	public function getTaskTypeId(): string {
		return AudioToAudioTranslateTaskType::ID;
	}

	public function getExpectedRuntime(): int {
		return 60;
	}

	/**
	 * Offer every language known to the l10n factory as origin and target language;
	 * the origin list additionally starts with a "Detect language" entry.
	 */
	public function getInputShapeEnumValues(): array {
		$coreL = $this->l10nFactory->getLanguages();
		$languages = array_merge($coreL['commonLanguages'], $coreL['otherLanguages']);
		$languageEnumValues = array_map(
			static fn (array $language) => new ShapeEnumValue($language['name'], $language['code']),
			$languages,
		);
		$detectLanguageEnumValue = new ShapeEnumValue($this->l->t('Detect language'), 'detect_language');
		return [
			'origin_language' => array_merge([$detectLanguageEnumValue], $languageEnumValues),
			'target_language' => $languageEnumValues,
		];
	}

	public function getInputShapeDefaults(): array {
		return [
			'origin_language' => 'detect_language',
		];
	}

	public function getOptionalInputShape(): array {
		return [];
	}

	public function getOptionalInputShapeEnumValues(): array {
		return [];
	}

	public function getOptionalInputShapeDefaults(): array {
		return [];
	}

	public function getOutputShapeEnumValues(): array {
		return [];
	}

	public function getOptionalOutputShape(): array {
		return [];
	}

	public function getOptionalOutputShapeEnumValues(): array {
		return [];
	}

	/**
	 * Transcribe the input audio, translate the transcription and synthesize the translation as speech.
	 *
	 * @param string|null $userId User on whose behalf the API requests are made
	 * @param array $input Expects 'input' (a readable File), 'origin_language' and 'target_language' (strings)
	 * @param callable $reportProgress Called with a progress fraction between 0 and 1
	 * @param bool $includeWatermark Whether to append the spoken AI notice and mark the generated audio
	 * @return array{audio_output: mixed, text_output: string}
	 * @throws ProcessingException If the input is invalid or any of the three steps fails
	 */
	public function process(?string $userId, array $input, callable $reportProgress, bool $includeWatermark = true): array {
		if (!isset($input['input']) || !$input['input'] instanceof File || !$input['input']->isReadable()) {
			throw new ProcessingException('Invalid input file');
		}
		if (!isset($input['origin_language']) || !is_string($input['origin_language'])) {
			throw new ProcessingException('Invalid origin_language input');
		}
		if (!isset($input['target_language']) || !is_string($input['target_language'])) {
			throw new ProcessingException('Invalid target_language input');
		}
		$originLanguage = $input['origin_language'];
		$targetLanguage = $input['target_language'];

		$transcription = $this->transcribe($userId, $input['input'], $originLanguage);
		$reportProgress(0.3);

		$translatedText = $this->translateText($userId, $transcription, $originLanguage, $targetLanguage, $reportProgress);
		$reportProgress(0.6);

		$translatedAudio = $this->synthesize($userId, $translatedText, $includeWatermark);
		$reportProgress(1.0);

		return [
			'audio_output' => $translatedAudio,
			'text_output' => $translatedText,
		];
	}

	/**
	 * Speech-to-text step: transcribe the file with the configured STT model.
	 *
	 * @throws ProcessingException If the transcription request fails or yields no text
	 */
	private function transcribe(?string $userId, File $inputFile, string $originLanguage): string {
		$sttModel = $this->appConfig->getValueString(Application::APP_ID, 'default_stt_model_id', Application::DEFAULT_MODEL_ID, lazy: true) ?: Application::DEFAULT_MODEL_ID;
		try {
			$transcription = $this->openAiAPIService->transcribeFile($userId, $inputFile, false, $sttModel, $originLanguage);
		} catch (Exception $e) {
			$this->logger->warning('Transcription failed with: ' . $e->getMessage(), ['exception' => $e]);
			throw new ProcessingException(
				'Transcription failed with: ' . $e->getMessage(),
				// Exception codes are not guaranteed to be integers (e.g. PDOException).
				(int)$e->getCode(),
				$e,
			);
		}

		if (trim($transcription) === '') {
			// Nothing was recognized: fail early instead of sending an empty prompt to the model.
			throw new ProcessingException('Transcription failed: no text was recognized in the audio');
		}
		return $transcription;
	}

	/**
	 * Translation step: translate the transcription with the configured completion model.
	 * Chunk-level progress from the translation service is mapped onto the 0.3..0.6 range.
	 *
	 * @throws ProcessingException If the translation fails or is empty
	 */
	private function translateText(
		?string $userId, string $text, string $originLanguage, string $targetLanguage, callable $reportProgress,
	): string {
		$completionModel = $this->openAiAPIService->isUsingOpenAi()
			? ($this->appConfig->getValueString(Application::APP_ID, 'default_completion_model_id', Application::DEFAULT_MODEL_ID, lazy: true) ?: Application::DEFAULT_MODEL_ID)
			: $this->appConfig->getValueString(Application::APP_ID, 'default_completion_model_id', lazy: true);
		$maxTokens = $this->openAiSettingsService->getMaxTokens();

		try {
			$translatedText = $this->translationService->translate(
				$text, $originLanguage, $targetLanguage, $completionModel, $maxTokens, $userId,
				static fn (float $chunkProgress) => $reportProgress(0.3 + 0.3 * $chunkProgress),
			);
		} catch (Exception $e) {
			throw new ProcessingException(
				"Failed to translate from {$originLanguage} to {$targetLanguage}: {$e->getMessage()}",
				(int)$e->getCode(),
				$e,
			);
		}

		// Checked outside the try block so this exception is not caught and wrapped a second time.
		// Strict comparison instead of empty(): "0" is a valid translation.
		if ($translatedText === '') {
			throw new ProcessingException("Empty translation result from {$originLanguage} to {$targetLanguage}");
		}
		return $translatedText;
	}

	/**
	 * Text-to-speech step: synthesize the translated text, optionally followed by a spoken AI notice,
	 * and optionally pass the resulting audio through the watermarking service.
	 *
	 * @return mixed The 'body' of the speech API response, or the watermarking service's output for it
	 * @throws ProcessingException If speech generation or watermarking fails
	 */
	private function synthesize(?string $userId, string $text, bool $includeWatermark) {
		$ttsPrompt = $text;
		if ($includeWatermark) {
			$ttsPrompt .= "\n\n" . $this->getUserL10n($userId)->t('This was generated using Artificial Intelligence.');
		}
		$ttsModel = $this->appConfig->getValueString(Application::APP_ID, 'default_speech_model_id', Application::DEFAULT_SPEECH_MODEL_ID, lazy: true) ?: Application::DEFAULT_SPEECH_MODEL_ID;
		$voice = $this->appConfig->getValueString(Application::APP_ID, 'default_speech_voice', Application::DEFAULT_SPEECH_VOICE, lazy: true) ?: Application::DEFAULT_SPEECH_VOICE;
		$speed = 1;

		try {
			$apiResponse = $this->openAiAPIService->requestSpeechCreation($userId, $ttsPrompt, $ttsModel, $voice, $speed);

			if (!isset($apiResponse['body'])) {
				$this->logger->warning('Text to speech generation failed: no speech returned');
				throw new ProcessingException('Text to speech generation failed: no speech returned');
			}
			return $includeWatermark ? $this->watermarkingService->markAudio($apiResponse['body']) : $apiResponse['body'];
		} catch (ProcessingException $e) {
			// Already a task-level error: rethrow unchanged rather than logging and wrapping it again.
			throw $e;
		} catch (Exception $e) {
			$this->logger->warning('Text to speech generation failed with: ' . $e->getMessage(), ['exception' => $e]);
			throw new ProcessingException(
				'Text to speech generation failed with: ' . $e->getMessage(),
				(int)$e->getCode(),
				$e,
			);
		}
	}

	/**
	 * Return the translator for the given user's language, or the injected one when there is no user.
	 */
	private function getUserL10n(?string $userId): IL10N {
		if ($userId === null) {
			return $this->l;
		}
		$user = $this->userManager->getExistingUser($userId);
		$lang = $this->l10nFactory->getUserLanguage($user);
		return $this->l10nFactory->get(Application::APP_ID, $lang);
	}
}

0 commit comments

Comments
 (0)