Skip to content

Commit e5fb43c

Browse files
committed
feat: specify language for AudioToText
Signed-off-by: Lukas Schaefer <lukas@lschaefer.xyz>
1 parent 5b3a8ed commit e5fb43c

File tree

3 files changed: +34 -5 lines changed

lib/AppInfo/Application.php

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -79,6 +79,8 @@ class Application extends App implements IBootstrap {
7979
public const MODELS_CACHE_KEY = 'models';
8080
public const MODELS_CACHE_TTL = 60 * 30;
8181

82+
public const AUDIO_TO_TEXT_LANGUAGES = [['en', 'English'], ['zh', '中文'], ['de', 'Deutsch'], ['es', 'Español'], ['ru', 'Русский'], ['ko', '한국어'], ['fr', 'Français'], ['ja', '日本語'], ['pt', 'Português'], ['tr', 'Türkçe'], ['pl', 'Polski'], ['ca', 'Català'], ['nl', 'Nederlands'], ['ar', 'العربية'], ['sv', 'Svenska'], ['it', 'Italiano'], ['id', 'Bahasa Indonesia'], ['hi', 'हिन्दी'], ['fi', 'Suomi'], ['vi', 'Tiếng Việt'], ['he', 'עברית'], ['uk', 'Українська'], ['el', 'Ελληνικά'], ['ms', 'Bahasa Melayu'], ['cs', 'Česky'], ['ro', 'Română'], ['da', 'Dansk'], ['hu', 'Magyar'], ['ta', 'தமிழ்'], ['no', 'Norsk (bokmål / riksmål)'], ['th', 'ไทย / Phasa Thai'], ['ur', 'اردو'], ['hr', 'Hrvatski'], ['bg', 'Български'], ['lt', 'Lietuvių'], ['la', 'Latina'], ['mi', 'Māori'], ['ml', 'മലയാളം'], ['cy', 'Cymraeg'], ['sk', 'Slovenčina'], ['te', 'తెలుగు'], ['fa', 'فارسی'], ['lv', 'Latviešu'], ['bn', 'বাংলা'], ['sr', 'Српски'], ['az', 'Azərbaycanca / آذربايجان'], ['sl', 'Slovenščina'], ['kn', 'ಕನ್ನಡ'], ['et', 'Eesti'], ['mk', 'Македонски'], ['br', 'Brezhoneg'], ['eu', 'Euskara'], ['is', 'Íslenska'], ['hy', 'Հայերեն'], ['ne', 'नेपाली'], ['mn', 'Монгол'], ['bs', 'Bosanski'], ['kk', 'Қазақша'], ['sq', 'Shqip'], ['sw', 'Kiswahili'], ['gl', 'Galego'], ['mr', 'मराठी'], ['pa', 'ਪੰਜਾਬੀ / पंजाबी / پنجابي'], ['si', 'සිංහල'], ['km', 'ភាសាខ្មែរ'], ['sn', 'chiShona'], ['yo', 'Yorùbá'], ['so', 'Soomaaliga'], ['af', 'Afrikaans'], ['oc', 'Occitan'], ['ka', 'ქართული'], ['be', 'Беларуская'], ['tg', 'Тоҷикӣ'], ['sd', 'सिनधि'], ['gu', 'ગુજરાતી'], ['am', 'አማርኛ'], ['yi', 'ייִדיש'], ['lo', 'ລາວ / Pha xa lao'], ['uz', 'Ўзбек'], ['fo', 'Føroyskt'], ['ht', 'Krèyol ayisyen'], ['ps', 'پښتو'], ['tk', 'Туркмен / تركمن'], ['nn', 'Norsk (nynorsk)'], ['mt', 'bil-Malti'], ['sa', 'संस्कृतम्'], ['lb', 'Lëtzebuergesch'], ['my', 'Myanmasa'], ['bo', 'བོད་ཡིག / Bod skad'], ['tl', 'Tagalog'], ['mg', 'Malagasy'], ['as', 'অসমীয়া'], ['tt', 'Tatarça'], ['haw', 'ʻŌlelo Hawaiʻi'], ['ln', 'Lingála'], ['ha', 'هَوُسَ'], ['ba', 'Башҡорт'], 
['jw', 'ꦧꦱꦗꦮ'], ['su', 'Basa Sunda'], ['yue', '粤语']];
83+
8284
private IAppConfig $appConfig;
8385

8486
public function __construct(array $urlParams = []) {

lib/Service/OpenAiAPIService.php

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -683,6 +683,8 @@ public function transcribeBase64Mp3(
683683
* @param string|null $userId
684684
* @param File $file
685685
* @param bool $translate
686+
* @param string $model
687+
* @param string $language
686688
* @return string
687689
* @throws Exception
688690
*/
@@ -691,9 +693,10 @@ public function transcribeFile(
691693
File $file,
692694
bool $translate = false,
693695
string $model = Application::DEFAULT_MODEL_ID,
696+
string $language = 'detect_language',
694697
): string {
695698
try {
696-
$transcriptionResponse = $this->transcribe($userId, $file->getContent(), $translate, $model);
699+
$transcriptionResponse = $this->transcribe($userId, $file->getContent(), $translate, $model, $language);
697700
} catch (NotPermittedException|LockedException|GenericFileException $e) {
698701
$this->logger->warning('Could not read audio file: ' . $file->getPath() . '. Error: ' . $e->getMessage(), ['app' => Application::APP_ID]);
699702
throw new Exception($this->l10n->t('Could not read audio file.'), Http::STATUS_INTERNAL_SERVER_ERROR);
@@ -707,6 +710,7 @@ public function transcribeFile(
707710
* @param string $audioFileContent
708711
* @param bool $translate
709712
* @param string $model
713+
* @param string $language
710714
* @return string
711715
* @throws Exception
712716
*/
@@ -715,6 +719,7 @@ public function transcribe(
715719
string $audioFileContent,
716720
bool $translate = true,
717721
string $model = Application::DEFAULT_MODEL_ID,
722+
string $language = 'detect_language',
718723
): string {
719724
if ($this->isQuotaExceeded($userId, Application::QUOTA_TYPE_TRANSCRIPTION)) {
720725
throw new Exception($this->l10n->t('Audio transcription quota exceeded'), Http::STATUS_TOO_MANY_REQUESTS);
@@ -730,6 +735,9 @@ public function transcribe(
730735
'response_format' => 'verbose_json',
731736
// Verbose needed for extraction of audio duration
732737
];
738+
if ($language !== 'detect_language') {
739+
$params['language'] = $language;
740+
}
733741
$endpoint = $translate ? 'audio/translations' : 'audio/transcriptions';
734742
$contentType = 'multipart/form-data';
735743

lib/TaskProcessing/AudioToTextProvider.php

Lines changed: 23 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,12 @@
1414
use OCA\OpenAi\Service\OpenAiAPIService;
1515
use OCP\Files\File;
1616
use OCP\IAppConfig;
17+
use OCP\IL10N;
18+
use OCP\L10N\IFactory;
19+
use OCP\TaskProcessing\EShapeType;
1720
use OCP\TaskProcessing\ISynchronousProvider;
21+
use OCP\TaskProcessing\ShapeDescriptor;
22+
use OCP\TaskProcessing\ShapeEnumValue;
1823
use OCP\TaskProcessing\TaskTypes\AudioToText;
1924
use Psr\Log\LoggerInterface;
2025
use RuntimeException;
@@ -25,6 +30,8 @@ public function __construct(
2530
private OpenAiAPIService $openAiAPIService,
2631
private LoggerInterface $logger,
2732
private IAppConfig $appConfig,
33+
private IFactory $l10nFactory,
34+
private IL10N $l,
2835
) {
2936
}
3037

@@ -53,15 +60,23 @@ public function getInputShapeDefaults(): array {
5360
}
5461

5562
public function getOptionalInputShape(): array {
56-
return [];
63+
return ['language' => new ShapeDescriptor(
64+
$this->l->t('Language'),
65+
$this->l->t('The language of the audio file'),
66+
EShapeType::Enum
67+
)];
5768
}
5869

5970
public function getOptionalInputShapeEnumValues(): array {
60-
return [];
71+
$languageEnumValues = array_map(static function (array $language) {
72+
return new ShapeEnumValue($language[1], $language[0]);
73+
}, Application::AUDIO_TO_TEXT_LANGUAGES);
74+
$detectLanguageEnumValue = new ShapeEnumValue($this->l->t('Detect language'), 'detect_language');
75+
return ['language' => array_merge([$detectLanguageEnumValue], $languageEnumValues)];
6176
}
6277

6378
public function getOptionalInputShapeDefaults(): array {
64-
return [];
79+
return ['language' => 'detect_language'];
6580
}
6681

6782
public function getOutputShapeEnumValues(): array {
@@ -81,11 +96,15 @@ public function process(?string $userId, array $input, callable $reportProgress)
8196
throw new RuntimeException('Invalid input file');
8297
}
8398
$inputFile = $input['input'];
99+
$language = $input['language'] ?? 'detect_language';
100+
if (!is_string($language)) {
101+
throw new RuntimeException('Invalid language');
102+
}
84103

85104
$model = $this->appConfig->getValueString(Application::APP_ID, 'default_stt_model_id', Application::DEFAULT_MODEL_ID) ?: Application::DEFAULT_MODEL_ID;
86105

87106
try {
88-
$transcription = $this->openAiAPIService->transcribeFile($userId, $inputFile, false, $model);
107+
$transcription = $this->openAiAPIService->transcribeFile($userId, $inputFile, false, $model, $language);
89108
return ['output' => $transcription];
90109
} catch (Exception $e) {
91110
$this->logger->warning('OpenAI\'s Whisper transcription failed with: ' . $e->getMessage(), ['exception' => $e]);

0 commit comments

Comments
 (0)