Skip to content

Commit 57c053a

Browse files
authored
Assume less about whisper vocab (OpenNMT#2000)
1 parent 4414cb7 commit 57c053a

File tree

1 file changed

+5
-2
lines changed

1 file changed

+5
-2
lines changed

src/models/whisper.cc

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -69,9 +69,12 @@ namespace ctranslate2 {
69 69
_no_speech_id = vocabulary.to_id("<|nospeech|>");
70 70
if (_no_speech_id == vocabulary.unk_id())
71 71
_no_speech_id = vocabulary.to_id("<|nocaptions|>");
72-
_is_multilingual = vocabulary.size() >= 51865;
72+
_is_multilingual = vocabulary.to_id("") != vocabulary.unk_id();
73 73
_n_mels = _encoder->input_size();
74-
_num_languages = vocabulary.size() - 51765 - (_is_multilingual ? 1 : 0);
74+
// vocab: text tokens..., <|endoftext|>, <|startoftranscript|>,
75+
// lang tokens..., <|translate|>, <|transcribe|>, <|startoflm|>,
76+
// <|startofprev|>, <|nospeech|>, <|notimestamps|>, time tokens...
77+
_num_languages = _no_speech_id - _sot_id - 5;
75 78
}
76 79

77 80
StorageView WhisperReplica::encode(StorageView features, const bool to_cpu) {

0 commit comments

Comments
 (0)