3 files changed: +20 −18

@@ -439,14 +439,11 @@
         'openai/whisper-medium.en',
         'openai/whisper-large',
         'openai/whisper-large-v2',
-
-        # TODO: add these models
-        # https://github.com/huggingface/transformers/issues/26043
-        # 'NbAiLab/nb-whisper-tiny-beta',
-        # 'NbAiLab/nb-whisper-base-beta',
-        # 'NbAiLab/nb-whisper-small-beta',
-        # 'NbAiLab/nb-whisper-medium-beta',
-        # 'NbAiLab/nb-whisper-large-beta',
+        'NbAiLab/nb-whisper-tiny-beta',
+        'NbAiLab/nb-whisper-base-beta',
+        'NbAiLab/nb-whisper-small-beta',
+        'NbAiLab/nb-whisper-medium-beta',
+        'NbAiLab/nb-whisper-large-beta',
     ],
     'xlm': [
         'xlm-clm-ende-1024',
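This hunk resolves the TODO by enabling the five NbAiLab Whisper checkpoints that were previously blocked (see the linked transformers issue #26043). As a quick sanity check, a sketch of loading one of the newly enabled checkpoints with the standard Python `transformers` Whisper classes; the choice of the tiny checkpoint here is illustrative and not part of this diff:

```python
# Sketch: confirm a newly enabled checkpoint loads with the standard
# Whisper classes from the Python `transformers` library (not part of this PR).
from transformers import WhisperForConditionalGeneration, WhisperProcessor

model_id = "NbAiLab/nb-whisper-tiny-beta"
processor = WhisperProcessor.from_pretrained(model_id)
model = WhisperForConditionalGeneration.from_pretrained(model_id)
print(model.config.model_type)  # "whisper"
```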
@@ -1229,19 +1229,18 @@ class ByteLevelPreTokenizer extends PreTokenizer {
      * @returns {string[]} An array of tokens.
      */
     pre_tokenize_text(text) {
+        // Add a leading space if the option is enabled
+        if (this.add_prefix_space && !text.startsWith(' ')) {
+            text = ' ' + text;
+        }
+
         // Split on whitespace and punctuation
         let tokens = this.use_regex ? (text.match(this.pattern) || []) : [text];
 
-        return tokens.map(token => {
-            if (this.add_prefix_space && !token.startsWith(' ')) {
-                token = ' ' + token;
-            }
-
-            // Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
-            token = Array.from(this.text_encoder.encode(token), byte => this.byte_encoder[byte]).join('');
-
-            return token;
-        });
+        // Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
+        return tokens.map(
+            token => Array.from(this.text_encoder.encode(token), byte => this.byte_encoder[byte]).join('')
+        );
     }
 }
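The fix moves the `add_prefix_space` handling from the per-token map to the input as a whole: previously, every token that did not already begin with a space was given one, injecting spurious spaces before punctuation and other mid-sequence pieces. A minimal Python sketch of the before/after difference, using a simplified stand-in for the real byte-level split regex (an assumption for illustration):

```python
import re

# Simplified stand-in for the byte-level split pattern (assumption; the real
# pre-tokenizer uses the GPT-2 byte-level regex).
pattern = re.compile(r"\w+|[^\w\s]+|\s+")
text = "Hello, world"

# Old behaviour: every token without a leading space was given one.
old_tokens = [t if t.startswith(" ") else " " + t for t in pattern.findall(text)]
print(old_tokens)  # [' Hello', ' ,', ' ', ' world'] -- spurious space before ','

# New behaviour: the prefix space is added once, to the whole text.
prefixed = text if text.startswith(" ") else " " + text
print(pattern.findall(prefixed))  # [' ', 'Hello', ',', ' ', 'world']
```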
@@ -128,6 +128,12 @@ def generate_tokenizer_tests():
             # means the model does not use a tokenizer (e.g., vision models)
             continue
 
+        try:
+            # Disable dropout, if the model allows it
+            tokenizer.backend_tokenizer.model.dropout = 0
+        except AttributeError:
+            pass
+
         tokenizer_results = []
 
         shared_texts = TOKENIZER_TEST_DATA["shared"]
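Pinning dropout to zero matters here because BPE dropout makes tokenization stochastic, so the generated expected outputs would differ from run to run and the tests would be irreproducible. A sketch of the same guard applied outside the test generator; the `gpt2` checkpoint is an illustrative assumption, not part of this diff:

```python
# Sketch: the same determinism guard on an arbitrary fast tokenizer. BPE models
# in the `tokenizers` backend expose a `dropout` attribute; other model types
# (e.g. WordPiece, Unigram) do not, hence the AttributeError fallback.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # illustrative BPE tokenizer
try:
    tokenizer.backend_tokenizer.model.dropout = 0  # make tokenization deterministic
except AttributeError:
    pass  # this tokenizer's model type has no dropout setting
```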