"""Model definitions and application constants."""

from collections import OrderedDict

# Single source of truth for the model catalog: one row per model/precision
# pair. Fields, in order:
#   display name, repo slug, precision,
#   tokens_per_second, optimal_batch_size, avg_vram_usage
# Row order is significant: it fixes the iteration order of WHISPER_MODELS,
# MODEL_NAMES and MODEL_PRECISIONS below.
_MODEL_SPECS = [
    ("Whisper large-v3 turbo", "whisper-large-v3-turbo", "float32", 155, 4, "3.2 GB"),
    ("Whisper large-v3 turbo", "whisper-large-v3-turbo", "bfloat16", 160, 4, "3.0 GB"),
    ("Whisper large-v3 turbo", "whisper-large-v3-turbo", "float16", 165, 4, "2.8 GB"),
    ("Distil Whisper large-v3", "distil-whisper-large-v3", "float32", 160, 4, "3.0 GB"),
    ("Distil Whisper large-v3", "distil-whisper-large-v3", "bfloat16", 160, 4, "3.0 GB"),
    ("Distil Whisper large-v3", "distil-whisper-large-v3", "float16", 160, 4, "3.0 GB"),
    ("Whisper large-v3", "whisper-large-v3", "float32", 85, 2, "5.5 GB"),
    ("Whisper large-v3", "whisper-large-v3", "bfloat16", 95, 3, "3.8 GB"),
    ("Whisper large-v3", "whisper-large-v3", "float16", 100, 3, "3.3 GB"),
    ("Distil Whisper medium.en", "distil-whisper-medium.en", "float32", 160, 4, "3.0 GB"),
    ("Distil Whisper medium.en", "distil-whisper-medium.en", "bfloat16", 160, 4, "3.0 GB"),
    ("Distil Whisper medium.en", "distil-whisper-medium.en", "float16", 160, 4, "3.0 GB"),
    ("Whisper medium", "whisper-medium", "float32", 125, 5, "2.8 GB"),
    ("Whisper medium", "whisper-medium", "bfloat16", 135, 6, "2.2 GB"),
    ("Whisper medium", "whisper-medium", "float16", 140, 6, "2.0 GB"),
    ("Whisper medium.en", "whisper-medium.en", "float32", 130, 6, "2.5 GB"),
    ("Whisper medium.en", "whisper-medium.en", "bfloat16", 140, 7, "2.0 GB"),
    ("Whisper medium.en", "whisper-medium.en", "float16", 145, 7, "1.8 GB"),
    ("Distil Whisper small.en", "distil-whisper-small.en", "float32", 160, 4, "3.0 GB"),
    ("Distil Whisper small.en", "distil-whisper-small.en", "bfloat16", 160, 4, "3.0 GB"),
    ("Distil Whisper small.en", "distil-whisper-small.en", "float16", 160, 4, "3.0 GB"),
    ("Whisper small", "whisper-small", "float32", 175, 12, "1.8 GB"),
    ("Whisper small", "whisper-small", "bfloat16", 185, 13, "1.4 GB"),
    ("Whisper small", "whisper-small", "float16", 190, 13, "1.3 GB"),
    ("Whisper small.en", "whisper-small.en", "float32", 180, 14, "1.5 GB"),
    ("Whisper small.en", "whisper-small.en", "bfloat16", 190, 15, "1.2 GB"),
    ("Whisper small.en", "whisper-small.en", "float16", 195, 15, "1.1 GB"),
    ("Whisper base", "whisper-base", "float32", 225, 20, "1.1 GB"),
    ("Whisper base", "whisper-base", "bfloat16", 235, 21, "0.9 GB"),
    ("Whisper base", "whisper-base", "float16", 240, 21, "0.85 GB"),
    ("Whisper base.en", "whisper-base.en", "float32", 230, 22, "1.0 GB"),
    ("Whisper base.en", "whisper-base.en", "bfloat16", 240, 23, "0.85 GB"),
    ("Whisper base.en", "whisper-base.en", "float16", 245, 23, "0.8 GB"),
    ("Whisper tiny", "whisper-tiny", "float32", 275, 28, "0.75 GB"),
    ("Whisper tiny", "whisper-tiny", "bfloat16", 285, 29, "0.65 GB"),
    ("Whisper tiny", "whisper-tiny", "float16", 290, 29, "0.6 GB"),
    ("Whisper tiny.en", "whisper-tiny.en", "float32", 280, 30, "0.7 GB"),
    ("Whisper tiny.en", "whisper-tiny.en", "bfloat16", 290, 31, "0.6 GB"),
    ("Whisper tiny.en", "whisper-tiny.en", "float16", 295, 31, "0.55 GB"),
]


def _precisions_by_model(specs):
    """Group precision names by model display name.

    Args:
        specs: Iterable of rows shaped like the entries of ``_MODEL_SPECS``
            (display name first, precision third).

    Returns:
        A dict mapping each display name to the list of its precisions, with
        both keys and list items in the order the rows were encountered.
    """
    grouped = {}
    for display_name, _slug, precision, *_rest in specs:
        grouped.setdefault(display_name, []).append(precision)
    return grouped


# Full catalog keyed by "<display name> - <precision>". Each value carries the
# row's metadata plus the repository id derived from the slug and precision.
WHISPER_MODELS = {
    f"{name} - {prec}": {
        'name': name,
        'precision': prec,
        'repo_id': f'ctranslate2-4you/{slug}-ct2-{prec}',
        'tokens_per_second': tps,
        'optimal_batch_size': batch,
        'avg_vram_usage': vram,
    }
    for name, slug, prec, tps, batch, vram in _MODEL_SPECS
}

# Unique display names in table order (dict.fromkeys de-duplicates while
# keeping first-seen order).
MODEL_NAMES = list(dict.fromkeys(name for name, *_ in _MODEL_SPECS))

# Display name -> list of precisions available for that model, in table order.
# Built through a helper so no loop variables leak into the module namespace.
MODEL_PRECISIONS = _precisions_by_model(_MODEL_SPECS)

# Display names that start with "Distil".
DISTIL_MODELS = frozenset(
    name for name, *_ in _MODEL_SPECS if name.startswith("Distil")
)

# Language code -> English language name, listed alphabetically by code.
# Kept as an OrderedDict so the declared order is the iteration order.
WHISPER_LANGUAGES = OrderedDict([
    ("af", "Afrikaans"), ("am", "Amharic"), ("ar", "Arabic"), ("as", "Assamese"),
    ("az", "Azerbaijani"), ("ba", "Bashkir"), ("be", "Belarusian"), ("bg", "Bulgarian"),
    ("bn", "Bengali"), ("bo", "Tibetan"), ("br", "Breton"), ("bs", "Bosnian"),
    ("ca", "Catalan"), ("cs", "Czech"), ("cy", "Welsh"), ("da", "Danish"),
    ("de", "German"), ("el", "Greek"), ("en", "English"), ("es", "Spanish"),
    ("et", "Estonian"), ("eu", "Basque"), ("fa", "Persian"), ("fi", "Finnish"),
    ("fo", "Faroese"), ("fr", "French"), ("gl", "Galician"), ("gu", "Gujarati"),
    ("ha", "Hausa"), ("haw", "Hawaiian"), ("he", "Hebrew"), ("hi", "Hindi"),
    ("hr", "Croatian"), ("ht", "Haitian Creole"), ("hu", "Hungarian"), ("hy", "Armenian"),
    ("id", "Indonesian"), ("is", "Icelandic"), ("it", "Italian"), ("ja", "Japanese"),
    ("jw", "Javanese"), ("ka", "Georgian"), ("kk", "Kazakh"), ("km", "Khmer"),
    ("kn", "Kannada"), ("ko", "Korean"), ("la", "Latin"), ("lb", "Luxembourgish"),
    ("ln", "Lingala"), ("lo", "Lao"), ("lt", "Lithuanian"), ("lv", "Latvian"),
    ("mg", "Malagasy"), ("mi", "Maori"), ("mk", "Macedonian"), ("ml", "Malayalam"),
    ("mn", "Mongolian"), ("mr", "Marathi"), ("ms", "Malay"), ("mt", "Maltese"),
    ("my", "Myanmar"), ("ne", "Nepali"), ("nl", "Dutch"), ("nn", "Nynorsk"),
    ("no", "Norwegian"), ("oc", "Occitan"), ("pa", "Punjabi"), ("pl", "Polish"),
    ("ps", "Pashto"), ("pt", "Portuguese"), ("ro", "Romanian"), ("ru", "Russian"),
    ("sa", "Sanskrit"), ("sd", "Sindhi"), ("si", "Sinhala"), ("sk", "Slovak"),
    ("sl", "Slovenian"), ("sn", "Shona"), ("so", "Somali"), ("sq", "Albanian"),
    ("sr", "Serbian"), ("su", "Sundanese"), ("sv", "Swedish"), ("sw", "Swahili"),
    ("ta", "Tamil"), ("te", "Telugu"), ("tg", "Tajik"), ("th", "Thai"),
    ("tk", "Turkmen"), ("tl", "Tagalog"), ("tr", "Turkish"), ("tt", "Tatar"),
    ("uk", "Ukrainian"), ("ur", "Urdu"), ("uz", "Uzbek"), ("vi", "Vietnamese"),
    ("yi", "Yiddish"), ("yo", "Yoruba"), ("zh", "Chinese"),
])

# File extensions (lower-case, with the leading dot) listed as supported input.
SUPPORTED_AUDIO_EXTENSIONS = [
    ".aac", ".amr", ".asf", ".avi", ".flac", ".m4a",
    ".mkv", ".mp3", ".mp4", ".wav", ".webm", ".wma",
]

# Output formats
OUTPUT_FORMATS = ["txt", "vtt", "srt", "tsv", "json"]

# Task modes
TASK_MODES = ["transcribe", "translate"]

# Default settings
DEFAULT_BEAM_SIZE = 1
DEFAULT_BATCH_SIZE = 8
DEFAULT_OUTPUT_FORMAT = "txt"  # one of OUTPUT_FORMATS
DEFAULT_TASK_MODE = "transcribe"  # one of TASK_MODES
DEFAULT_LANGUAGE = "en"