@@ -1,11 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
 """
-This example shows how to use vLLM for running offline inference 
+This example shows how to use vLLM for running offline inference
 with the correct prompt format on audio language models.
 
 For most models, the prompt format should follow corresponding examples
 on HuggingFace model repository.
 """
+
 import os
 from dataclasses import asdict
 from typing import NamedTuple, Optional
@@ -22,7 +23,7 @@
 question_per_audio_count = {
     0: "What is 1+1?",
     1: "What is recited in the audio?",
-    2: "What sport and what nursery rhyme are referenced?"
+    2: "What sport and what nursery rhyme are referenced?",
 }
 
 
@@ -72,8 +73,7 @@ def run_granite_speech(question: str, audio_count: int) -> ModelRequestData:
 # MiniCPM-O
 def run_minicpmo(question: str, audio_count: int) -> ModelRequestData:
     model_name = "openbmb/MiniCPM-o-2_6"
-    tokenizer = AutoTokenizer.from_pretrained(model_name,
-                                              trust_remote_code=True)
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
     engine_args = EngineArgs(
         model=model_name,
         trust_remote_code=True,
@@ -82,19 +82,18 @@ def run_minicpmo(question: str, audio_count: int) -> ModelRequestData:
         limit_mm_per_prompt={"audio": audio_count},
     )
 
-    stop_tokens = ['<|im_end|>', '<|endoftext|>']
+    stop_tokens = ["<|im_end|>", "<|endoftext|>"]
     stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
 
     audio_placeholder = "(<audio>./</audio>)" * audio_count
     audio_chat_template = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n<|spk_bos|><|spk|><|spk_eos|><|tts_bos|>' }}{% endif %}"  # noqa: E501
-    messages = [{
-        'role': 'user',
-        'content': f'{audio_placeholder}\n{question}'
-    }]
-    prompt = tokenizer.apply_chat_template(messages,
-                                           tokenize=False,
-                                           add_generation_prompt=True,
-                                           chat_template=audio_chat_template)
+    messages = [{"role": "user", "content": f"{audio_placeholder}\n{question}"}]
+    prompt = tokenizer.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True,
+        chat_template=audio_chat_template,
+    )
 
     return ModelRequestData(
         engine_args=engine_args,
@@ -113,7 +112,7 @@ def run_phi4mm(question: str, audio_count: int) -> ModelRequestData:
     # Since the vision-lora and speech-lora co-exist with the base model,
     # we have to manually specify the path of the lora weights.
     speech_lora_path = os.path.join(model_path, "speech-lora")
-    placeholders = "".join([f"<|audio_{i+1}|>" for i in range(audio_count)])
+    placeholders = "".join([f"<|audio_{i + 1}|>" for i in range(audio_count)])
 
     prompts = f"<|user|>{placeholders}{question}<|end|><|assistant|>"
 
@@ -145,15 +144,19 @@ def run_qwen2_audio(question: str, audio_count: int) -> ModelRequestData:
         limit_mm_per_prompt={"audio": audio_count},
     )
 
-    audio_in_prompt = "".join([
-        f"Audio {idx+1}: "
-        f"<|audio_bos|><|AUDIO|><|audio_eos|>\n" for idx in range(audio_count)
-    ])
+    audio_in_prompt = "".join(
+        [
+            f"Audio {idx + 1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n"
+            for idx in range(audio_count)
+        ]
+    )
 
-    prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
-              "<|im_start|>user\n"
-              f"{audio_in_prompt}{question}<|im_end|>\n"
-              "<|im_start|>assistant\n")
+    prompt = (
+        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+        "<|im_start|>user\n"
+        f"{audio_in_prompt}{question}<|im_end|>\n"
+        "<|im_start|>assistant\n"
+    )
 
     return ModelRequestData(
         engine_args=engine_args,
@@ -172,19 +175,22 @@ def run_qwen2_5_omni(question: str, audio_count: int):
         limit_mm_per_prompt={"audio": audio_count},
     )
 
-    audio_in_prompt = "".join([
-        "<|audio_bos|><|AUDIO|><|audio_eos|>\n" for idx in range(audio_count)
-    ])
+    audio_in_prompt = "".join(
+        ["<|audio_bos|><|AUDIO|><|audio_eos|>\n" for idx in range(audio_count)]
+    )
 
     default_system = (
         "You are Qwen, a virtual human developed by the Qwen Team, Alibaba "
         "Group, capable of perceiving auditory and visual inputs, as well as "
-        "generating text and speech.")
+        "generating text and speech."
+    )
 
-    prompt = (f"<|im_start|>system\n{default_system}<|im_end|>\n"
-              "<|im_start|>user\n"
-              f"{audio_in_prompt}{question}<|im_end|>\n"
-              "<|im_start|>assistant\n")
+    prompt = (
+        f"<|im_start|>system\n{default_system}<|im_end|>\n"
+        "<|im_start|>user\n"
+        f"{audio_in_prompt}{question}<|im_end|>\n"
+        "<|im_start|>assistant\n"
+    )
     return ModelRequestData(
         engine_args=engine_args,
         prompt=prompt,
@@ -196,13 +202,10 @@ def run_ultravox(question: str, audio_count: int) -> ModelRequestData:
     model_name = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
 
     tokenizer = AutoTokenizer.from_pretrained(model_name)
-    messages = [{
-        'role': 'user',
-        'content': "<|audio|>\n" * audio_count + question
-    }]
-    prompt = tokenizer.apply_chat_template(messages,
-                                           tokenize=False,
-                                           add_generation_prompt=True)
+    messages = [{"role": "user", "content": "<|audio|>\n" * audio_count + question}]
+    prompt = tokenizer.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
 
     engine_args = EngineArgs(
         model=model_name,
@@ -220,8 +223,7 @@ def run_ultravox(question: str, audio_count: int) -> ModelRequestData:
 
 # Whisper
 def run_whisper(question: str, audio_count: int) -> ModelRequestData:
-    assert audio_count == 1, (
-        "Whisper only support single audio input per prompt")
+    assert audio_count == 1, "Whisper only support single audio input per prompt"
     model_name = "openai/whisper-large-v3-turbo"
 
     prompt = "<|startoftranscript|>"
@@ -252,27 +254,33 @@ def run_whisper(question: str, audio_count: int) -> ModelRequestData:
 
 def parse_args():
     parser = FlexibleArgumentParser(
-        description='Demo on using vLLM for offline inference with '
-        'audio language models')
-    parser.add_argument('--model-type',
-                        '-m',
-                        type=str,
-                        default="ultravox",
-                        choices=model_example_map.keys(),
-                        help='Huggingface "model_type".')
-    parser.add_argument('--num-prompts',
-                        type=int,
-                        default=1,
-                        help='Number of prompts to run.')
-    parser.add_argument("--num-audios",
-                        type=int,
-                        default=1,
-                        choices=[0, 1, 2],
-                        help="Number of audio items per prompt.")
-    parser.add_argument("--seed",
-                        type=int,
-                        default=None,
-                        help="Set the seed when initializing `vllm.LLM`.")
+        description="Demo on using vLLM for offline inference with "
+        "audio language models"
+    )
+    parser.add_argument(
+        "--model-type",
+        "-m",
+        type=str,
+        default="ultravox",
+        choices=model_example_map.keys(),
+        help='Huggingface "model_type".',
+    )
+    parser.add_argument(
+        "--num-prompts", type=int, default=1, help="Number of prompts to run."
+    )
+    parser.add_argument(
+        "--num-audios",
+        type=int,
+        default=1,
+        choices=[0, 1, 2],
+        help="Number of audio items per prompt.",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=None,
+        help="Set the seed when initializing `vllm.LLM`.",
+    )
 
     return parser.parse_args()
 
@@ -283,29 +291,30 @@ def main(args):
         raise ValueError(f"Model type {model} is not supported.")
 
     audio_count = args.num_audios
-    req_data = model_example_map[model](question_per_audio_count[audio_count],
-                                        audio_count)
+    req_data = model_example_map[model](
+        question_per_audio_count[audio_count], audio_count
+    )
 
     # Disable other modalities to save memory
     default_limits = {"image": 0, "video": 0, "audio": 0}
     req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
-        req_data.engine_args.limit_mm_per_prompt or {})
+        req_data.engine_args.limit_mm_per_prompt or {}
+    )
 
     engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
     llm = LLM(**engine_args)
 
     # We set temperature to 0.2 so that outputs can be different
     # even when all prompts are identical when running batch inference.
-    sampling_params = SamplingParams(temperature=0.2,
-                                     max_tokens=64,
-                                     stop_token_ids=req_data.stop_token_ids)
+    sampling_params = SamplingParams(
+        temperature=0.2, max_tokens=64, stop_token_ids=req_data.stop_token_ids
+    )
 
     mm_data = {}
     if audio_count > 0:
         mm_data = {
             "audio": [
-                asset.audio_and_sample_rate
-                for asset in audio_assets[:audio_count]
+                asset.audio_and_sample_rate for asset in audio_assets[:audio_count]
             ]
         }
 
@@ -315,8 +324,9 @@ def main(args):
         # Batch inference
         inputs = [inputs] * args.num_prompts
     # Add LoRA request if applicable
-    lora_request = (req_data.lora_requests *
-                    args.num_prompts if req_data.lora_requests else None)
+    lora_request = (
+        req_data.lora_requests * args.num_prompts if req_data.lora_requests else None
+    )
 
     outputs = llm.generate(
         inputs,
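A quick standalone illustration of the `default_limits | dict(...)` merge in `main()` above (not part of the diff; the example values are assumptions): Python's dict union keeps the right-hand values on key collisions, so a model's own `limit_mm_per_prompt` such as `{"audio": 2}` overrides the zeroed default while image and video stay disabled.

# Sketch only: mirrors the merge in main(), with assumed example values.
default_limits = {"image": 0, "video": 0, "audio": 0}
model_limits = {"audio": 2}  # e.g. what a run_*() helper sets via limit_mm_per_prompt
merged = default_limits | model_limits  # right-hand side wins on shared keys
print(merged)  # {'image': 0, 'video': 0, 'audio': 2}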