This repository was archived by the owner on Sep 4, 2025. It is now read-only.

Commit 1ca0d4f

[Model] Add UltravoxModel and UltravoxConfig (vllm-project#7615)
1 parent dd53c4b commit 1ca0d4f

33 files changed: +1090 −264 lines

docs/source/models/supported_models.rst

Lines changed: 6 additions & 1 deletion
@@ -186,7 +186,7 @@ Multimodal Language Models

   * - Architecture
     - Models
-    - Supported Modality(ies)
+    - Supported Modalities
     - Example HuggingFace Models
     - :ref:`LoRA <lora>`
   * - :code:`Blip2ForConditionalGeneration`
@@ -234,6 +234,11 @@ Multimodal Language Models
     - Image
     - :code:`openbmb/MiniCPM-V-2` (see note), :code:`openbmb/MiniCPM-Llama3-V-2_5`, :code:`openbmb/MiniCPM-V-2_6`, etc.
     -
+  * - :code:`UltravoxModel`
+    - Ultravox
+    - Audio
+    - :code:`fixie-ai/ultravox-v0_3`
+    -

 .. note::
    For :code:`openbmb/MiniCPM-V-2`, the official repo doesn't work yet, so we need to use a fork (:code:`HwwwH/MiniCPM-V-2`) for now.
New example file: offline inference with an audio language model

Lines changed: 97 additions & 0 deletions
@@ -0,0 +1,97 @@
"""
This example shows how to use vLLM for running offline inference
with the correct prompt format on audio language models.

For most models, the prompt format should follow corresponding examples
on HuggingFace model repository.
"""
from transformers import AutoTokenizer

from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.utils import FlexibleArgumentParser

# Input audio and question
audio_and_sample_rate = AudioAsset("mary_had_lamb").audio_and_sample_rate
question = "What is recited in the audio?"


# Ultravox 0.3
def run_ultravox(question):
    model_name = "fixie-ai/ultravox-v0_3"

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    messages = [{
        'role': 'user',
        'content': f"<|reserved_special_token_0|>\n{question}"
    }]
    prompt = tokenizer.apply_chat_template(messages,
                                           tokenize=False,
                                           add_generation_prompt=True)

    llm = LLM(model=model_name)
    stop_token_ids = None
    return llm, prompt, stop_token_ids


model_example_map = {
    "ultravox": run_ultravox,
}


def main(args):
    model = args.model_type
    if model not in model_example_map:
        raise ValueError(f"Model type {model} is not supported.")

    llm, prompt, stop_token_ids = model_example_map[model](question)

    # We set temperature to 0.2 so that outputs can be different
    # even when all prompts are identical when running batch inference.
    sampling_params = SamplingParams(temperature=0.2,
                                     max_tokens=64,
                                     stop_token_ids=stop_token_ids)

    assert args.num_prompts > 0
    if args.num_prompts == 1:
        # Single inference
        inputs = {
            "prompt": prompt,
            "multi_modal_data": {
                "audio": audio_and_sample_rate
            },
        }
    else:
        # Batch inference
        inputs = [{
            "prompt": prompt,
            "multi_modal_data": {
                "audio": audio_and_sample_rate
            },
        } for _ in range(args.num_prompts)]

    outputs = llm.generate(inputs, sampling_params=sampling_params)

    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)


if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description='Demo on using vLLM for offline inference with '
        'audio language models')
    parser.add_argument('--model-type',
                        '-m',
                        type=str,
                        default="ultravox",
                        choices=model_example_map.keys(),
                        help='Huggingface "model_type".')
    parser.add_argument('--num-prompts',
                        type=int,
                        default=1,
                        help='Number of prompts to run.')

    args = parser.parse_args()
    main(args)
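
The script feeds the bundled AudioAsset tuple straight into multi_modal_data. Below is a rough sketch of the same call with a local recording instead (assumptions: librosa is installed, "speech.wav" is a placeholder path, and prompt, llm, and sampling_params are reused from the script above):

    # Sketch: swap the bundled asset for a local file. librosa.load returns
    # (waveform ndarray, sampling rate), matching the expected audio tuple.
    import librosa

    audio, sample_rate = librosa.load("speech.wav", sr=None)
    inputs = {
        "prompt": prompt,
        "multi_modal_data": {
            "audio": (audio, sample_rate),
        },
    }
    outputs = llm.generate(inputs, sampling_params=sampling_params)
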
New example file: OpenAI-compatible API client sending audio inputs

Lines changed: 90 additions & 0 deletions
@@ -0,0 +1,90 @@
"""An example showing how to use vLLM to serve an audio language model.

Launch the vLLM server with the following command:
vllm serve fixie-ai/ultravox-v0_3
"""
import base64

import requests
from openai import OpenAI

from vllm.assets.audio import AudioAsset

# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
    # defaults to os.environ.get("OPENAI_API_KEY")
    api_key=openai_api_key,
    base_url=openai_api_base,
)

models = client.models.list()
model = models.data[0].id

# Any format supported by librosa is supported
audio_url = AudioAsset("winning_call").url

# Use audio url in the payload
chat_completion_from_url = client.chat.completions.create(
    messages=[{
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "What's in this audio?"
            },
            {
                "type": "audio_url",
                "audio_url": {
                    "url": audio_url
                },
            },
        ],
    }],
    model=model,
    max_tokens=64,
)

result = chat_completion_from_url.choices[0].message.content
print(f"Chat completion output: {result}")


# Use base64 encoded audio in the payload
def encode_audio_base64_from_url(audio_url: str) -> str:
    """Encode an audio retrieved from a remote url to base64 format."""

    with requests.get(audio_url) as response:
        response.raise_for_status()
        result = base64.b64encode(response.content).decode('utf-8')

    return result


audio_base64 = encode_audio_base64_from_url(audio_url=audio_url)
chat_completion_from_base64 = client.chat.completions.create(
    messages=[{
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "What's in this audio?"
            },
            {
                "type": "audio_url",
                "audio_url": {
                    # Any format supported by librosa is supported
                    "url": f"data:audio/ogg;base64,{audio_base64}"
                },
            },
        ],
    }],
    model=model,
    max_tokens=64,
)

result = chat_completion_from_base64.choices[0].message.content
print(f"Chat completion output: {result}")

tests/conftest.py

Lines changed: 19 additions & 12 deletions
@@ -9,14 +9,14 @@
 from typing import (Any, Callable, Dict, List, Optional, Tuple, TypedDict,
                     TypeVar, Union)

+import numpy as np
 import pytest
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from huggingface_hub import snapshot_download
 from PIL import Image
-from transformers import (AutoModelForCausalLM, AutoModelForSeq2SeqLM,
-                          AutoModelForVision2Seq, AutoTokenizer, BatchEncoding,
+from transformers import (AutoModelForCausalLM, AutoTokenizer, BatchEncoding,
                           BatchFeature)

 from vllm import LLM, SamplingParams
@@ -216,8 +216,7 @@ def __init__(
         *,
         model_kwargs: Optional[Dict[str, Any]] = None,
         is_embedding_model: bool = False,
-        is_vision_model: bool = False,
-        is_encoder_decoder_model: bool = False,
+        auto_cls=AutoModelForCausalLM,
         postprocess_inputs: Callable[[BatchEncoding],
                                      BatchEncoding] = identity,
     ) -> None:
@@ -234,13 +233,6 @@ def __init__(
                     device="cpu",
                 ).to(dtype=torch_dtype))
         else:
-            if is_vision_model:
-                auto_cls = AutoModelForVision2Seq
-            elif is_encoder_decoder_model:
-                auto_cls = AutoModelForSeq2SeqLM
-            else:
-                auto_cls = AutoModelForCausalLM
-
             model_kwargs = model_kwargs if model_kwargs is not None else {}
             self.model = self.wrap_device(
                 auto_cls.from_pretrained(
@@ -432,6 +424,7 @@ def generate_greedy_logprobs_limit(
         max_tokens: int,
         num_logprobs: int,
         images: Optional[List[Image.Image]] = None,
+        audios: Optional[List[Tuple[np.ndarray, int]]] = None,
         **kwargs: Any,
     ) -> List[Tuple[List[int], str, List[Dict[int, float]]]]:
         all_logprobs: List[List[Dict[int, float]]] = []
@@ -446,6 +439,11 @@ def generate_greedy_logprobs_limit(
             if images is not None and images[i] is not None:
                 processor_kwargs["images"] = images[i]

+            if audios is not None:
+                audio, sr = audios[i]
+                processor_kwargs["audio"] = audio
+                processor_kwargs["sampling_rate"] = sr
+
             inputs = self.processor(**processor_kwargs)
             inputs = self.postprocess_inputs(inputs)

@@ -627,6 +625,8 @@ def generate_w_logprobs(
         sampling_params: SamplingParams,
         images: Optional[Union[List[Image.Image],
                                List[List[Image.Image]]]] = None,
+        audios: Optional[Union[List[Tuple[np.ndarray, int]],
+                               List[List[Tuple[np.ndarray, int]]]]] = None
     ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
         assert sampling_params.logprobs is not None

@@ -638,6 +638,10 @@ def generate_w_logprobs(
             for i, image in enumerate(images):
                 inputs[i]["multi_modal_data"] = {"image": image}

+        if audios is not None:
+            for i, audio in enumerate(audios):
+                inputs[i]["multi_modal_data"] = {"audio": audio}
+
         req_outputs = self.model.generate(inputs,
                                           sampling_params=sampling_params)
         return self._final_steps_generate_w_logprobs(req_outputs)
@@ -674,6 +678,8 @@ def generate_greedy_logprobs(
         num_logprobs: int,
         images: Optional[Union[List[Image.Image],
                                List[List[Image.Image]]]] = None,
+        audios: Optional[Union[List[Tuple[np.ndarray, int]],
+                               List[List[Tuple[np.ndarray, int]]]]] = None,
         stop_token_ids: Optional[List[int]] = None,
     ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
         greedy_logprobs_params = SamplingParams(temperature=0.0,
@@ -682,7 +688,8 @@ def generate_greedy_logprobs(
                                                 stop_token_ids=stop_token_ids)
         outputs = self.generate_w_logprobs(prompts,
                                            greedy_logprobs_params,
-                                           images=images)
+                                           images=images,
+                                           audios=audios)

         return [(output_ids, output_str, output_logprobs)
                 for output_ids, output_str, output_logprobs in outputs]
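
These additions thread an audios argument through both runners so multimodal tests can pass (waveform, sampling_rate) tuples the same way they already pass images. A rough sketch of a test using the new parameter (assumptions: a vllm_runner-style fixture yielding the runner modified above, the prompt format from the Ultravox example earlier in this commit, and illustrative max_tokens/num_logprobs values):

    from vllm.assets.audio import AudioAsset


    def test_ultravox_audio_sketch(vllm_runner):
        # The bundled asset yields an (np.ndarray, sampling_rate) tuple,
        # matching the audios type introduced in this diff.
        audio, sr = AudioAsset("mary_had_lamb").audio_and_sample_rate
        prompts = ["<|reserved_special_token_0|>\nWhat is recited in the audio?"]

        with vllm_runner("fixie-ai/ultravox-v0_3", dtype="half") as vllm_model:
            outputs = vllm_model.generate_greedy_logprobs(prompts,
                                                          max_tokens=64,
                                                          num_logprobs=5,
                                                          audios=[(audio, sr)])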

tests/distributed/test_basic_distributed_correctness_enc_dec.py

Lines changed: 2 additions & 1 deletion
@@ -10,6 +10,7 @@
 """

 import pytest
+from transformers import AutoModelForSeq2SeqLM

 from vllm.utils import cuda_device_count_stateless

@@ -85,7 +86,7 @@ def test_models(
     }

     with hf_runner(model, dtype=dtype,
-                   is_encoder_decoder_model=True) as hf_model:
+                   auto_cls=AutoModelForSeq2SeqLM) as hf_model:
         hf_outputs = (hf_model.generate_encoder_decoder_greedy_logprobs_limit(
             test_prompts,
             max_tokens,
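
The same flag-to-auto_cls migration applies at any call site that used the removed booleans. As a sketch, the vision-model counterpart would look like this (assumption: a hypothetical call site; model, dtype, example_prompts, max_tokens, num_logprobs, and images are stand-ins, since no vision test appears in this excerpt):

    from transformers import AutoModelForVision2Seq

    # Previously: hf_runner(model, dtype=dtype, is_vision_model=True)
    with hf_runner(model, dtype=dtype,
                   auto_cls=AutoModelForVision2Seq) as hf_model:
        hf_outputs = hf_model.generate_greedy_logprobs_limit(
            example_prompts, max_tokens, num_logprobs, images=images)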
