Commit 5d98d56

Support Pixtral-Large HF by using llava multimodal_projector_bias config (vllm-project#12710)

Signed-off-by: mgoin <michael@neuralmagic.com>

1 parent 73b35cc

File tree: 4 files changed, +13 −9 lines

    vllm/model_executor/models/llava.py
    vllm/model_executor/models/llava_next.py
    vllm/model_executor/models/llava_next_video.py
    vllm/model_executor/models/llava_onevision.py
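The new constructor argument mirrors the multimodal_projector_bias field on Hugging Face's LlavaConfig: classic LLaVA checkpoints keep its default of True, while Pixtral-Large HF checkpoints presumably set it to False, which is why the hard-coded bias=True replaced in the diffs below broke them. A minimal sketch, assuming a transformers release that defines this field:

    from transformers import LlavaConfig

    # Assumption: this transformers version exposes multimodal_projector_bias
    # on LlavaConfig (default True). A Pixtral-Large-style config disables it.
    cfg = LlavaConfig(multimodal_projector_bias=False)
    assert cfg.multimodal_projector_bias is False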

vllm/model_executor/models/llava.py (4 additions, 2 deletions)

@@ -75,19 +75,20 @@ def __init__(self,
                  vision_hidden_size: int,
                  text_hidden_size: int,
                  projector_hidden_act: str,
+                 multimodal_projector_bias: bool,
                  quant_config: Optional[QuantizationConfig] = None,
                  prefix: str = ""):
         super().__init__()
 
         self.linear_1 = ColumnParallelLinear(vision_hidden_size,
                                              text_hidden_size,
-                                             bias=True,
+                                             bias=multimodal_projector_bias,
                                              quant_config=quant_config,
                                              prefix=f"{prefix}.linear_1")
         self.act = get_act_fn(projector_hidden_act)
         self.linear_2 = RowParallelLinear(text_hidden_size,
                                           text_hidden_size,
-                                          bias=True,
+                                          bias=multimodal_projector_bias,
                                           quant_config=quant_config,
                                           prefix=f"{prefix}.linear_2")
 
@@ -503,6 +504,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
             vision_hidden_size=config.vision_config.hidden_size,
             text_hidden_size=config.text_config.hidden_size,
             projector_hidden_act=config.projector_hidden_act,
+            multimodal_projector_bias=config.multimodal_projector_bias,
             quant_config=quant_config,
             prefix=maybe_prefix(prefix, "multi_modal_projector"))
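To make the llava.py change concrete, here is a hedged single-GPU sketch of the patched projector, with plain torch.nn.Linear standing in for vLLM's tensor-parallel ColumnParallelLinear/RowParallelLinear and nn.GELU standing in for get_act_fn(projector_hidden_act); the hidden sizes in the usage lines are illustrative:

    import torch
    import torch.nn as nn

    class LlavaMultiModalProjectorSketch(nn.Module):
        """Single-GPU sketch; the real vLLM class uses tensor-parallel
        Column/RowParallelLinear layers and get_act_fn instead."""

        def __init__(self, vision_hidden_size: int, text_hidden_size: int,
                     multimodal_projector_bias: bool):
            super().__init__()
            # The bias now follows the HF config instead of a hard-coded True.
            self.linear_1 = nn.Linear(vision_hidden_size, text_hidden_size,
                                      bias=multimodal_projector_bias)
            self.act = nn.GELU()  # stand-in for get_act_fn(projector_hidden_act)
            self.linear_2 = nn.Linear(text_hidden_size, text_hidden_size,
                                      bias=multimodal_projector_bias)

        def forward(self, image_features: torch.Tensor) -> torch.Tensor:
            return self.linear_2(self.act(self.linear_1(image_features)))

    # Bias-free, as a Pixtral-Large-style config would request:
    proj = LlavaMultiModalProjectorSketch(1024, 4096,
                                          multimodal_projector_bias=False)
    out = proj(torch.randn(2, 16, 1024))  # shape: (2, 16, 4096)
    assert proj.linear_1.bias is None and proj.linear_2.bias is None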

vllm/model_executor/models/llava_next.py (2 additions, 1 deletion)

@@ -231,7 +231,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
         self.multi_modal_projector = LlavaMultiModalProjector(
             vision_hidden_size=vision_hidden_size,
             text_hidden_size=config.text_config.hidden_size,
-            projector_hidden_act=config.projector_hidden_act)
+            projector_hidden_act=config.projector_hidden_act,
+            multimodal_projector_bias=config.multimodal_projector_bias)
 
         self.language_model = init_vllm_registered_model(
             vllm_config=vllm_config,

vllm/model_executor/models/llava_next_video.py (5 additions, 4 deletions)

@@ -253,16 +253,16 @@ def forward(self, image_features: torch.Tensor):
 class LlavaNextMultiModalProjector(nn.Module):
 
     def __init__(self, vision_hidden_size: int, text_hidden_size: int,
-                 projector_hidden_act: str):
+                 projector_hidden_act: str, multimodal_projector_bias: bool):
         super().__init__()
 
         self.linear_1 = nn.Linear(vision_hidden_size,
                                   text_hidden_size,
-                                  bias=True)
+                                  bias=multimodal_projector_bias)
         self.act = get_act_fn(projector_hidden_act)
         self.linear_2 = nn.Linear(text_hidden_size,
                                   text_hidden_size,
-                                  bias=True)
+                                  bias=multimodal_projector_bias)
 
     def forward(self, image_features: torch.Tensor) -> torch.Tensor:
         hidden_states = self.linear_1(image_features)
 
@@ -298,7 +298,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
         self.multi_modal_projector = LlavaNextMultiModalProjector(
             vision_hidden_size=config.vision_config.hidden_size,
             text_hidden_size=config.text_config.hidden_size,
-            projector_hidden_act=config.projector_hidden_act)
+            projector_hidden_act=config.projector_hidden_act,
+            multimodal_projector_bias=config.multimodal_projector_bias)
         self.language_model = init_vllm_registered_model(
             vllm_config=vllm_config,
             hf_config=config.text_config,

vllm/model_executor/models/llava_onevision.py (2 additions, 2 deletions)

@@ -372,11 +372,11 @@ def __init__(self, config: LlavaOnevisionConfig):
 
         self.linear_1 = nn.Linear(config.vision_config.hidden_size,
                                   config.text_config.hidden_size,
-                                  bias=True)
+                                  bias=config.multimodal_projector_bias)
         self.act = get_act_fn(config.projector_hidden_act)
         self.linear_2 = nn.Linear(config.text_config.hidden_size,
                                   config.text_config.hidden_size,
-                                  bias=True)
+                                  bias=config.multimodal_projector_bias)
 
     def forward(self, image_features: torch.Tensor) -> torch.Tensor:
         hidden_states = self.linear_1(image_features)
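With all four model files reading the flag from the HF config, a bias-free checkpoint should load through vLLM's normal entry point. A hedged sketch; the repo id below is a placeholder, not a real model name:

    from vllm import LLM

    # Placeholder id: substitute an HF-format Pixtral-Large (or other
    # LLaVA-family) checkpoint whose config sets
    # "multimodal_projector_bias": false.
    llm = LLM(model="your-org/pixtral-large-hf")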
