1
1
# SPDX-License-Identifier: Apache-2.0
2
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
3
from collections .abc import Iterable , Mapping , Sequence
4
- from typing import Annotated , Optional , Union
4
+ from typing import Annotated , Literal , Optional , Union
5
5
6
6
import torch
7
7
import torch .nn as nn
38
38
# yapf: enable
39
39
from .interfaces import MultiModalEmbeddings , SupportsMultiModal , SupportsQuant
40
40
from .llama import LlamaDecoderLayer , LlamaMLP , LlamaModel
41
- from .utils import (AutoWeightsLoader , WeightsMapper , flatten_bn ,
42
- is_pp_missing_parameter , maybe_prefix )
41
+ from .utils import (AutoWeightsLoader , WeightsMapper , is_pp_missing_parameter ,
42
+ maybe_prefix )
43
43
44
44
45
45
class AriaImagePixelInputs (TensorSchema ):
@@ -52,6 +52,8 @@ class AriaImagePixelInputs(TensorSchema):
52
52
- w: Width of each image
53
53
"""
54
54
55
+ type : Literal ["pixel_values" ]
56
+
55
57
pixel_values : Annotated [
56
58
torch .Tensor ,
57
59
TensorShape ("bn" , 3 , "h" , "w" ),
@@ -485,6 +487,8 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
485
487
This model combines a vision tower, a multi-modal projector, and a language
486
488
model to perform tasks that involve both image and text inputs.
487
489
"""
490
+ merge_by_field_config = True
491
+
488
492
hf_to_vllm_mapper = WeightsMapper (
489
493
orig_to_new_prefix = {
490
494
# mapping for new names in checkpoints saved after transformers v4.52
@@ -551,12 +555,15 @@ def _parse_and_validate_image_input(
551
555
return None
552
556
553
557
return AriaImagePixelInputs (
554
- pixel_values = flatten_bn (pixel_values , concat = True ),
555
- pixel_mask = flatten_bn (pixel_mask , concat = True ),
558
+ type = "pixel_values" ,
559
+ pixel_values = pixel_values ,
560
+ pixel_mask = pixel_mask ,
556
561
)
557
562
558
563
def _create_patch_attention_mask (
559
- self , pixel_mask : Optional [torch .Tensor ]) -> torch .Tensor :
564
+ self ,
565
+ pixel_mask : Optional [torch .Tensor ],
566
+ ) -> Optional [torch .Tensor ]:
560
567
if pixel_mask is None :
561
568
return None
562
569
0 commit comments