
Commit 66cf208

princepride authored and fhl2000 committed

[New model support] Support Tarsier2 (vllm-project#19887)
Signed-off-by: 汪志鹏 <[email protected]> Signed-off-by: fhl <[email protected]>
1 parent c0079d4 commit 66cf208

7 files changed: +152 -1 lines changed

docs/models/supported_models.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -562,6 +562,7 @@ Specified using `--task generate`.
 | `SkyworkR1VChatModel` | Skywork-R1V-38B | T + I | `Skywork/Skywork-R1V-38B` | | ✅︎ | ✅︎ |
 | `SmolVLMForConditionalGeneration` | SmolVLM2 | T + I | `SmolVLM2-2.2B-Instruct` | ✅︎ | | ✅︎ |
 | `TarsierForConditionalGeneration` | Tarsier | T + I<sup>E+</sup> | `omni-research/Tarsier-7b`,`omni-research/Tarsier-34b` | | ✅︎ | ✅︎ |
+| `Tarsier2ForConditionalGeneration`<sup>^</sup> | Tarsier2 | T + I<sup>E+</sup> + V<sup>E+</sup> | `omni-research/Tarsier2-Recap-7b`,`omni-research/Tarsier2-7b-0115` | | ✅︎ | ✅︎ |
 
 <sup>^</sup> You need to set the architecture name via `--hf-overrides` to match the one in vLLM.
 &nbsp;&nbsp;&nbsp;&nbsp;• For example, to use DeepSeek-VL2 series models:
```
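The override in the note above can also be passed from the offline Python API. A minimal sketch, assuming the `omni-research/Tarsier2-Recap-7b` checkpoint added in this PR and the `hf_overrides` engine argument used by the example scripts below; it is not part of the commit itself:

```python
# Sketch only: select the architecture name registered in vLLM for a
# Tarsier2 checkpoint, mirroring the --hf-overrides note above.
from vllm import LLM

llm = LLM(
    model="omni-research/Tarsier2-Recap-7b",
    max_model_len=4096,
    hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]},
    limit_mm_per_prompt={"image": 1},
)
```

The server-side equivalent is the `--hf-overrides` flag carrying the same override, as the note describes.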

examples/offline_inference/vision_language.py

Lines changed: 32 additions & 0 deletions
```diff
@@ -1040,6 +1040,37 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
     )
 
 
+def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
+    model_name = "omni-research/Tarsier2-Recap-7b"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=4096,
+        hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]},
+        limit_mm_per_prompt={modality: 1},
+    )
+
+    if modality == "image":
+        placeholder = "<|image_pad|>"
+    elif modality == "video":
+        placeholder = "<|video_pad|>"
+
+    prompts = [
+        (
+            "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+            f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
+            f"{question}<|im_end|>\n"
+            "<|im_start|>assistant\n"
+        )
+        for question in questions
+    ]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
 # SkyworkR1V
 def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
     assert modality == "image"
@@ -1112,6 +1143,7 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
     "skywork_chat": run_skyworkr1v,
     "smolvlm": run_smolvlm,
     "tarsier": run_tarsier,
+    "tarsier2": run_tarsier2,
 }
```
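As a companion to `run_tarsier2` above, here is a hedged standalone sketch of how its prompt format could be driven end to end; the image file, question, and sampling settings are placeholders, and the snippet is not part of this commit:

```python
# Standalone sketch: single-image generation with the Tarsier2 prompt format
# copied from run_tarsier2. "example.jpg" is a placeholder path.
from PIL import Image

from vllm import LLM, SamplingParams

llm = LLM(
    model="omni-research/Tarsier2-Recap-7b",
    max_model_len=4096,
    hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]},
    limit_mm_per_prompt={"image": 1},
)

question = "What is the content of this image?"
prompt = (
    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
    f"{question}<|im_end|>\n"
    "<|im_start|>assistant\n"
)

outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": Image.open("example.jpg")}},
    SamplingParams(max_tokens=128),
)
print(outputs[0].outputs[0].text)
```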

examples/offline_inference/vision_language_multi_image.py

Lines changed: 27 additions & 0 deletions
```diff
@@ -828,6 +828,32 @@ def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData:
     )
 
 
+def load_tarsier2(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "omni-research/Tarsier2-Recap-7b"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=32768,
+        limit_mm_per_prompt={"image": len(image_urls)},
+        hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]},
+    )
+
+    prompt = (
+        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+        f"<|im_start|>user\n<|vision_start|>{'<|image_pad|>' * len(image_urls)}"
+        f"<|vision_end|>{question}<|im_end|>\n"
+        "<|im_start|>assistant\n"
+    )
+    image_data = [fetch_image(url) for url in image_urls]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=image_data,
+    )
+
+
 model_example_map = {
     "aria": load_aria,
     "aya_vision": load_aya_vision,
@@ -853,6 +879,7 @@ def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData:
     "qwen2_5_vl": load_qwen2_5_vl,
     "smolvlm": load_smolvlm,
    "tarsier": load_tarsier,
+    "tarsier2": load_tarsier2,
 }
```
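Similarly, a hedged sketch of driving `load_tarsier2`'s multi-image prompt directly; the URLs and question are placeholders, `fetch_image` is assumed to come from `vllm.multimodal.utils` as in the example file, and the snippet is not part of this commit:

```python
# Standalone multi-image sketch; the URLs below are placeholders.
from vllm import LLM, SamplingParams
from vllm.multimodal.utils import fetch_image

image_urls = [
    "https://example.com/frame_0.jpg",
    "https://example.com/frame_1.jpg",
]

llm = LLM(
    model="omni-research/Tarsier2-Recap-7b",
    trust_remote_code=True,
    max_model_len=32768,
    limit_mm_per_prompt={"image": len(image_urls)},
    hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]},
)

question = "What are the differences between these images?"
prompt = (
    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    f"<|im_start|>user\n<|vision_start|>{'<|image_pad|>' * len(image_urls)}"
    f"<|vision_end|>{question}<|im_end|>\n"
    "<|im_start|>assistant\n"
)

outputs = llm.generate(
    {
        "prompt": prompt,
        "multi_modal_data": {"image": [fetch_image(url) for url in image_urls]},
    },
    SamplingParams(max_tokens=128),
)
print(outputs[0].outputs[0].text)
```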

tests/models/multimodal/processing/test_common.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -284,6 +284,7 @@ def _test_processing_correctness_one(
     "fixie-ai/ultravox-v0_5-llama-3_2-1b",
     "openai/whisper-large-v3",
     "omni-research/Tarsier-7b",
+    "omni-research/Tarsier2-Recap-7b"
 ])
 @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
 @pytest.mark.parametrize("num_batches", [32])
```

tests/models/registry.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -398,6 +398,8 @@ def check_available_online(
                                           trust_remote_code=True),
     "TarsierForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier-7b",  # noqa: E501
                                                        hf_overrides={"architectures": ["TarsierForConditionalGeneration"]}),  # noqa: E501
+    "Tarsier2ForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier2-Recap-7b",  # noqa: E501
+                                                        hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]}),  # noqa: E501
     # [Encoder-decoder]
     # Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
     # Therefore, we borrow the BartTokenizer from the original Bart model
```

vllm/model_executor/models/qwen2_vl.py

Lines changed: 88 additions & 1 deletion
```diff
@@ -32,12 +32,14 @@
 import torch.nn as nn
 import torch.nn.functional as F
 from einops import rearrange, repeat
-from transformers import BatchFeature
+from transformers import AutoConfig, BatchFeature
 from transformers.models.qwen2_vl import (Qwen2VLImageProcessor,
                                           Qwen2VLProcessor)
 from transformers.models.qwen2_vl.configuration_qwen2_vl import (
     Qwen2VLConfig, Qwen2VLVisionConfig)
 from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
+from transformers.models.qwen2_vl.video_processing_qwen2_vl import (
+    Qwen2VLVideoProcessor)
 
 from vllm.config import VllmConfig
 from vllm.distributed import parallel_state, tensor_model_parallel_all_gather
@@ -69,6 +71,7 @@
 from vllm.transformers_utils.config import uses_mrope
 from vllm.transformers_utils.processor import (
     cached_image_processor_from_config)
+from vllm.transformers_utils.tokenizer import AnyTokenizer
 
 from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
                          SupportsMultiModal, SupportsPP)
@@ -1405,3 +1408,87 @@ def get_mm_mapping(self) -> MultiModelKeys:
         connector="visual.merger.",
         tower_model="visual.",
     )
+
+
+class Tarsier2MultiModalProcessor(Qwen2VLMultiModalProcessor):
+    pass
+
+
+class Tarsier2ImageProcessor(Qwen2VLImageProcessor):
+
+    def __init__(
+        self,
+        size: Optional[dict[str, int]] = None,
+        **kwargs,
+    ) -> None:
+        if size is not None and "min_pixels" in size and "max_pixels" in size:
+            # Remap if Tarsier2-specific format is provided
+            remapped_size = {
+                "shortest_edge": size["min_pixels"],
+                "longest_edge": size["max_pixels"]
+            }
+            super().__init__(size=remapped_size, **kwargs)
+        else:
+            super().__init__(size=size, **kwargs)
+
+
+class Tarsier2Processor(Qwen2VLProcessor):
+
+    def __init__(
+        self,
+        vision_config: dict,
+        tokenizer: AnyTokenizer,
+        **kwargs,
+    ):
+        self.image_processor = Tarsier2ImageProcessor(**vision_config)
+        super().__init__(image_processor=self.image_processor,
+                         tokenizer=tokenizer,
+                         video_processor=Qwen2VLVideoProcessor(),
+                         chat_template=None,
+                         **kwargs)
+
+
+class Tarsier2ProcessingInfo(Qwen2VLProcessingInfo):
+
+    def get_hf_config(self) -> Qwen2VLConfig:
+        model_path = self.ctx.model_config.model
+        original_config = AutoConfig.from_pretrained(model_path)
+        config_dict = original_config.to_dict()
+        correct_config = Qwen2VLConfig.from_dict(config_dict)
+
+        return correct_config
+
+    def get_hf_processor(self, **kwargs: object) -> Tarsier2Processor:
+        return Tarsier2Processor(
+            vision_config=self.ctx.get_hf_image_processor_config(),
+            tokenizer=self.get_tokenizer(),
+            **kwargs,
+        )
+
+    def get_image_processor(self) -> Tarsier2ImageProcessor:
+        return Tarsier2ImageProcessor(
+            **self.ctx.get_hf_image_processor_config())
+
+
+@MULTIMODAL_REGISTRY.register_processor(Tarsier2MultiModalProcessor,
+                                        info=Tarsier2ProcessingInfo,
+                                        dummy_inputs=Qwen2VLDummyInputsBuilder)
+class Tarsier2ForConditionalGeneration(Qwen2VLForConditionalGeneration):
+    hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={
+        "vision_tower.": "visual.",
+    })
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        # Tarsier2 uses llava as model_type, which creates a Qwen2VLConfig as
+        # text_config; reconstruct the Qwen2VLConfig from the LlavaConfig.
+        config = vllm_config.model_config.hf_config
+        qwen2vl_config = config.text_config
+        qwen2vl_config.architectures = config.architectures
+        vllm_config.model_config.hf_config = qwen2vl_config
+        super().__init__(vllm_config=vllm_config, prefix=prefix)
+
+    def load_weights(self, weights: Iterable[tuple[str,
+                                                   torch.Tensor]]) -> set[str]:
+
+        loader = AutoWeightsLoader(self)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
```
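For readers wondering why `get_hf_config` and `Tarsier2ForConditionalGeneration.__init__` rebuild the config: per the in-code comment, the Tarsier2 checkpoint declares `llava` as its `model_type`, so the usable Qwen2-VL settings live in `text_config`. A small inspection sketch (an illustration only, not part of the commit; it downloads the config from the Hugging Face Hub):

```python
# Inspection sketch: show the config shape the Tarsier2 classes above work
# around. Expected, per the PR's comment: a llava-style top-level config whose
# text_config carries the Qwen2-VL settings that vLLM promotes to hf_config.
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("omni-research/Tarsier2-Recap-7b")
print(type(cfg).__name__)
print(type(cfg.text_config).__name__)
```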

vllm/model_executor/models/registry.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -217,6 +217,7 @@
     "UltravoxModel": ("ultravox", "UltravoxModel"),
     "Phi4MMForCausalLM": ("phi4mm", "Phi4MMForCausalLM"),
     "TarsierForConditionalGeneration": ("tarsier", "TarsierForConditionalGeneration"),  # noqa: E501
+    "Tarsier2ForConditionalGeneration": ("qwen2_vl", "Tarsier2ForConditionalGeneration"),  # noqa: E501
     # [Encoder-decoder]
     "Florence2ForConditionalGeneration": ("florence2", "Florence2ForConditionalGeneration"),  # noqa: E501
     "MllamaForConditionalGeneration": ("mllama", "MllamaForConditionalGeneration"),  # noqa: E501
```
