32 | 32 | import torch.nn as nn
33 | 33 | import torch.nn.functional as F
34 | 34 | from einops import rearrange, repeat
35 |    | -from transformers import BatchFeature
   | 35 | +from transformers import AutoConfig, BatchFeature
36 | 36 | from transformers.models.qwen2_vl import (Qwen2VLImageProcessor,
37 | 37 |                                            Qwen2VLProcessor)
38 | 38 | from transformers.models.qwen2_vl.configuration_qwen2_vl import (
39 | 39 |     Qwen2VLConfig, Qwen2VLVisionConfig)
40 | 40 | from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
   | 41 | +from transformers.models.qwen2_vl.video_processing_qwen2_vl import (
   | 42 | +    Qwen2VLVideoProcessor)
41 | 43 |
42 | 44 | from vllm.config import VllmConfig
43 | 45 | from vllm.distributed import parallel_state, tensor_model_parallel_all_gather

69 | 71 | from vllm.transformers_utils.config import uses_mrope
70 | 72 | from vllm.transformers_utils.processor import (
71 | 73 |     cached_image_processor_from_config)
   | 74 | +from vllm.transformers_utils.tokenizer import AnyTokenizer
72 | 75 |
73 | 76 | from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
74 | 77 |                          SupportsMultiModal, SupportsPP)
@@ -1405,3 +1408,87 @@ def get_mm_mapping(self) -> MultiModelKeys:
1405 | 1408 |             connector="visual.merger.",
1406 | 1409 |             tower_model="visual.",
1407 | 1410 |         )
     | 1411 | +
     | 1412 | +
     | 1413 | +class Tarsier2MultiModalProcessor(Qwen2VLMultiModalProcessor):
     | 1414 | +    pass
     | 1415 | +
     | 1416 | +
     | 1417 | +class Tarsier2ImageProcessor(Qwen2VLImageProcessor):
     | 1418 | +
     | 1419 | +    def __init__(
     | 1420 | +        self,
     | 1421 | +        size: Optional[dict[str, int]] = None,
     | 1422 | +        **kwargs,
     | 1423 | +    ) -> None:
     | 1424 | +        if size is not None and "min_pixels" in size and "max_pixels" in size:
     | 1425 | +            # Remap if Tarsier2-specific format is provided
     | 1426 | +            remapped_size = {
     | 1427 | +                "shortest_edge": size["min_pixels"],
     | 1428 | +                "longest_edge": size["max_pixels"]
     | 1429 | +            }
     | 1430 | +            super().__init__(size=remapped_size, **kwargs)
     | 1431 | +        else:
     | 1432 | +            super().__init__(size=size, **kwargs)
     | 1433 | +
     | 1434 | +
     | 1435 | +class Tarsier2Processor(Qwen2VLProcessor):
     | 1436 | +
     | 1437 | +    def __init__(
     | 1438 | +        self,
     | 1439 | +        vision_config: dict,
     | 1440 | +        tokenizer: AnyTokenizer,
     | 1441 | +        **kwargs,
     | 1442 | +    ):
     | 1443 | +        self.image_processor = Tarsier2ImageProcessor(**vision_config)
     | 1444 | +        super().__init__(image_processor=self.image_processor,
     | 1445 | +                         tokenizer=tokenizer,
     | 1446 | +                         video_processor=Qwen2VLVideoProcessor(),
     | 1447 | +                         chat_template=None,
     | 1448 | +                         **kwargs)
     | 1449 | +
     | 1450 | +
     | 1451 | +class Tarsier2ProcessingInfo(Qwen2VLProcessingInfo):
     | 1452 | +
     | 1453 | +    def get_hf_config(self) -> Qwen2VLConfig:
     | 1454 | +        model_path = self.ctx.model_config.model
     | 1455 | +        original_config = AutoConfig.from_pretrained(model_path)
     | 1456 | +        config_dict = original_config.to_dict()
     | 1457 | +        correct_config = Qwen2VLConfig.from_dict(config_dict)
     | 1458 | +
     | 1459 | +        return correct_config
     | 1460 | +
     | 1461 | +    def get_hf_processor(self, **kwargs: object) -> Tarsier2Processor:
     | 1462 | +        return Tarsier2Processor(
     | 1463 | +            vision_config=self.ctx.get_hf_image_processor_config(),
     | 1464 | +            tokenizer=self.get_tokenizer(),
     | 1465 | +            **kwargs,
     | 1466 | +        )
     | 1467 | +
     | 1468 | +    def get_image_processor(self) -> Tarsier2ImageProcessor:
     | 1469 | +        return Tarsier2ImageProcessor(
     | 1470 | +            **self.ctx.get_hf_image_processor_config())
     | 1471 | +
     | 1472 | +
     | 1473 | +@MULTIMODAL_REGISTRY.register_processor(Tarsier2MultiModalProcessor,
     | 1474 | +                                        info=Tarsier2ProcessingInfo,
     | 1475 | +                                        dummy_inputs=Qwen2VLDummyInputsBuilder)
     | 1476 | +class Tarsier2ForConditionalGeneration(Qwen2VLForConditionalGeneration):
     | 1477 | +    hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={
     | 1478 | +        "vision_tower.": "visual.",
     | 1479 | +    })
     | 1480 | +
     | 1481 | +    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
     | 1482 | +        # Tarsier2 uses llava as its model_type, so hf_config is a LlavaConfig
     | 1483 | +        # whose text_config is the Qwen2VLConfig; swap it in before init.
     | 1484 | +        config = vllm_config.model_config.hf_config
     | 1485 | +        qwen2vl_config = config.text_config
     | 1486 | +        qwen2vl_config.architectures = config.architectures
     | 1487 | +        vllm_config.model_config.hf_config = qwen2vl_config
     | 1488 | +        super().__init__(vllm_config=vllm_config, prefix=prefix)
     | 1489 | +
     | 1490 | +    def load_weights(self, weights: Iterable[tuple[str,
     | 1491 | +                                                   torch.Tensor]]) -> set[str]:
     | 1492 | +
     | 1493 | +        loader = AutoWeightsLoader(self)
     | 1494 | +        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
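
A quick note on `Tarsier2ImageProcessor`: Tarsier2 checkpoints carry their pixel budget as `min_pixels`/`max_pixels` inside the `size` dict, and the subclass remaps those keys to the `shortest_edge`/`longest_edge` form that the parent `Qwen2VLImageProcessor` consumes before delegating. A minimal standalone sketch of that remapping; the numeric values below are illustrative only, not taken from a real Tarsier2 `preprocessor_config.json`:

```python
from typing import Optional


def remap_tarsier2_size(
        size: Optional[dict[str, int]]) -> Optional[dict[str, int]]:
    """Replays the key remapping done in Tarsier2ImageProcessor.__init__."""
    if size is not None and "min_pixels" in size and "max_pixels" in size:
        # Tarsier2-style keys -> Qwen2-VL image processor keys.
        return {
            "shortest_edge": size["min_pixels"],
            "longest_edge": size["max_pixels"],
        }
    # Any other size dict (or None) is passed through unchanged.
    return size


# Illustrative values only (not taken from a real checkpoint config).
print(remap_tarsier2_size({"min_pixels": 256 * 28 * 28,
                           "max_pixels": 1280 * 28 * 28}))
# {'shortest_edge': 200704, 'longest_edge': 1003520}
```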
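
And the config swap in `Tarsier2ForConditionalGeneration.__init__`, replayed on a mocked config so it runs without a checkpoint; `SimpleNamespace` stands in for the real `LlavaConfig`/`Qwen2VLConfig` pair, so only the attribute names the diff actually touches (`text_config`, `architectures`, `model_type`) are assumed here:

```python
from types import SimpleNamespace

# Stand-in for the HF config a Tarsier2 checkpoint loads as: model_type is
# "llava" at the top level, with the Qwen2-VL decoder config nested in
# text_config.
llava_style_config = SimpleNamespace(
    model_type="llava",
    architectures=["Tarsier2ForConditionalGeneration"],
    text_config=SimpleNamespace(model_type="qwen2_vl", architectures=None),
)

# Same swap as in the diff: promote text_config to the top-level hf_config and
# carry the architectures list over, so the Qwen2-VL code path downstream sees
# a Qwen2VLConfig-shaped object.
qwen2vl_config = llava_style_config.text_config
qwen2vl_config.architectures = llava_style_config.architectures

print(qwen2vl_config.model_type, qwen2vl_config.architectures)
# qwen2_vl ['Tarsier2ForConditionalGeneration']
```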