19 | 19 | from transformers import ( |
20 | 20 | AutoConfig, |
21 | 21 | AutoImageProcessor, |
22 | | - AutoModelForCausalLM, |
23 | | - AutoModelForVision2Seq, |
24 | 22 | GenerationConfig, |
25 | 23 | GenerationMixin, |
26 | 24 | PretrainedConfig, |
44 | 42 | ) |
45 | 43 |
46 | 44 |
47 | | -try: |
48 | | - from transformers import LlavaForConditionalGeneration |
49 | | -except ImportError: |
50 | | - LlavaForConditionalGeneration = None |
| 45 | +if is_transformers_version(">=", "4.46.0"): |
| 46 | + from transformers import AutoModelForImageTextToText |
51 | 47 |
52 | | -try: |
53 | | - from transformers import LlavaNextForConditionalGeneration |
54 | | -except ImportError: |
55 | | - LlavaNextForConditionalGeneration = None |
| 48 | + transformers_auto_class = AutoModelForImageTextToText |
| 49 | +else: |
| 50 | + from transformers import AutoModelForVision2Seq |
| 51 | + |
| 52 | + transformers_auto_class = AutoModelForVision2Seq |
56 | 53 |
57 | 54 |
58 | 55 | if TYPE_CHECKING: |
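For readability, the new import logic reassembles to the following (new code path only, exactly as added by this diff; is_transformers_version is assumed to be imported elsewhere in the module):

if is_transformers_version(">=", "4.46.0"):
    from transformers import AutoModelForImageTextToText

    transformers_auto_class = AutoModelForImageTextToText
else:
    from transformers import AutoModelForVision2Seq

    transformers_auto_class = AutoModelForVision2Seq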
@@ -346,7 +343,7 @@ def forward(self, audio_feature, audio_mask): |
346 | 343 | class OVModelForVisualCausalLM(OVBaseModel, GenerationMixin): |
347 | 344 | export_feature = "image-text-to-text" |
348 | 345 | additional_parts = [] |
349 | | - auto_model_class = AutoModelForCausalLM |
| 346 | + auto_model_class = transformers_auto_class |
350 | 347 |
351 | 348 | def __init__( |
352 | 349 | self, |
@@ -412,10 +409,7 @@ def __init__( |
412 | 409 |
413 | 410 | # Avoid warnings when creating a transformers pipeline |
414 | 411 | AutoConfig.register(self.base_model_prefix, AutoConfig) |
415 | | - try: |
416 | | - self.auto_model_class.register(AutoConfig, self.__class__) |
417 | | - except AttributeError: |
418 | | - pass |
| 412 | + self.auto_model_class.register(AutoConfig, self.__class__) |
419 | 413 |
420 | 414 | def clear_requests(self): |
421 | 415 | if self._compile_only: |
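Dropping the try/except around register looks safe here: with the version-gated alias above, auto_model_class always resolves to a real transformers auto class (previously a failed Llava import could leave a subclass's auto_model_class set to None, so register raised AttributeError). As a rough standalone sketch of the registration pattern itself (hypothetical model_type and classes, not this PR's actual code; assumes transformers >= 4.46.0 for AutoModelForImageTextToText):

from transformers import AutoConfig, AutoModelForImageTextToText, PretrainedConfig, PreTrainedModel


class MyVLMConfig(PretrainedConfig):
    # Hypothetical model type, used only to illustrate the registration API.
    model_type = "my-ov-vlm"


class MyVLMWrapper(PreTrainedModel):
    config_class = MyVLMConfig


# After registration, the auto class (and transformers pipelines built on it)
# can resolve MyVLMWrapper from a MyVLMConfig without emitting warnings.
AutoConfig.register("my-ov-vlm", MyVLMConfig)
AutoModelForImageTextToText.register(MyVLMConfig, MyVLMWrapper)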
@@ -931,8 +925,6 @@ def preprocess_inputs( |
931 | 925 |
932 | 926 |
933 | 927 | class _OVLlavaForCausalLM(OVModelForVisualCausalLM): |
934 | | - auto_model_class = LlavaForConditionalGeneration |
935 | | - |
936 | 928 | def __init__( |
937 | 929 | self, |
938 | 930 | language_model: ov.Model, |
@@ -1137,8 +1129,6 @@ def preprocess_inputs( |
1137 | 1129 |
1138 | 1130 |
1139 | 1131 | class _OVLlavaNextForCausalLM(_OVLlavaForCausalLM): |
1140 | | - auto_model_class = LlavaNextForConditionalGeneration |
1141 | | - |
1142 | 1132 | # Adopted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llava_next/modeling_llava_next.py#L655 |
1143 | 1133 | def pack_image_features(self, image_features, image_sizes, image_newline=None): |
1144 | 1134 | from transformers.models.llava_next.modeling_llava_next import get_anyres_image_grid_shape, unpad_image |
@@ -1433,7 +1423,6 @@ def get_text_embeddings(self, input_ids, **kwargs): |
1433 | 1423 |
1434 | 1424 | class _OVLlavaNextVideoForCausalLM(_OVLlavaNextForCausalLM): |
1435 | 1425 | additional_parts = ["vision_resampler", "multi_modal_projector"] |
1436 | | - auto_model_class = AutoModelForVision2Seq |
1437 | 1426 |
1438 | 1427 | def get_vision_embeddings(self, pixel_values, input_ids=None, **kwargs): |
1439 | 1428 | if input_ids is not None and input_ids.shape[1] == 1: |
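With the per-architecture overrides removed here and in the Llava/LLaVA-NeXT classes above, every subclass simply inherits the version-resolved auto class from OVModelForVisualCausalLM. A quick, hypothetical way to confirm which class was picked in a given environment (assumes optimum-intel is installed; not part of this PR):

from optimum.intel import OVModelForVisualCausalLM

# Expected: "AutoModelForImageTextToText" on transformers >= 4.46.0,
# "AutoModelForVision2Seq" on older releases.
print(OVModelForVisualCausalLM.auto_model_class.__name__)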