diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 4ce8cd01de4d..6ced23932e7d 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -1861,6 +1861,10 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH _supports_attention_backend = False _can_record_outputs = None + # Attributes used mainly in multimodal LLMs, though all models have a valid value for these fields + # Possible values are: text, image, video, audio and time + input_modalities: Union[str, list[str]] = "text" # most models are text + @property @torch._dynamo.allow_in_graph def can_record_outputs(self) -> dict[str, OutputRecorder]: @@ -2224,6 +2228,20 @@ def base_model(self) -> nn.Module: """ return getattr(self, self.base_model_prefix, self) + @classmethod + def output_modalities(cls) -> Optional[Union[str, list[str]]]: + """ + Returns the output modalities that a model can generate. For non-generative models, + returns `None`. Multimodal models that can output several modalities or non-text modalities + should override this method. + + Returns: + `Union[str, list[str]]`: Output modalities supported by models that can call `.generate()`. + """ + if cls.can_generate(): + return "text" + return None + @classmethod def can_generate(cls) -> bool: """ diff --git a/src/transformers/models/aimv2/modeling_aimv2.py b/src/transformers/models/aimv2/modeling_aimv2.py index c29270b5687d..e61d4d6d9ac6 100644 --- a/src/transformers/models/aimv2/modeling_aimv2.py +++ b/src/transformers/models/aimv2/modeling_aimv2.py @@ -393,6 +393,7 @@ class Aimv2PreTrainedModel(PreTrainedModel): config: Aimv2Config base_model_prefix = "aimv2" + input_modalities = "image" supports_gradient_checkpointing = True _no_split_modules = [ "Aimv2EncoderLayer", diff --git a/src/transformers/models/aimv2/modular_aimv2.py b/src/transformers/models/aimv2/modular_aimv2.py index 18ef50e5bcc1..4238d73b81b7 100644 --- a/src/transformers/models/aimv2/modular_aimv2.py +++ b/src/transformers/models/aimv2/modular_aimv2.py @@ -437,6 +437,7 @@ class Aimv2PreTrainedModel(PreTrainedModel): config: Aimv2Config base_model_prefix = "aimv2" + input_modalities = "image" supports_gradient_checkpointing = True _no_split_modules = [ "Aimv2EncoderLayer", diff --git a/src/transformers/models/align/modeling_align.py b/src/transformers/models/align/modeling_align.py index f55c84b47176..a296e9f0b93f 100644 --- a/src/transformers/models/align/modeling_align.py +++ b/src/transformers/models/align/modeling_align.py @@ -839,6 +839,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class AlignPreTrainedModel(PreTrainedModel): config: AlignConfig base_model_prefix = "align" + input_modalities = ["image", "text"] supports_gradient_checkpointing = True def _init_weights(self, module: nn.Module): @@ -868,6 +869,7 @@ def _init_weights(self, module: nn.Module): ) class AlignTextModel(AlignPreTrainedModel): config: AlignTextConfig + input_modalities = "text" _no_split_modules = ["AlignTextEmbeddings"] def __init__(self, config: AlignTextConfig, add_pooling_layer: bool = True): @@ -988,6 +990,7 @@ def forward( class AlignVisionModel(AlignPreTrainedModel): config: AlignVisionConfig main_input_name = "pixel_values" + input_modalities = "image" supports_gradient_checkpointing = False def __init__(self, config: AlignVisionConfig): diff --git a/src/transformers/models/altclip/modeling_altclip.py b/src/transformers/models/altclip/modeling_altclip.py index f40caa7af4ce..e3fc1b7ef93c
100755 --- a/src/transformers/models/altclip/modeling_altclip.py +++ b/src/transformers/models/altclip/modeling_altclip.py @@ -810,6 +810,7 @@ def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=Fals class AltCLIPPreTrainedModel(PreTrainedModel): config: AltCLIPConfig base_model_prefix = "altclip" + input_modalities = ["image", "text"] supports_gradient_checkpointing = True _no_split_module = [] @@ -914,6 +915,7 @@ def forward( class AltCLIPVisionModel(AltCLIPPreTrainedModel): config: AltCLIPVisionConfig main_input_name = "pixel_values" + input_modalities = "image" def __init__(self, config: AltCLIPVisionConfig): super().__init__(config) @@ -1080,6 +1082,7 @@ def forward( class AltCLIPTextModel(AltCLIPPreTrainedModel): config: AltCLIPTextConfig + input_modalities = "text" def __init__(self, config): super().__init__(config) diff --git a/src/transformers/models/aria/modeling_aria.py b/src/transformers/models/aria/modeling_aria.py index fa6ef38045a3..693557072fff 100644 --- a/src/transformers/models/aria/modeling_aria.py +++ b/src/transformers/models/aria/modeling_aria.py @@ -574,6 +574,7 @@ def forward( class AriaTextPreTrainedModel(PreTrainedModel): config: AriaTextConfig base_model_prefix = "model" + input_modalities = ["image", "text"] _no_split_modules = ["AriaTextDecoderLayer", "AriaGroupedExpertsGemm"] supports_gradient_checkpointing = True _skip_keys_device_placement = "past_key_values" diff --git a/src/transformers/models/aria/modular_aria.py b/src/transformers/models/aria/modular_aria.py index 1d820c00cf0a..06f1915c03a3 100644 --- a/src/transformers/models/aria/modular_aria.py +++ b/src/transformers/models/aria/modular_aria.py @@ -1207,6 +1207,7 @@ def __init__(self, config: AriaTextConfig, layer_idx: int): class AriaTextPreTrainedModel(PreTrainedModel): config: AriaTextConfig base_model_prefix = "model" + input_modalities = ["image", "text"] _no_split_modules = ["AriaTextDecoderLayer", "AriaGroupedExpertsGemm"] supports_gradient_checkpointing = True _skip_keys_device_placement = "past_key_values" diff --git a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py index 928190579906..a7785828b76f 100644 --- a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py @@ -308,6 +308,7 @@ def forward(self, hidden_states: torch.Tensor) -> BaseModelOutput: class ASTPreTrainedModel(PreTrainedModel): config: ASTConfig base_model_prefix = "audio_spectrogram_transformer" + input_modalities = "audio" main_input_name = "input_values" supports_gradient_checkpointing = True _supports_sdpa = True diff --git a/src/transformers/models/autoformer/modeling_autoformer.py b/src/transformers/models/autoformer/modeling_autoformer.py index 9e583b0b8187..47847e4ebfea 100644 --- a/src/transformers/models/autoformer/modeling_autoformer.py +++ b/src/transformers/models/autoformer/modeling_autoformer.py @@ -825,6 +825,7 @@ def forward( class AutoformerPreTrainedModel(PreTrainedModel): config: AutoformerConfig base_model_prefix = "model" + input_modalities = "time" main_input_name = "past_values" supports_gradient_checkpointing = True diff --git a/src/transformers/models/aya_vision/modeling_aya_vision.py b/src/transformers/models/aya_vision/modeling_aya_vision.py index bccbea0264b7..39f9d70fcc7b 100644 --- 
a/src/transformers/models/aya_vision/modeling_aya_vision.py +++ b/src/transformers/models/aya_vision/modeling_aya_vision.py @@ -91,6 +91,7 @@ def pixel_shuffle(self, image_features): # B, S, D class AyaVisionPreTrainedModel(PreTrainedModel): config: AyaVisionConfig base_model_prefix = "" + input_modalities = ["image", "text"] supports_gradient_checkpointing = True _skip_keys_device_placement = "past_key_values" diff --git a/src/transformers/models/bark/modeling_bark.py b/src/transformers/models/bark/modeling_bark.py index e4e4cb5cdea2..7cd46c476b71 100644 --- a/src/transformers/models/bark/modeling_bark.py +++ b/src/transformers/models/bark/modeling_bark.py @@ -321,6 +321,7 @@ def forward( @auto_docstring class BarkPreTrainedModel(PreTrainedModel): config: BarkConfig + output_modalities = "audio" supports_gradient_checkpointing = False _supports_flash_attn = True diff --git a/src/transformers/models/beit/modeling_beit.py b/src/transformers/models/beit/modeling_beit.py index 0728600795d2..e82d4235d399 100755 --- a/src/transformers/models/beit/modeling_beit.py +++ b/src/transformers/models/beit/modeling_beit.py @@ -704,6 +704,7 @@ def forward( class BeitPreTrainedModel(PreTrainedModel): config: BeitConfig base_model_prefix = "beit" + input_modalities = "image" main_input_name = "pixel_values" supports_gradient_checkpointing = True _no_split_modules = ["BeitLayer"] diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index 616c6d31d339..916f99a1556e 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -624,6 +624,7 @@ def forward( class BitPreTrainedModel(PreTrainedModel): config: BitConfig base_model_prefix = "bit" + input_modalities = "image" main_input_name = "pixel_values" _no_split_modules = ["BitEmbeddings"] diff --git a/src/transformers/models/blip/modeling_blip.py b/src/transformers/models/blip/modeling_blip.py index 7a586e92f0ee..a269861b5d9c 100644 --- a/src/transformers/models/blip/modeling_blip.py +++ b/src/transformers/models/blip/modeling_blip.py @@ -414,6 +414,7 @@ def forward( class BlipPreTrainedModel(PreTrainedModel): config: BlipConfig base_model_prefix = "blip" + input_modalities = ["image", "text"] supports_gradient_checkpointing = True _no_split_modules = ["BlipEncoderLayer", "BlipTextEmbeddings"] _skip_keys_device_placement = ["past_key_values"] @@ -482,6 +483,7 @@ def forward( class BlipVisionModel(BlipPreTrainedModel): main_input_name = "pixel_values" + input_modalities = "image" config: BlipVisionConfig _can_record_outputs = { "hidden_states": BlipEncoderLayer, diff --git a/src/transformers/models/blip_2/modeling_blip_2.py b/src/transformers/models/blip_2/modeling_blip_2.py index 99823a69385d..b228340eb545 100644 --- a/src/transformers/models/blip_2/modeling_blip_2.py +++ b/src/transformers/models/blip_2/modeling_blip_2.py @@ -391,6 +391,7 @@ def forward( class Blip2PreTrainedModel(PreTrainedModel): config: Blip2Config base_model_prefix = "blip" + input_modalities = ["image", "text"] supports_gradient_checkpointing = True _supports_attention_backend = True _supports_flash_attn = True @@ -473,6 +474,7 @@ def forward( # Copied from transformers.models.blip.modeling_blip.BlipVisionModel with Blip->Blip2, BLIP->BLIP_2 class Blip2VisionModel(Blip2PreTrainedModel): main_input_name = "pixel_values" + input_modalities = "image" config: Blip2VisionConfig _can_record_outputs = { "hidden_states": Blip2EncoderLayer, @@ -1536,6 +1538,7 @@ def forward( @auto_docstring class 
Blip2VisionModelWithProjection(Blip2PreTrainedModel): main_input_name = "pixel_values" + input_modalities = "image" _keep_in_fp32_modules = ["query_tokens", "qformer"] _supports_flash_attn = False # because self.qformer does not support FA2 @@ -2007,6 +2010,7 @@ def generate( ) class Blip2ForImageTextRetrieval(Blip2PreTrainedModel): main_input_name = "pixel_values" + input_modalities = "image" _keep_in_fp32_modules = ["query_tokens", "qformer"] _supports_flash_attn = False # because self.qformer does not support FA2 diff --git a/src/transformers/models/blt/modeling_blt.py b/src/transformers/models/blt/modeling_blt.py index 3f5459ccbff0..eff8994c04f6 100644 --- a/src/transformers/models/blt/modeling_blt.py +++ b/src/transformers/models/blt/modeling_blt.py @@ -422,6 +422,7 @@ def forward( class BltPreTrainedModel(PreTrainedModel): config: BltConfig base_model_prefix = "" + input_modalities = ["image", "text"] supports_gradient_checkpointing = True _no_split_modules = ["BltTransformerLayer"] _can_compile_fullgraph = False # static cache cannot have different shapes for each layer diff --git a/src/transformers/models/bridgetower/modeling_bridgetower.py b/src/transformers/models/bridgetower/modeling_bridgetower.py index 896ee175c7b1..e6e0fb82ba28 100644 --- a/src/transformers/models/bridgetower/modeling_bridgetower.py +++ b/src/transformers/models/bridgetower/modeling_bridgetower.py @@ -995,6 +995,7 @@ def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_l class BridgeTowerPreTrainedModel(PreTrainedModel): config: BridgeTowerConfig base_model_prefix = "bridgetower" + input_modalities = ["image", "text"] supports_gradient_checkpointing = False _no_split_modules = ["BridgeTowerSelfAttention", "BridgeTowerResidualAttention"] _skip_keys_device_placement = "past_key_values" @@ -1028,6 +1029,7 @@ def _init_weights(self, module: nn.Module): class BridgeTowerVisionModel(BridgeTowerPreTrainedModel): config: BridgeTowerVisionConfig + input_modalities = "image" def __init__(self, config): super().__init__(config) @@ -1057,6 +1059,7 @@ def forward(self, image, image_mask=None, interpolate_pos_encoding=False): ) class BridgeTowerTextModel(BridgeTowerPreTrainedModel): config: BridgeTowerTextConfig + input_modalities = "text" def __init__(self, config, add_pooling_layer=True): r""" diff --git a/src/transformers/models/chameleon/modeling_chameleon.py b/src/transformers/models/chameleon/modeling_chameleon.py index 033b8ecd7c63..4587a68dca68 100644 --- a/src/transformers/models/chameleon/modeling_chameleon.py +++ b/src/transformers/models/chameleon/modeling_chameleon.py @@ -799,6 +799,7 @@ def convert_img2bpe(self, img_batch: torch.Tensor) -> torch.Tensor: class ChameleonPreTrainedModel(PreTrainedModel): config: ChameleonConfig base_model_prefix = "model" + input_modalities = ["image", "text"] supports_gradient_checkpointing = True _no_split_modules = ["ChameleonDecoderLayer", "ChameleonSwinDecoderLayer"] _skip_keys_device_placement = ["past_key_values", "causal_mask"] diff --git a/src/transformers/models/chinese_clip/modeling_chinese_clip.py b/src/transformers/models/chinese_clip/modeling_chinese_clip.py index 9872b397b318..c63565cb5701 100644 --- a/src/transformers/models/chinese_clip/modeling_chinese_clip.py +++ b/src/transformers/models/chinese_clip/modeling_chinese_clip.py @@ -578,6 +578,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class ChineseCLIPPreTrainedModel(PreTrainedModel): config: ChineseCLIPConfig base_model_prefix = "chinese_clip" + 
input_modalities = ["image", "text"] supports_gradient_checkpointing = True def _init_weights(self, module): @@ -814,6 +815,7 @@ class ChineseCLIPTextModel(ChineseCLIPPreTrainedModel): """ config: ChineseCLIPTextConfig + input_modalities = "text" _no_split_modules = ["ChineseCLIPTextEmbeddings"] def __init__(self, config, add_pooling_layer=True): @@ -929,6 +931,7 @@ def forward( class ChineseCLIPVisionModel(ChineseCLIPPreTrainedModel): config: ChineseCLIPVisionConfig main_input_name = "pixel_values" + input_modalities = "image" _no_split_modules = ["ChineseCLIPVisionEmbeddings", "ChineseCLIPVisionAttention"] def __init__(self, config: ChineseCLIPVisionConfig): diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 885286ea3f49..ed57db98ea63 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -1343,6 +1343,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class ClapPreTrainedModel(PreTrainedModel): config: ClapConfig base_model_prefix = "clap" + input_modalities = ["audio", "text"] supports_gradient_checkpointing = False def _init_weights(self, module: nn.Module): @@ -1372,6 +1373,7 @@ def _init_weights(self, module: nn.Module): class ClapAudioModel(ClapPreTrainedModel): config: ClapAudioConfig main_input_name = "input_features" + input_modalities = "audio" def __init__(self, config: ClapAudioConfig): super().__init__(config) @@ -1444,6 +1446,7 @@ def forward( ) class ClapTextModel(ClapPreTrainedModel): config: ClapTextConfig + input_modalities = "text" def __init__(self, config, add_pooling_layer=True): r""" @@ -1748,6 +1751,7 @@ def forward( @auto_docstring class ClapTextModelWithProjection(ClapPreTrainedModel): config: ClapTextConfig + input_modalities = "text" def __init__(self, config: ClapTextConfig): super().__init__(config) @@ -1814,6 +1818,7 @@ def forward( class ClapAudioModelWithProjection(ClapPreTrainedModel): config: ClapAudioConfig main_input_name = "input_features" + input_modalities = "audio" def __init__(self, config: ClapAudioConfig): super().__init__(config) diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py index 196381f33bbd..8cb2907af0a0 100644 --- a/src/transformers/models/clip/modeling_clip.py +++ b/src/transformers/models/clip/modeling_clip.py @@ -419,6 +419,7 @@ def forward( class CLIPPreTrainedModel(PreTrainedModel): config: CLIPConfig base_model_prefix = "clip" + input_modalities = ["image", "text"] supports_gradient_checkpointing = True _supports_sdpa = True _supports_flash_attn = True @@ -661,6 +662,7 @@ def forward( ) class CLIPTextModel(CLIPPreTrainedModel): config: CLIPTextConfig + input_modalities = "text" _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer"] _supports_flash_attn = False # mask creation only accounts for sdpa/eager @@ -768,6 +770,7 @@ def forward( class CLIPVisionModel(CLIPPreTrainedModel): config: CLIPVisionConfig main_input_name = "pixel_values" + input_modalities = "image" _no_split_modules = ["CLIPEncoderLayer"] def __init__(self, config: CLIPVisionConfig): @@ -1028,6 +1031,7 @@ def forward( @auto_docstring class CLIPTextModelWithProjection(CLIPPreTrainedModel): config: CLIPTextConfig + input_modalities = "text" _supports_flash_attn = False _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer"] @@ -1098,6 +1102,7 @@ def forward( class CLIPVisionModelWithProjection(CLIPPreTrainedModel): config: CLIPVisionConfig main_input_name = 
"pixel_values" + input_modalities = "image" def __init__(self, config: CLIPVisionConfig): super().__init__(config) @@ -1168,6 +1173,7 @@ def forward( ) class CLIPForImageClassification(CLIPPreTrainedModel): main_input_name = "pixel_values" + input_modalities = "image" def __init__(self, config: CLIPConfig) -> None: super().__init__(config) diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index 3db986aa040f..20ed2a716495 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -423,6 +423,7 @@ def forward( class CLIPSegPreTrainedModel(PreTrainedModel): config: CLIPSegConfig base_model_prefix = "clip" + input_modalities = ["image", "text"] supports_gradient_checkpointing = True def _init_weights(self, module): @@ -647,6 +648,7 @@ def forward( class CLIPSegTextModel(CLIPSegPreTrainedModel): config: CLIPSegTextConfig + input_modalities = "text" _no_split_modules = ["CLIPSegTextEmbeddings", "CLIPSegEncoderLayer"] @@ -752,6 +754,7 @@ def forward( class CLIPSegVisionModel(CLIPSegPreTrainedModel): config: CLIPSegVisionConfig main_input_name = "pixel_values" + input_modalities = "image" def __init__(self, config: CLIPSegVisionConfig): super().__init__(config) diff --git a/src/transformers/models/cohere2_vision/modeling_cohere2_vision.py b/src/transformers/models/cohere2_vision/modeling_cohere2_vision.py index 5337f0dac45a..c041ce831fe5 100644 --- a/src/transformers/models/cohere2_vision/modeling_cohere2_vision.py +++ b/src/transformers/models/cohere2_vision/modeling_cohere2_vision.py @@ -130,6 +130,7 @@ class Cohere2VisionCausalLMOutputWithPast(ModelOutput): class Cohere2VisionPreTrainedModel(PreTrainedModel): config: Cohere2VisionConfig base_model_prefix = "" + input_modalities = ["image", "text"] supports_gradient_checkpointing = True _skip_keys_device_placement = "past_key_values" diff --git a/src/transformers/models/colpali/modeling_colpali.py b/src/transformers/models/colpali/modeling_colpali.py index fe92252b9a80..16ced722841c 100644 --- a/src/transformers/models/colpali/modeling_colpali.py +++ b/src/transformers/models/colpali/modeling_colpali.py @@ -32,6 +32,7 @@ class ColPaliPreTrainedModel(PreTrainedModel): config: ColPaliConfig base_model_prefix = "model" + input_modalities = ["image", "text"] _no_split_modules = [] _supports_sdpa = True _supports_flash_attn = True diff --git a/src/transformers/models/colqwen2/modeling_colqwen2.py b/src/transformers/models/colqwen2/modeling_colqwen2.py index fc0f585531ae..0c22fb99c887 100644 --- a/src/transformers/models/colqwen2/modeling_colqwen2.py +++ b/src/transformers/models/colqwen2/modeling_colqwen2.py @@ -40,6 +40,7 @@ class ColQwen2PreTrainedModel(PreTrainedModel): config: ColQwen2Config base_model_prefix = "model" + input_modalities = ["image", "text"] _no_split_modules = [] _supports_sdpa = True _supports_flash_attn = True diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 2ee35cc19a3f..1b9660c8b22a 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -967,6 +967,7 @@ class ConditionalDetrPreTrainedModel(PreTrainedModel): config: ConditionalDetrConfig base_model_prefix = "model" main_input_name = "pixel_values" + input_modalities = "image" _no_split_modules = [r"ConditionalDetrConvEncoder", 
r"ConditionalDetrEncoderLayer", r"ConditionalDetrDecoderLayer"] def _init_weights(self, module): diff --git a/src/transformers/models/convnext/modeling_convnext.py b/src/transformers/models/convnext/modeling_convnext.py index e3224c29405f..bcdca46a84e6 100755 --- a/src/transformers/models/convnext/modeling_convnext.py +++ b/src/transformers/models/convnext/modeling_convnext.py @@ -236,6 +236,7 @@ class ConvNextPreTrainedModel(PreTrainedModel): config: ConvNextConfig base_model_prefix = "convnext" main_input_name = "pixel_values" + input_modalities = "image" _no_split_modules = ["ConvNextLayer"] _can_record_outputs = {} # hidden states are collected explicitly diff --git a/src/transformers/models/convnextv2/modeling_convnextv2.py b/src/transformers/models/convnextv2/modeling_convnextv2.py index 3bf6130824ed..d206ededf0ee 100644 --- a/src/transformers/models/convnextv2/modeling_convnextv2.py +++ b/src/transformers/models/convnextv2/modeling_convnextv2.py @@ -257,6 +257,7 @@ class ConvNextV2PreTrainedModel(PreTrainedModel): config: ConvNextV2Config base_model_prefix = "convnextv2" main_input_name = "pixel_values" + input_modalities = "image" _no_split_modules = ["ConvNextV2Layer"] def _init_weights(self, module): diff --git a/src/transformers/models/csm/modeling_csm.py b/src/transformers/models/csm/modeling_csm.py index 18b80b1ef12d..b0b957b3f2f7 100644 --- a/src/transformers/models/csm/modeling_csm.py +++ b/src/transformers/models/csm/modeling_csm.py @@ -366,6 +366,8 @@ def forward( class CsmPreTrainedModel(PreTrainedModel): config: CsmConfig base_model_prefix = "model" + input_modalities = ["audio", "text"] + input_modalities = "audio" supports_gradient_checkpointing = True _no_split_modules = ["CsmDecoderLayer"] _skip_keys_device_placement = ["past_key_values"] diff --git a/src/transformers/models/csm/modular_csm.py b/src/transformers/models/csm/modular_csm.py index 89a6e52a063b..b71f7411d6dd 100644 --- a/src/transformers/models/csm/modular_csm.py +++ b/src/transformers/models/csm/modular_csm.py @@ -124,6 +124,8 @@ class CsmDecoderLayer(LlamaDecoderLayer): class CsmPreTrainedModel(PreTrainedModel): config: CsmConfig base_model_prefix = "model" + input_modalities = ["audio", "text"] + input_modalities = "audio" supports_gradient_checkpointing = True _no_split_modules = ["CsmDecoderLayer"] _skip_keys_device_placement = ["past_key_values"] diff --git a/src/transformers/models/d_fine/modeling_d_fine.py b/src/transformers/models/d_fine/modeling_d_fine.py index cdc008e3c7bb..9918a933428e 100644 --- a/src/transformers/models/d_fine/modeling_d_fine.py +++ b/src/transformers/models/d_fine/modeling_d_fine.py @@ -441,6 +441,7 @@ class DFinePreTrainedModel(PreTrainedModel): config: DFineConfig base_model_prefix = "d_fine" main_input_name = "pixel_values" + input_modalities = "image" _no_split_modules = [r"DFineHybridEncoder", r"DFineDecoderLayer"] def _init_weights(self, module): diff --git a/src/transformers/models/dab_detr/modeling_dab_detr.py b/src/transformers/models/dab_detr/modeling_dab_detr.py index cbb7450c7f0b..891a90105667 100644 --- a/src/transformers/models/dab_detr/modeling_dab_detr.py +++ b/src/transformers/models/dab_detr/modeling_dab_detr.py @@ -812,6 +812,7 @@ class DabDetrPreTrainedModel(PreTrainedModel): config: DabDetrConfig base_model_prefix = "model" main_input_name = "pixel_values" + input_modalities = "image" _no_split_modules = [r"DabDetrConvEncoder", r"DabDetrEncoderLayer", r"DabDetrDecoderLayer"] def _init_weights(self, module): diff --git 
a/src/transformers/models/dac/modeling_dac.py b/src/transformers/models/dac/modeling_dac.py index e97c8183651e..81cfcbb931d4 100644 --- a/src/transformers/models/dac/modeling_dac.py +++ b/src/transformers/models/dac/modeling_dac.py @@ -557,6 +557,8 @@ def remove_weight_norm(self): """ ) class DacModel(DacPreTrainedModel): + input_modalities = "audio" + def __init__(self, config: DacConfig): super().__init__(config) self.config = config diff --git a/src/transformers/models/data2vec/modeling_data2vec_audio.py b/src/transformers/models/data2vec/modeling_data2vec_audio.py index 3107b6884778..b8db20969c95 100755 --- a/src/transformers/models/data2vec/modeling_data2vec_audio.py +++ b/src/transformers/models/data2vec/modeling_data2vec_audio.py @@ -493,6 +493,7 @@ class Data2VecAudioPreTrainedModel(PreTrainedModel): config: Data2VecAudioConfig base_model_prefix = "data2vec_audio" main_input_name = "input_values" + input_modalities = "audio" supports_gradient_checkpointing = True _supports_flash_attn = True _supports_sdpa = True diff --git a/src/transformers/models/data2vec/modeling_data2vec_vision.py b/src/transformers/models/data2vec/modeling_data2vec_vision.py index 2152c7e92bae..3ce848e3b581 100644 --- a/src/transformers/models/data2vec/modeling_data2vec_vision.py +++ b/src/transformers/models/data2vec/modeling_data2vec_vision.py @@ -718,6 +718,7 @@ def forward( class Data2VecVisionPreTrainedModel(PreTrainedModel): config: Data2VecVisionConfig base_model_prefix = "data2vec_vision" + input_modalities = "image" main_input_name = "pixel_values" supports_gradient_checkpointing = True _no_split_modules = ["Data2VecVisionLayer"] diff --git a/src/transformers/models/data2vec/modular_data2vec_audio.py b/src/transformers/models/data2vec/modular_data2vec_audio.py index 91cb04730e4a..142bf7a5e783 100644 --- a/src/transformers/models/data2vec/modular_data2vec_audio.py +++ b/src/transformers/models/data2vec/modular_data2vec_audio.py @@ -138,6 +138,7 @@ class Data2VecAudioPreTrainedModel(PreTrainedModel, Wav2Vec2PreTrainedModel): config: Data2VecAudioConfig base_model_prefix = "data2vec_audio" main_input_name = "input_values" + input_modalities = "audio" supports_gradient_checkpointing = True _supports_flash_attn = True _supports_sdpa = True diff --git a/src/transformers/models/deepseek_vl/modeling_deepseek_vl.py b/src/transformers/models/deepseek_vl/modeling_deepseek_vl.py index ce884da8d08b..cbf516483188 100644 --- a/src/transformers/models/deepseek_vl/modeling_deepseek_vl.py +++ b/src/transformers/models/deepseek_vl/modeling_deepseek_vl.py @@ -122,6 +122,8 @@ def forward(self, vision_encodings: torch.Tensor) -> torch.Tensor: class DeepseekVLPreTrainedModel(PreTrainedModel): config: DeepseekVLConfig base_model_prefix = "model" + input_modalities = ["image", "text"] + output_modalities = "text" supports_gradient_checkpointing = True _no_split_modules = ["LlamaDecoderLayer"] _skip_keys_device_placement = ["past_key_values", "causal_mask"] diff --git a/src/transformers/models/deepseek_vl/modular_deepseek_vl.py b/src/transformers/models/deepseek_vl/modular_deepseek_vl.py index ed5f7d655e34..bc9b1d4bca0e 100644 --- a/src/transformers/models/deepseek_vl/modular_deepseek_vl.py +++ b/src/transformers/models/deepseek_vl/modular_deepseek_vl.py @@ -133,6 +133,7 @@ def forward(self, vision_encodings: torch.Tensor) -> torch.Tensor: class DeepseekVLPreTrainedModel(JanusPreTrainedModel): + output_modalities = "text" _no_split_modules = ["LlamaDecoderLayer"] def _init_weights(self, module): diff --git 
a/src/transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py index d9a85654e901..11edfa97ef11 100644 --- a/src/transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +++ b/src/transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py @@ -204,6 +204,8 @@ def forward( class DeepseekVLHybridPreTrainedModel(PreTrainedModel): config: DeepseekVLHybridConfig base_model_prefix = "model" + input_modalities = ["image", "text"] + output_modalities = "text" supports_gradient_checkpointing = True _no_split_modules = ["LlamaDecoderLayer"] _skip_keys_device_placement = ["past_key_values", "causal_mask"] diff --git a/src/transformers/models/deformable_detr/modeling_deformable_detr.py b/src/transformers/models/deformable_detr/modeling_deformable_detr.py index 657e78be87ef..04a45b413c73 100755 --- a/src/transformers/models/deformable_detr/modeling_deformable_detr.py +++ b/src/transformers/models/deformable_detr/modeling_deformable_detr.py @@ -923,6 +923,7 @@ class DeformableDetrPreTrainedModel(PreTrainedModel): config: DeformableDetrConfig base_model_prefix = "model" main_input_name = "pixel_values" + input_modalities = "image" supports_gradient_checkpointing = True _no_split_modules = [ r"DeformableDetrConvEncoder", diff --git a/src/transformers/models/deit/modeling_deit.py b/src/transformers/models/deit/modeling_deit.py index dfa2f191e789..b28ae27fcabb 100644 --- a/src/transformers/models/deit/modeling_deit.py +++ b/src/transformers/models/deit/modeling_deit.py @@ -374,6 +374,7 @@ class DeiTPreTrainedModel(PreTrainedModel): config: DeiTConfig base_model_prefix = "deit" main_input_name = "pixel_values" + input_modalities = "image" supports_gradient_checkpointing = True _no_split_modules = ["DeiTLayer"] _supports_sdpa = True diff --git a/src/transformers/models/depth_anything/modeling_depth_anything.py b/src/transformers/models/depth_anything/modeling_depth_anything.py index 5710016bd513..862b77807d3a 100644 --- a/src/transformers/models/depth_anything/modeling_depth_anything.py +++ b/src/transformers/models/depth_anything/modeling_depth_anything.py @@ -213,6 +213,7 @@ class DepthAnythingPreTrainedModel(PreTrainedModel): config: DepthAnythingConfig base_model_prefix = "depth_anything" main_input_name = "pixel_values" + input_modalities = "image" supports_gradient_checkpointing = True def _init_weights(self, module): diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 153ddfc1f513..c8a90eaaef02 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -602,6 +602,7 @@ class DepthProPreTrainedModel(PreTrainedModel): config: DepthProConfig base_model_prefix = "depth_pro" main_input_name = "pixel_values" + input_modalities = "image" supports_gradient_checkpointing = True _supports_sdpa = True _no_split_modules = ["DepthProPreActResidualLayer"] diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py index 89441a8b1246..f0378c25a381 100644 --- a/src/transformers/models/detr/modeling_detr.py +++ b/src/transformers/models/detr/modeling_detr.py @@ -724,6 +724,7 @@ class DetrPreTrainedModel(PreTrainedModel): config: DetrConfig base_model_prefix = "model" main_input_name = "pixel_values" + input_modalities = "image" _no_split_modules = [r"DetrConvEncoder", r"DetrEncoderLayer", r"DetrDecoderLayer"] 
def _init_weights(self, module): diff --git a/src/transformers/models/dia/modeling_dia.py b/src/transformers/models/dia/modeling_dia.py index 3025c8de4faa..39061f2a4a4e 100644 --- a/src/transformers/models/dia/modeling_dia.py +++ b/src/transformers/models/dia/modeling_dia.py @@ -64,6 +64,7 @@ class DiaPreTrainedModel(PreTrainedModel): config: DiaConfig base_model_prefix = "model" + output_modalities = "audio" supports_gradient_checkpointing = True _supports_flash_attn = True _supports_sdpa = True diff --git a/src/transformers/models/dia/modular_dia.py b/src/transformers/models/dia/modular_dia.py index 432f0298430c..261d9c582cb0 100644 --- a/src/transformers/models/dia/modular_dia.py +++ b/src/transformers/models/dia/modular_dia.py @@ -58,6 +58,7 @@ class DiaPreTrainedModel(PreTrainedModel): config: DiaConfig base_model_prefix = "model" + output_modalities = "audio" supports_gradient_checkpointing = True _supports_flash_attn = True _supports_sdpa = True diff --git a/src/transformers/models/dinat/modeling_dinat.py b/src/transformers/models/dinat/modeling_dinat.py index a65b4862c473..d42bf14257a8 100644 --- a/src/transformers/models/dinat/modeling_dinat.py +++ b/src/transformers/models/dinat/modeling_dinat.py @@ -579,6 +579,7 @@ class DinatPreTrainedModel(PreTrainedModel): config: DinatConfig base_model_prefix = "dinat" main_input_name = "pixel_values" + input_modalities = "image" def _init_weights(self, module): """Initialize the weights""" diff --git a/src/transformers/models/dinov2/modeling_dinov2.py b/src/transformers/models/dinov2/modeling_dinov2.py index 2bf76b9db35c..d0758e9820c5 100644 --- a/src/transformers/models/dinov2/modeling_dinov2.py +++ b/src/transformers/models/dinov2/modeling_dinov2.py @@ -423,6 +423,7 @@ class Dinov2PreTrainedModel(PreTrainedModel): config: Dinov2Config base_model_prefix = "dinov2" main_input_name = "pixel_values" + input_modalities = "image" supports_gradient_checkpointing = True _no_split_modules = ["Dinov2Layer"] _supports_sdpa = True diff --git a/src/transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py b/src/transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py index 5679f6bcf250..ca2f0407fd30 100644 --- a/src/transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +++ b/src/transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py @@ -440,6 +440,7 @@ class Dinov2WithRegistersPreTrainedModel(PreTrainedModel): config: Dinov2WithRegistersConfig base_model_prefix = "dinov2_with_registers" main_input_name = "pixel_values" + input_modalities = "image" supports_gradient_checkpointing = True _no_split_modules = ["Dinov2WithRegistersLayer"] _supports_sdpa = True diff --git a/src/transformers/models/dinov3_convnext/modeling_dinov3_convnext.py b/src/transformers/models/dinov3_convnext/modeling_dinov3_convnext.py index 8eef42c03d17..cf40717fd025 100644 --- a/src/transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +++ b/src/transformers/models/dinov3_convnext/modeling_dinov3_convnext.py @@ -189,6 +189,7 @@ class DINOv3ConvNextPreTrainedModel(PreTrainedModel): config: DINOv3ConvNextConfig base_model_prefix = "dinov3_convnext" main_input_name = "pixel_values" + input_modalities = "image" _no_split_modules = ["DINOv3ConvNextLayer"] def _init_weights(self, module): diff --git a/src/transformers/models/dinov3_vit/modeling_dinov3_vit.py b/src/transformers/models/dinov3_vit/modeling_dinov3_vit.py index 99b9794c3235..4f63958dc6e3 100644 --- 
a/src/transformers/models/dinov3_vit/modeling_dinov3_vit.py +++ b/src/transformers/models/dinov3_vit/modeling_dinov3_vit.py @@ -435,6 +435,7 @@ class DINOv3ViTPreTrainedModel(PreTrainedModel): config: DINOv3ViTConfig base_model_prefix = "dinov3_vit" main_input_name = "pixel_values" + input_modalities = "image" supports_gradient_checkpointing = True _no_split_modules = ["DINOv3ViTLayer"] _supports_sdpa = True diff --git a/src/transformers/models/donut/modeling_donut_swin.py b/src/transformers/models/donut/modeling_donut_swin.py index d388e386ae49..dcf5439294aa 100644 --- a/src/transformers/models/donut/modeling_donut_swin.py +++ b/src/transformers/models/donut/modeling_donut_swin.py @@ -804,6 +804,7 @@ class DonutSwinPreTrainedModel(PreTrainedModel): config: DonutSwinConfig base_model_prefix = "donut" main_input_name = "pixel_values" + input_modalities = "image" supports_gradient_checkpointing = True _no_split_modules = ["DonutSwinStage"] diff --git a/src/transformers/models/dpt/modeling_dpt.py b/src/transformers/models/dpt/modeling_dpt.py index f302b8dc94ae..9fe99523c2f4 100755 --- a/src/transformers/models/dpt/modeling_dpt.py +++ b/src/transformers/models/dpt/modeling_dpt.py @@ -741,6 +741,7 @@ class DPTPreTrainedModel(PreTrainedModel): config: DPTConfig base_model_prefix = "dpt" main_input_name = "pixel_values" + input_modalities = "image" supports_gradient_checkpointing = True _supports_sdpa = True _supports_flash_attn = True diff --git a/src/transformers/models/edgetam/modeling_edgetam.py b/src/transformers/models/edgetam/modeling_edgetam.py index a3eef22dce55..ae2898657325 100644 --- a/src/transformers/models/edgetam/modeling_edgetam.py +++ b/src/transformers/models/edgetam/modeling_edgetam.py @@ -302,6 +302,7 @@ class EdgeTamPreTrainedModel(PreTrainedModel): config_class = EdgeTamConfig base_model_prefix = "edgetam" main_input_name = "pixel_values" + input_modalities = "image" _supports_sdpa = True _supports_flash_attn_2 = True _supports_attention_backend = True @@ -918,6 +919,7 @@ def _dynamic_multimask_via_stability(self, all_mask_logits, all_iou_scores): """ ) class EdgeTamModel(EdgeTamPreTrainedModel): + input_modalities = ["image", "text"] _tied_weights_keys = ["prompt_encoder.shared_embedding.positional_embedding"] # need to be ignored, as it's a buffer and will not be correctly detected as tied weight _keys_to_ignore_on_load_missing = ["prompt_encoder.shared_embedding.positional_embedding"] diff --git a/src/transformers/models/edgetam_video/modeling_edgetam_video.py b/src/transformers/models/edgetam_video/modeling_edgetam_video.py index 3ba7ab4ebf2f..26814f4df931 100644 --- a/src/transformers/models/edgetam_video/modeling_edgetam_video.py +++ b/src/transformers/models/edgetam_video/modeling_edgetam_video.py @@ -773,6 +773,7 @@ class EdgeTamVideoPreTrainedModel(PreTrainedModel): config_class = EdgeTamVideoConfig base_model_prefix = "edgetam_video" main_input_name = "pixel_values" + input_modalities = "video" _supports_sdpa = True _supports_flash_attn_2 = True _supports_attention_backend = True @@ -1975,6 +1976,7 @@ def get_1d_sine_pe(pos_inds, dim, temperature=10000): @auto_docstring class EdgeTamVideoModel(EdgeTamVideoPreTrainedModel): + input_modalities = ["video", "text"] _tied_weights_keys = ["prompt_encoder.shared_embedding.positional_embedding"] # need to be ignored, as it's a buffer and will not be correctly detected as tied weight _keys_to_ignore_on_load_missing = ["prompt_encoder.shared_embedding.positional_embedding"] diff --git 
a/src/transformers/models/efficientloftr/modeling_efficientloftr.py b/src/transformers/models/efficientloftr/modeling_efficientloftr.py index 1fb93d48eb9b..c8c2351f5ea1 100644 --- a/src/transformers/models/efficientloftr/modeling_efficientloftr.py +++ b/src/transformers/models/efficientloftr/modeling_efficientloftr.py @@ -628,6 +628,7 @@ class EfficientLoFTRPreTrainedModel(PreTrainedModel): config_class = EfficientLoFTRConfig base_model_prefix = "efficientloftr" main_input_name = "pixel_values" + input_modalities = "image" supports_gradient_checkpointing = True _supports_flash_attn = True _supports_sdpa = True diff --git a/src/transformers/models/efficientnet/modeling_efficientnet.py b/src/transformers/models/efficientnet/modeling_efficientnet.py index e368fefa0e79..0e35f791f9d2 100644 --- a/src/transformers/models/efficientnet/modeling_efficientnet.py +++ b/src/transformers/models/efficientnet/modeling_efficientnet.py @@ -433,6 +433,7 @@ class EfficientNetPreTrainedModel(PreTrainedModel): config: EfficientNetConfig base_model_prefix = "efficientnet" main_input_name = "pixel_values" + input_modalities = "image" _no_split_modules = [] def _init_weights(self, module: nn.Module): diff --git a/src/transformers/models/emu3/modeling_emu3.py b/src/transformers/models/emu3/modeling_emu3.py index ad070efc1d3e..8f5951b5eba8 100644 --- a/src/transformers/models/emu3/modeling_emu3.py +++ b/src/transformers/models/emu3/modeling_emu3.py @@ -928,6 +928,7 @@ class Emu3VQVAE(PreTrainedModel): config: Emu3VQVAEConfig base_model_prefix = "emuvideovq" main_input_name = "pixel_values" + input_modalities = "image" _supports_sdpa = True _supports_flash_attn = True _supports_flex_attn = True @@ -1093,6 +1094,7 @@ def convert_bpe2img(self, img_batch: torch.Tensor) -> torch.Tensor: class Emu3PreTrainedModel(PreTrainedModel): config: Emu3Config base_model_prefix = "model" + input_modalities = ["image", "text"] supports_gradient_checkpointing = True _no_split_modules = [ "Emu3DecoderLayer", @@ -1105,6 +1107,7 @@ class Emu3PreTrainedModel(PreTrainedModel): _supports_param_buffer_assignment = False _supports_flex_attn = True _supports_attention_backend = True + output_modalities = ["image", "text"] class Emu3RotaryEmbedding(nn.Module): @@ -1145,6 +1148,7 @@ def forward(self, x, position_ids): @auto_docstring class Emu3TextModel(Emu3PreTrainedModel): + output_modalities = "text" _can_record_outputs = { "hidden_states": Emu3DecoderLayer, "attentions": Emu3Attention, diff --git a/src/transformers/models/emu3/modular_emu3.py b/src/transformers/models/emu3/modular_emu3.py index 32599727b24c..ca804949a35b 100644 --- a/src/transformers/models/emu3/modular_emu3.py +++ b/src/transformers/models/emu3/modular_emu3.py @@ -678,6 +678,7 @@ class Emu3VQVAE(PreTrainedModel): config: Emu3VQVAEConfig base_model_prefix = "emuvideovq" main_input_name = "pixel_values" + input_modalities = "image" _supports_sdpa = True _supports_flash_attn = True _supports_flex_attn = True @@ -840,6 +841,7 @@ def convert_bpe2img(self, img_batch: torch.Tensor) -> torch.Tensor: class Emu3PreTrainedModel(ChameleonPreTrainedModel, Emu3VQVAE): + output_modalities = ["image", "text"] _no_split_modules = [ "Emu3DecoderLayer", ] @@ -848,6 +850,7 @@ class Emu3PreTrainedModel(ChameleonPreTrainedModel, Emu3VQVAE): class Emu3TextModel(LlamaModel, Emu3PreTrainedModel): + output_modalities = "text" _can_record_outputs = { "hidden_states": Emu3DecoderLayer, "attentions": Emu3Attention, diff --git a/src/transformers/models/encodec/modeling_encodec.py 
b/src/transformers/models/encodec/modeling_encodec.py index c3c32f5bd61d..64f35422bf64 100644 --- a/src/transformers/models/encodec/modeling_encodec.py +++ b/src/transformers/models/encodec/modeling_encodec.py @@ -480,6 +480,8 @@ def _init_weights(self, module): """ ) class EncodecModel(EncodecPreTrainedModel): + output_modalities = "audio" + def __init__(self, config: EncodecConfig): super().__init__(config) self.config = config diff --git a/src/transformers/models/eomt/modeling_eomt.py b/src/transformers/models/eomt/modeling_eomt.py index a593678e7950..ea27d9791f96 100644 --- a/src/transformers/models/eomt/modeling_eomt.py +++ b/src/transformers/models/eomt/modeling_eomt.py @@ -986,6 +986,7 @@ class EomtPreTrainedModel(PreTrainedModel): config: EomtConfig base_model_prefix = "eomt" main_input_name = "pixel_values" + input_modalities = "image" supports_gradient_checkpointing = False _no_split_modules = ["EomtLayer"] _supports_sdpa = True diff --git a/src/transformers/models/eomt/modular_eomt.py b/src/transformers/models/eomt/modular_eomt.py index 7204a064e203..be66a7b7598d 100644 --- a/src/transformers/models/eomt/modular_eomt.py +++ b/src/transformers/models/eomt/modular_eomt.py @@ -392,6 +392,7 @@ class EomtPreTrainedModel(PreTrainedModel): config: EomtConfig base_model_prefix = "eomt" main_input_name = "pixel_values" + input_modalities = "image" supports_gradient_checkpointing = False _no_split_modules = ["EomtLayer"] _supports_sdpa = True diff --git a/src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py b/src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py index 5a2dc39385b3..f6b5aa988240 100644 --- a/src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +++ b/src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py @@ -980,6 +980,7 @@ class FastSpeech2ConformerPreTrainedModel(PreTrainedModel): base_model_prefix = "fastspeech2_conformer" main_input_name = "input_ids" + output_modalities = "audio" def _init_weights(self, module): """Initialize the weights""" diff --git a/src/transformers/models/flava/modeling_flava.py b/src/transformers/models/flava/modeling_flava.py index 5d63b5e132ad..75792a5682ac 100644 --- a/src/transformers/models/flava/modeling_flava.py +++ b/src/transformers/models/flava/modeling_flava.py @@ -683,6 +683,7 @@ def forward(self, hidden_states: torch.Tensor): class FlavaPreTrainedModel(PreTrainedModel): config: FlavaConfig base_model_prefix = "flava" + input_modalities = ["image", "text"] supports_gradient_checkpointing = True def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: @@ -718,6 +719,7 @@ class FlavaImageModel(FlavaPreTrainedModel): # This override allows us to load FlavaImageModel from FlavaModel/FlavaForPreTraining checkpoints. base_model_prefix = "flava.image_model" main_input_name = "pixel_values" + input_modalities = "image" def __init__(self, config: FlavaImageConfig, add_pooling_layer: bool = True): r""" @@ -805,6 +807,7 @@ class FlavaTextModel(FlavaPreTrainedModel): config: FlavaTextConfig # This override allows us to load FlavaTextModel from FlavaModel/FlavaForPreTraining checkpoints. 
base_model_prefix = "flava.text_model" + input_modalities = "text" def __init__(self, config: FlavaTextConfig, add_pooling_layer: bool = True): r""" @@ -1351,6 +1354,7 @@ class FlavaImageCodebook(FlavaPreTrainedModel): base_model_prefix = "" config: FlavaImageCodebookConfig main_input_name = "pixel_values" + input_modalities = "image" supports_gradient_checkpointing = False def __init__( diff --git a/src/transformers/models/florence2/modeling_florence2.py b/src/transformers/models/florence2/modeling_florence2.py index 9d1cb837e8ce..dcbbed5acf19 100644 --- a/src/transformers/models/florence2/modeling_florence2.py +++ b/src/transformers/models/florence2/modeling_florence2.py @@ -487,6 +487,7 @@ def forward(self, hidden_states: torch.Tensor): class Florence2VisionPreTrainedModel(PreTrainedModel): config_class = Florence2VisionConfig main_input_name = "pixel_values" + input_modalities = "image" _supports_sdpa = True _supports_flash_attn = True _supports_flex_attn = True @@ -617,6 +618,7 @@ class Florence2Seq2SeqLMOutput(Seq2SeqLMOutput): class Florence2PreTrainedModel(PreTrainedModel): config: Florence2Config base_model_prefix = "" + input_modalities = ["image", "text"] supports_gradient_checkpointing = True _skip_keys_device_placement = "past_key_values" diff --git a/src/transformers/models/florence2/modular_florence2.py b/src/transformers/models/florence2/modular_florence2.py index 64b2c307533c..2d307bcfeb92 100644 --- a/src/transformers/models/florence2/modular_florence2.py +++ b/src/transformers/models/florence2/modular_florence2.py @@ -1369,6 +1369,7 @@ def forward(self, hidden_states: torch.Tensor): class Florence2VisionPreTrainedModel(PreTrainedModel): config_class = Florence2VisionConfig main_input_name = "pixel_values" + input_modalities = "image" _supports_sdpa = True _supports_flash_attn = True _supports_flex_attn = True diff --git a/src/transformers/models/fuyu/modeling_fuyu.py b/src/transformers/models/fuyu/modeling_fuyu.py index 2095e9877c2c..fdacd7409615 100644 --- a/src/transformers/models/fuyu/modeling_fuyu.py +++ b/src/transformers/models/fuyu/modeling_fuyu.py @@ -35,6 +35,7 @@ class FuyuPreTrainedModel(PreTrainedModel): config: FuyuConfig base_model_prefix = "fuyu" + input_modalities = ["image", "text"] supports_gradient_checkpointing = True _supports_attention_backend = True _supports_flash_attn = True diff --git a/src/transformers/models/gemma3/modeling_gemma3.py b/src/transformers/models/gemma3/modeling_gemma3.py index 8c3317373146..6b280fe0ed0c 100644 --- a/src/transformers/models/gemma3/modeling_gemma3.py +++ b/src/transformers/models/gemma3/modeling_gemma3.py @@ -429,6 +429,7 @@ class Gemma3PreTrainedModel(PreTrainedModel): "hidden_states": Gemma3DecoderLayer, "attentions": Gemma3Attention, } + input_modalities = ["image", "text"] def _init_weights(self, module): super()._init_weights(module) @@ -455,6 +456,7 @@ def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool: @auto_docstring class Gemma3TextModel(Gemma3PreTrainedModel): config: Gemma3TextConfig + input_modalities = "text" def __init__(self, config: Gemma3TextConfig): super().__init__(config) @@ -1338,6 +1340,7 @@ class Gemma3TextForSequenceClassification(GenericForSequenceClassification, Gemm """ config: Gemma3TextConfig + input_modalities = "text" __all__ = [ diff --git a/src/transformers/models/gemma3/modular_gemma3.py b/src/transformers/models/gemma3/modular_gemma3.py index 826b04a3898d..1aacd2706405 100644 --- a/src/transformers/models/gemma3/modular_gemma3.py +++ 
b/src/transformers/models/gemma3/modular_gemma3.py @@ -518,6 +518,7 @@ def forward( class Gemma3PreTrainedModel(Gemma2PreTrainedModel): base_model_prefix = "" + input_modalities = ["image", "text"] _no_split_modules = [ "Gemma3DecoderLayer", "SiglipVisionEmbeddings", @@ -549,6 +550,7 @@ def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool: class Gemma3TextModel(Gemma2Model): config: Gemma3TextConfig + input_modalities = "text" def __init__(self, config: Gemma3TextConfig): super().__init__(config) @@ -1157,6 +1159,7 @@ class Gemma3TextForSequenceClassification(GenericForSequenceClassification, Gemm """ config: Gemma3TextConfig + input_modalities = "text" __all__ = [ diff --git a/src/transformers/models/gemma3n/modeling_gemma3n.py b/src/transformers/models/gemma3n/modeling_gemma3n.py index 68595ead4371..a72db856f593 100644 --- a/src/transformers/models/gemma3n/modeling_gemma3n.py +++ b/src/transformers/models/gemma3n/modeling_gemma3n.py @@ -912,6 +912,7 @@ class Gemma3nAudioEncoder(PreTrainedModel): config: Gemma3nAudioConfig main_input_name = "audio_mel" + input_modalities = "audio" def __init__(self, config: Gemma3nAudioConfig): super().__init__(config) @@ -1494,6 +1495,7 @@ class Gemma3nPreTrainedModel(PreTrainedModel): "hidden_states": Gemma3nTextDecoderLayer, "attentions": Gemma3nTextAttention, } + input_modalities = ["image", "text", "audio"] def _init_weights(self, module): super()._init_weights(module) @@ -1508,6 +1510,7 @@ def _init_weights(self, module): @auto_docstring(custom_intro="The base Gemma 3n language model without a language modeling head.") class Gemma3nTextModel(Gemma3nPreTrainedModel): config: Gemma3nTextConfig + input_modalities = "text" def __init__(self, config: Gemma3nTextConfig): super().__init__(config) diff --git a/src/transformers/models/gemma3n/modular_gemma3n.py b/src/transformers/models/gemma3n/modular_gemma3n.py index 95e891d9ad2b..0562308321fb 100644 --- a/src/transformers/models/gemma3n/modular_gemma3n.py +++ b/src/transformers/models/gemma3n/modular_gemma3n.py @@ -1483,6 +1483,7 @@ class Gemma3nAudioEncoder(PreTrainedModel): config: Gemma3nAudioConfig main_input_name = "audio_mel" + input_modalities = "audio" def __init__(self, config: Gemma3nAudioConfig): super().__init__(config) @@ -1922,6 +1923,7 @@ def forward( class Gemma3nPreTrainedModel(Gemma2PreTrainedModel): config: Gemma3nConfig base_model_prefix = "" + input_modalities = ["image", "text", "audio"] _no_split_modules = ["Gemma3nTextDecoderLayer"] def _init_weights(self, module): diff --git a/src/transformers/models/git/modeling_git.py b/src/transformers/models/git/modeling_git.py index c1e823767135..86861ce3bf90 100644 --- a/src/transformers/models/git/modeling_git.py +++ b/src/transformers/models/git/modeling_git.py @@ -439,6 +439,7 @@ def forward( class GitPreTrainedModel(PreTrainedModel): config: GitConfig base_model_prefix = "git" + input_modalities = ["image", "text"] supports_gradient_checkpointing = True def _init_weights(self, module): @@ -857,6 +858,7 @@ def forward( class GitVisionModel(GitPreTrainedModel): config: GitVisionConfig main_input_name = "pixel_values" + input_modalities = "image" # Copied from transformers.models.clip.modeling_clip.CLIPVisionModel.__init__ with CLIP->Git def __init__(self, config: GitVisionConfig): diff --git a/src/transformers/models/glm4v/modeling_glm4v.py b/src/transformers/models/glm4v/modeling_glm4v.py index da1920811988..960851992a80 100644 --- a/src/transformers/models/glm4v/modeling_glm4v.py +++ 
b/src/transformers/models/glm4v/modeling_glm4v.py @@ -661,6 +661,7 @@ class Glm4vModelOutputWithPast(ModelOutput): class Glm4vPreTrainedModel(PreTrainedModel): config: Glm4vConfig base_model_prefix = "model" + input_modalities = ["image", "video", "text"] supports_gradient_checkpointing = True _no_split_modules = ["Glm4vTextDecoderLayer", "Glm4vVisionBlock"] _skip_keys_device_placement = "past_key_values" @@ -677,6 +678,7 @@ class Glm4vPreTrainedModel(PreTrainedModel): class Glm4vVisionModel(Glm4vPreTrainedModel): config: Glm4vVisionConfig + input_modalities = ["image", "video"] _no_split_modules = ["Glm4vVisionBlock"] def __init__(self, config) -> None: @@ -788,6 +790,7 @@ def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor) -> torch. @auto_docstring class Glm4vTextModel(Glm4vPreTrainedModel): config: Glm4vTextConfig + input_modalities = "text" def __init__(self, config: Glm4vTextConfig): super().__init__(config) diff --git a/src/transformers/models/glm4v/modular_glm4v.py b/src/transformers/models/glm4v/modular_glm4v.py index 2afc02923cb7..ec33d0b1aa9e 100644 --- a/src/transformers/models/glm4v/modular_glm4v.py +++ b/src/transformers/models/glm4v/modular_glm4v.py @@ -751,6 +751,7 @@ class Glm4vPreTrainedModel(Qwen2_5_VLPreTrainedModel): class Glm4vVisionModel(Glm4vPreTrainedModel): config: Glm4vVisionConfig + input_modalities = ["image", "video"] _no_split_modules = ["Glm4vVisionBlock"] def __init__(self, config) -> None: diff --git a/src/transformers/models/glm4v_moe/modeling_glm4v_moe.py b/src/transformers/models/glm4v_moe/modeling_glm4v_moe.py index 6d9ba48e99b2..2061ceaa7f4d 100644 --- a/src/transformers/models/glm4v_moe/modeling_glm4v_moe.py +++ b/src/transformers/models/glm4v_moe/modeling_glm4v_moe.py @@ -798,6 +798,7 @@ class Glm4vMoeModelOutputWithPast(ModelOutput): class Glm4vMoeVisionModel(Glm4vMoePreTrainedModel): config: Glm4vMoeVisionConfig + input_modalities = ["image", "video"] _no_split_modules = ["Glm4vMoeVisionBlock"] def __init__(self, config) -> None: @@ -909,6 +910,7 @@ def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor) -> torch. 
@auto_docstring class Glm4vMoeTextModel(Glm4vMoePreTrainedModel): config: Glm4vMoeTextConfig + input_modalities = "text" def __init__(self, config: Glm4vMoeTextConfig): super().__init__(config) diff --git a/src/transformers/models/glpn/modeling_glpn.py b/src/transformers/models/glpn/modeling_glpn.py index abdbcbf10e79..cbca84a7aabc 100755 --- a/src/transformers/models/glpn/modeling_glpn.py +++ b/src/transformers/models/glpn/modeling_glpn.py @@ -406,6 +406,7 @@ class GLPNPreTrainedModel(PreTrainedModel): config: GLPNConfig base_model_prefix = "glpn" main_input_name = "pixel_values" + input_modalities = "image" _no_split_modules = [] # Copied from transformers.models.segformer.modeling_segformer.SegformerPreTrainedModel._init_weights diff --git a/src/transformers/models/got_ocr2/modeling_got_ocr2.py b/src/transformers/models/got_ocr2/modeling_got_ocr2.py index eb22e62bfd7d..809926990d41 100644 --- a/src/transformers/models/got_ocr2/modeling_got_ocr2.py +++ b/src/transformers/models/got_ocr2/modeling_got_ocr2.py @@ -277,6 +277,7 @@ def forward(self, hidden_states: torch.Tensor) -> tuple[torch.FloatTensor]: class GotOcr2PreTrainedModel(PreTrainedModel): config: GotOcr2Config base_model_prefix = "" + input_modalities = ["image", "text"] supports_gradient_checkpointing = True _skip_keys_device_placement = "past_key_values" _supports_flash_attn = False @@ -399,6 +400,7 @@ def forward(self, hidden_states): class GotOcr2VisionEncoder(GotOcr2PreTrainedModel): _can_record_outputs = {"hidden_states": GotOcr2VisionLayer, "attentions": GotOcr2VisionAttention} + input_modalities = "image" def __init__(self, config: GotOcr2VisionConfig): super().__init__(config) diff --git a/src/transformers/models/got_ocr2/modular_got_ocr2.py b/src/transformers/models/got_ocr2/modular_got_ocr2.py index 9c3bce47fff0..c85bbf78ba01 100644 --- a/src/transformers/models/got_ocr2/modular_got_ocr2.py +++ b/src/transformers/models/got_ocr2/modular_got_ocr2.py @@ -248,11 +248,11 @@ def __init__(self, config, window_size): class GotOcr2PreTrainedModel(SamPreTrainedModel): - pass + input_modalities = ["image", "text"] class GotOcr2VisionEncoder(SamVisionEncoder, GotOcr2PreTrainedModel): - pass + input_modalities = "image" class GotOcr2MultiModalProjector(nn.Module): diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 1e44c9781dec..6973124fb51f 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -281,6 +281,7 @@ def forward(self, hidden_states: torch.Tensor): @auto_docstring class GraniteSpeechPreTrainedModel(PreTrainedModel): config: GraniteSpeechConfig + input_modalities = ["audio", "text"] _supports_flash_attn = False # `blip_2_qformer` dependency does not allow for this _supports_sdpa = True diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 594524c8dd1c..7a8ead1ad530 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1367,6 +1367,7 @@ class GroundingDinoPreTrainedModel(PreTrainedModel): config: GroundingDinoConfig base_model_prefix = "model" main_input_name = "pixel_values" + input_modalities = ["image", "text"] def _init_weights(self, module): std = self.config.init_std diff --git 
a/src/transformers/models/groupvit/modeling_groupvit.py b/src/transformers/models/groupvit/modeling_groupvit.py index 598845750da2..4c852db4668c 100644 --- a/src/transformers/models/groupvit/modeling_groupvit.py +++ b/src/transformers/models/groupvit/modeling_groupvit.py @@ -745,6 +745,7 @@ def forward( class GroupViTPreTrainedModel(PreTrainedModel): config: GroupViTConfig base_model_prefix = "groupvit" + input_modalities = ["image", "text"] supports_gradient_checkpointing = True def _init_weights(self, module): @@ -1019,6 +1020,7 @@ def forward( class GroupViTTextModel(GroupViTPreTrainedModel): config: GroupViTTextConfig + input_modalities = "text" def __init__(self, config: GroupViTTextConfig): super().__init__(config) @@ -1123,6 +1125,7 @@ def forward( class GroupViTVisionModel(GroupViTPreTrainedModel): config: GroupViTVisionConfig main_input_name = "pixel_values" + input_modalities = "image" def __init__(self, config: GroupViTVisionConfig): super().__init__(config) diff --git a/src/transformers/models/hgnet_v2/modeling_hgnet_v2.py b/src/transformers/models/hgnet_v2/modeling_hgnet_v2.py index 1cd0e857afcd..2b412c8fa1dd 100644 --- a/src/transformers/models/hgnet_v2/modeling_hgnet_v2.py +++ b/src/transformers/models/hgnet_v2/modeling_hgnet_v2.py @@ -42,6 +42,7 @@ class HGNetV2PreTrainedModel(PreTrainedModel): config: HGNetV2Config base_model_prefix = "hgnetv2" main_input_name = "pixel_values" + input_modalities = "image" _no_split_modules = ["HGNetV2BasicLayer"] diff --git a/src/transformers/models/hgnet_v2/modular_hgnet_v2.py b/src/transformers/models/hgnet_v2/modular_hgnet_v2.py index 627c6aed6255..d07e3008da03 100644 --- a/src/transformers/models/hgnet_v2/modular_hgnet_v2.py +++ b/src/transformers/models/hgnet_v2/modular_hgnet_v2.py @@ -167,6 +167,7 @@ class HGNetV2PreTrainedModel(PreTrainedModel): config: HGNetV2Config base_model_prefix = "hgnetv2" main_input_name = "pixel_values" + input_modalities = "image" _no_split_modules = ["HGNetV2BasicLayer"] diff --git a/src/transformers/models/hiera/modeling_hiera.py b/src/transformers/models/hiera/modeling_hiera.py index 499c0b454600..464990fbce6f 100644 --- a/src/transformers/models/hiera/modeling_hiera.py +++ b/src/transformers/models/hiera/modeling_hiera.py @@ -773,6 +773,7 @@ class HieraPreTrainedModel(PreTrainedModel): config: HieraConfig base_model_prefix = "hiera" main_input_name = "pixel_values" + input_modalities = "image" supports_gradient_checkpointing = True def _init_weights(self, module) -> None: diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py index c792d0431444..fdafb85d4a21 100755 --- a/src/transformers/models/hubert/modeling_hubert.py +++ b/src/transformers/models/hubert/modeling_hubert.py @@ -670,6 +670,7 @@ class HubertPreTrainedModel(PreTrainedModel): config: HubertConfig base_model_prefix = "hubert" main_input_name = "input_values" + input_modalities = "audio" supports_gradient_checkpointing = True _supports_flash_attn = True _supports_sdpa = True diff --git a/src/transformers/models/hubert/modular_hubert.py b/src/transformers/models/hubert/modular_hubert.py index d7169a85d30b..a0a7d805c973 100644 --- a/src/transformers/models/hubert/modular_hubert.py +++ b/src/transformers/models/hubert/modular_hubert.py @@ -128,6 +128,7 @@ class HubertPreTrainedModel(PreTrainedModel): config: HubertConfig base_model_prefix = "hubert" main_input_name = "input_values" + input_modalities = "audio" supports_gradient_checkpointing = True _supports_flash_attn = True 
_supports_sdpa = True diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py index a97cbcb13030..87c8b1c9d45c 100644 --- a/src/transformers/models/idefics/modeling_idefics.py +++ b/src/transformers/models/idefics/modeling_idefics.py @@ -820,6 +820,7 @@ def forward( class IdeficsPreTrainedModel(PreTrainedModel): config: IdeficsConfig base_model_prefix = "model" + input_modalities = ["image", "text"] supports_gradient_checkpointing = True _no_split_modules = ["IdeficsDecoderLayer", "IdeficsGatedCrossAttentionLayer"] _supports_sdpa = True diff --git a/src/transformers/models/idefics2/modeling_idefics2.py b/src/transformers/models/idefics2/modeling_idefics2.py index 9703a43d605c..63198c849c47 100644 --- a/src/transformers/models/idefics2/modeling_idefics2.py +++ b/src/transformers/models/idefics2/modeling_idefics2.py @@ -399,6 +399,7 @@ def forward( class Idefics2PreTrainedModel(PreTrainedModel): config: Idefics2Config base_model_prefix = "model" + input_modalities = ["image", "text"] supports_gradient_checkpointing = True _no_split_modules = ["Idefics2VisionAttention", "Idefics2MLP", "Idefics2PerceiverLayer", "Idefics2DecoderLayer"] _skip_keys_device_placement = "past_key_values" @@ -439,6 +440,7 @@ def _init_weights(self, module): ) class Idefics2VisionTransformer(Idefics2PreTrainedModel): config: Idefics2VisionConfig + input_modalities = "image" _supports_sdpa = True _supports_flash_attn = True _supports_flex_attn = True @@ -697,6 +699,7 @@ def forward( ) class Idefics2PerceiverResampler(Idefics2PreTrainedModel): config: Idefics2PerceiverConfig + input_modalities = "image" _supports_sdpa = True _supports_flash_attention_2 = True _supports_flex_attn = True diff --git a/src/transformers/models/idefics3/modeling_idefics3.py b/src/transformers/models/idefics3/modeling_idefics3.py index 89bbd931fadc..0ddbb61c34b9 100644 --- a/src/transformers/models/idefics3/modeling_idefics3.py +++ b/src/transformers/models/idefics3/modeling_idefics3.py @@ -414,6 +414,7 @@ def forward(self, image_hidden_states): class Idefics3PreTrainedModel(PreTrainedModel): config: Idefics3Config base_model_prefix = "model" + input_modalities = ["image", "text"] supports_gradient_checkpointing = True _no_split_modules = ["Idefics3VisionAttention", "Idefics3DecoderLayer"] _skip_keys_device_placement = "past_key_values" @@ -448,6 +449,7 @@ def _init_weights(self, module): ) class Idefics3VisionTransformer(Idefics3PreTrainedModel): config: Idefics3VisionConfig + input_modalities = "image" _supports_sdpa = True _supports_flash_attn = True _supports_flex_attn = True diff --git a/src/transformers/models/ijepa/modeling_ijepa.py b/src/transformers/models/ijepa/modeling_ijepa.py index 44df2898c309..89e7180f593d 100644 --- a/src/transformers/models/ijepa/modeling_ijepa.py +++ b/src/transformers/models/ijepa/modeling_ijepa.py @@ -332,6 +332,7 @@ class IJepaPreTrainedModel(PreTrainedModel): config: IJepaConfig base_model_prefix = "ijepa" main_input_name = "pixel_values" + input_modalities = "image" supports_gradient_checkpointing = True _no_split_modules = ["IJepaEmbeddings", "IJepaLayer"] _supports_sdpa = True diff --git a/src/transformers/models/imagegpt/modeling_imagegpt.py b/src/transformers/models/imagegpt/modeling_imagegpt.py index 7536b2812b28..a15b7ccb37db 100755 --- a/src/transformers/models/imagegpt/modeling_imagegpt.py +++ b/src/transformers/models/imagegpt/modeling_imagegpt.py @@ -379,6 +379,7 @@ class ImageGPTPreTrainedModel(PreTrainedModel): config: 
ImageGPTConfig base_model_prefix = "transformer" main_input_name = "input_ids" + input_modalities = "image" supports_gradient_checkpointing = True _no_split_modules = ["ImageGPTBlock"] diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py index 1906870e0c69..610f355c1701 100644 --- a/src/transformers/models/informer/modeling_informer.py +++ b/src/transformers/models/informer/modeling_informer.py @@ -255,6 +255,7 @@ class InformerPreTrainedModel(PreTrainedModel): config: InformerConfig base_model_prefix = "model" main_input_name = "past_values" + input_modalities = "time" supports_gradient_checkpointing = True def _init_weights(self, module: nn.Module): diff --git a/src/transformers/models/informer/modular_informer.py b/src/transformers/models/informer/modular_informer.py index 955b463cd15e..3429a9c61e4c 100644 --- a/src/transformers/models/informer/modular_informer.py +++ b/src/transformers/models/informer/modular_informer.py @@ -96,6 +96,7 @@ class InformerPreTrainedModel(PreTrainedModel): config: InformerConfig base_model_prefix = "model" main_input_name = "past_values" + input_modalities = "time" supports_gradient_checkpointing = True def _init_weights(self, module: nn.Module): diff --git a/src/transformers/models/instructblip/modeling_instructblip.py b/src/transformers/models/instructblip/modeling_instructblip.py index db75024a9985..765d0516c6ac 100644 --- a/src/transformers/models/instructblip/modeling_instructblip.py +++ b/src/transformers/models/instructblip/modeling_instructblip.py @@ -307,6 +307,7 @@ def forward( class InstructBlipPreTrainedModel(PreTrainedModel): config: InstructBlipConfig base_model_prefix = "blip" + input_modalities = ["image", "text"] supports_gradient_checkpointing = True _supports_attention_backend = True _supports_flash_attn = True @@ -378,6 +379,7 @@ def forward( # Copied from transformers.models.blip.modeling_blip.BlipVisionModel with Blip->InstructBlip, BLIP->INSTRUCTBLIP class InstructBlipVisionModel(InstructBlipPreTrainedModel): main_input_name = "pixel_values" + input_modalities = "image" config: InstructBlipVisionConfig _can_record_outputs = { "hidden_states": InstructBlipEncoderLayer, diff --git a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py index 0c5b6bc25c61..a9141b5c8010 100644 --- a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py @@ -130,6 +130,7 @@ def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding: boo class InstructBlipVideoPreTrainedModel(PreTrainedModel): config: InstructBlipVideoConfig base_model_prefix = "blip" + input_modalities = ["video", "text"] supports_gradient_checkpointing = True _supports_attention_backend = True _supports_flash_attn = True @@ -345,6 +346,7 @@ def forward( class InstructBlipVideoVisionModel(InstructBlipVideoPreTrainedModel): main_input_name = "pixel_values" + input_modalities = "video" config: InstructBlipVideoVisionConfig _can_record_outputs = { "hidden_states": InstructBlipVideoEncoderLayer, diff --git a/src/transformers/models/instructblipvideo/modular_instructblipvideo.py b/src/transformers/models/instructblipvideo/modular_instructblipvideo.py index 3f96eb3f88af..fb8ebace01a1 100644 --- a/src/transformers/models/instructblipvideo/modular_instructblipvideo.py +++ 
b/src/transformers/models/instructblipvideo/modular_instructblipvideo.py @@ -176,11 +176,11 @@ def from_vision_qformer_text_configs( class InstructBlipVideoPreTrainedModel(InstructBlipPreTrainedModel): - pass + input_modalities = ["video", "text"] class InstructBlipVideoVisionModel(InstructBlipVisionModel): - pass + input_modalities = "video" class InstructBlipVideoQFormerModel(InstructBlipQFormerModel): diff --git a/src/transformers/models/internvl/modeling_internvl.py b/src/transformers/models/internvl/modeling_internvl.py index df66e8b14cf1..737288070ec5 100644 --- a/src/transformers/models/internvl/modeling_internvl.py +++ b/src/transformers/models/internvl/modeling_internvl.py @@ -397,6 +397,7 @@ class InternVLVisionPreTrainedModel(PreTrainedModel): config: InternVLVisionConfig base_model_prefix = "internvl_vision" main_input_name = "pixel_values" + input_modalities = ["image", "video"] supports_gradient_checkpointing = True _no_split_modules = ["InternVLVisionLayer"] _supports_sdpa = True @@ -470,6 +471,7 @@ def forward( class InternVLPreTrainedModel(PreTrainedModel): config: InternVLConfig base_model_prefix = "" + input_modalities = ["image", "text", "video"] supports_gradient_checkpointing = True _skip_keys_device_placement = "past_key_values" diff --git a/src/transformers/models/internvl/modular_internvl.py b/src/transformers/models/internvl/modular_internvl.py index d282de65a4b5..f5d26d3c4c0f 100644 --- a/src/transformers/models/internvl/modular_internvl.py +++ b/src/transformers/models/internvl/modular_internvl.py @@ -354,6 +354,7 @@ class InternVLVisionPreTrainedModel(PreTrainedModel): config: InternVLVisionConfig base_model_prefix = "internvl_vision" main_input_name = "pixel_values" + input_modalities = ["image", "video"] supports_gradient_checkpointing = True _no_split_modules = ["InternVLVisionLayer"] _supports_sdpa = True @@ -424,7 +425,7 @@ def forward( class InternVLPreTrainedModel(LlavaPreTrainedModel): - pass + input_modalities = ["image", "text", "video"] INTERNVL_INPUTS_DOCSTRING = None diff --git a/src/transformers/models/janus/modeling_janus.py b/src/transformers/models/janus/modeling_janus.py index 306d2c19ac18..ca373d2a92f7 100644 --- a/src/transformers/models/janus/modeling_janus.py +++ b/src/transformers/models/janus/modeling_janus.py @@ -48,6 +48,8 @@ class JanusPreTrainedModel(PreTrainedModel): config: JanusConfig base_model_prefix = "model" + input_modalities = ["image", "text"] + output_modalities = ["image", "text"] supports_gradient_checkpointing = True _no_split_modules = ["LlamaDecoderLayer", "JanusVisionEncoderLayer"] _skip_keys_device_placement = ["past_key_values", "causal_mask"] @@ -544,6 +546,7 @@ def forward( @auto_docstring class JanusVisionModel(JanusPreTrainedModel): main_input_name = "pixel_values" + input_modalities = "image" config: JanusVisionConfig _can_record_outputs = { "hidden_states": JanusEncoderLayer, diff --git a/src/transformers/models/janus/modular_janus.py b/src/transformers/models/janus/modular_janus.py index a2f2541d84fa..79b5b8ecf06a 100644 --- a/src/transformers/models/janus/modular_janus.py +++ b/src/transformers/models/janus/modular_janus.py @@ -382,6 +382,8 @@ def __init__( class JanusPreTrainedModel(PreTrainedModel): config: JanusConfig base_model_prefix = "model" + input_modalities = ["image", "text"] + output_modalities = ["image", "text"] supports_gradient_checkpointing = True _no_split_modules = ["LlamaDecoderLayer", "JanusVisionEncoderLayer"] _skip_keys_device_placement = ["past_key_values", "causal_mask"] diff --git 
a/src/transformers/models/kosmos2/modeling_kosmos2.py b/src/transformers/models/kosmos2/modeling_kosmos2.py index de6bc098f58c..083e2ccbddf8 100644 --- a/src/transformers/models/kosmos2/modeling_kosmos2.py +++ b/src/transformers/models/kosmos2/modeling_kosmos2.py @@ -1121,6 +1121,7 @@ def forward( @auto_docstring class Kosmos2PreTrainedModel(PreTrainedModel): config: Kosmos2Config + input_modalities = ["image", "text"] supports_gradient_checkpointing = True _no_split_modules = ["Kosmos2VisionEncoderLayer", "Kosmos2TextBlock"] _supports_attention_backend = True @@ -1183,6 +1184,7 @@ def _init_weights(self, module: nn.Module): class Kosmos2VisionModel(Kosmos2PreTrainedModel): config: Kosmos2VisionConfig main_input_name = "pixel_values" + input_modalities = "image" # Copied from transformers.models.clip.modeling_clip.CLIPVisionModel.__init__ with CLIP_VISION->KOSMOS2_VISION,CLIP->Kosmos2,self.vision_model->self.model def __init__(self, config: Kosmos2VisionConfig): @@ -1215,6 +1217,7 @@ def forward( class Kosmos2TextModel(Kosmos2PreTrainedModel): config: Kosmos2TextConfig + input_modalities = "text" def __init__(self, config: Kosmos2TextConfig): super().__init__(config) diff --git a/src/transformers/models/kosmos2_5/modeling_kosmos2_5.py b/src/transformers/models/kosmos2_5/modeling_kosmos2_5.py index bcc4a87a62a5..dd1023fbfd81 100644 --- a/src/transformers/models/kosmos2_5/modeling_kosmos2_5.py +++ b/src/transformers/models/kosmos2_5/modeling_kosmos2_5.py @@ -1218,6 +1218,7 @@ class Kosmos2_5PreTrainedModel(PreTrainedModel): """ config_class = Kosmos2_5Config + input_modalities = ["image", "text"] supports_gradient_checkpointing = True _no_split_modules = ["Kosmos2_5VisionLayer", "Kosmos2_5TextBlock"] _supports_flash_attn_2 = True @@ -1252,6 +1253,7 @@ def _init_weights(self, module): class Kosmos2_5VisionModel(Kosmos2_5PreTrainedModel): config_class = Kosmos2_5VisionConfig + input_modalities = "image" # Copied from transformers.models.pix2struct.modeling_pix2struct.Pix2StructVisionModel.__init__ with Pix2Struct->Kosmos2_5 def __init__(self, config: Kosmos2_5VisionConfig): @@ -1322,6 +1324,7 @@ def forward( # Adapted from transformers.models.kosmos2.modeling_kosmos2.Kosmos2TextModel with KOSMOS2->KOSMOS2_5 class Kosmos2_5TextModel(Kosmos2_5PreTrainedModel): config_class = Kosmos2_5TextConfig + input_modalities = "text" def __init__(self, config: Kosmos2_5TextConfig): super().__init__(config) @@ -1507,6 +1510,7 @@ def forward( ) class Kosmos2_5TextForCausalLM(Kosmos2_5PreTrainedModel): config_class = Kosmos2_5TextConfig + input_modalities = "text" _tied_weights_keys = ["lm_head.weight"] def __init__(self, config: Kosmos2_5TextConfig): diff --git a/src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py b/src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py index 77c636570d58..1191c6884c2a 100644 --- a/src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +++ b/src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py @@ -116,6 +116,8 @@ def forward(self, x, layer_idx=None): class KyutaiSpeechToTextPreTrainedModel(PreTrainedModel): config: KyutaiSpeechToTextConfig base_model_prefix = "model" + input_modalities = ["audio", "text"] + output_modalities = ["audio", "text"] supports_gradient_checkpointing = True _no_split_modules = ["KyutaiSpeechToTextDecoderLayer", "MimiTransformerLayer"] _supports_flash_attn = True diff --git a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py
b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py index 4df6a46cf88c..faf3979d1edb 100755 --- a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py +++ b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py @@ -456,6 +456,7 @@ def forward( class LayoutLMv2PreTrainedModel(PreTrainedModel): config: LayoutLMv2Config base_model_prefix = "layoutlmv2" + input_modalities = ["image", "text"] def _init_weights(self, module): """Initialize the weights""" diff --git a/src/transformers/models/layoutlmv3/modeling_layoutlmv3.py b/src/transformers/models/layoutlmv3/modeling_layoutlmv3.py index f4c58096735a..aff9d1e982f3 100644 --- a/src/transformers/models/layoutlmv3/modeling_layoutlmv3.py +++ b/src/transformers/models/layoutlmv3/modeling_layoutlmv3.py @@ -201,6 +201,7 @@ def forward( class LayoutLMv3PreTrainedModel(PreTrainedModel): config: LayoutLMv3Config base_model_prefix = "layoutlmv3" + input_modalities = ["image", "text"] def _init_weights(self, module): """Initialize the weights""" diff --git a/src/transformers/models/levit/modeling_levit.py b/src/transformers/models/levit/modeling_levit.py index bec62dec56e0..5d331081721c 100644 --- a/src/transformers/models/levit/modeling_levit.py +++ b/src/transformers/models/levit/modeling_levit.py @@ -469,6 +469,7 @@ class LevitPreTrainedModel(PreTrainedModel): config: LevitConfig base_model_prefix = "levit" main_input_name = "pixel_values" + input_modalities = "image" _no_split_modules = ["LevitResidualLayer"] def _init_weights(self, module): diff --git a/src/transformers/models/lfm2_vl/modeling_lfm2_vl.py b/src/transformers/models/lfm2_vl/modeling_lfm2_vl.py index deee35394ee1..317786625ba8 100755 --- a/src/transformers/models/lfm2_vl/modeling_lfm2_vl.py +++ b/src/transformers/models/lfm2_vl/modeling_lfm2_vl.py @@ -77,6 +77,7 @@ def pixel_unshuffle(self, hidden_states: torch.Tensor): class Lfm2VlPreTrainedModel(PreTrainedModel): config: Lfm2VlConfig base_model_prefix = "" + input_modalities = ["image", "text"] supports_gradient_checkpointing = True _skip_keys_device_placement = "past_key_values" diff --git a/src/transformers/models/lightglue/modeling_lightglue.py b/src/transformers/models/lightglue/modeling_lightglue.py index 8e9faa3e4e04..0a76cb2f220c 100644 --- a/src/transformers/models/lightglue/modeling_lightglue.py +++ b/src/transformers/models/lightglue/modeling_lightglue.py @@ -424,6 +424,7 @@ class LightGluePreTrainedModel(PreTrainedModel): config: LightGlueConfig base_model_prefix = "lightglue" main_input_name = "pixel_values" + input_modalities = "image" supports_gradient_checkpointing = False _supports_flash_attn = True _supports_sdpa = True diff --git a/src/transformers/models/lightglue/modular_lightglue.py b/src/transformers/models/lightglue/modular_lightglue.py index 4018622a6899..9c536bd02e9c 100644 --- a/src/transformers/models/lightglue/modular_lightglue.py +++ b/src/transformers/models/lightglue/modular_lightglue.py @@ -582,6 +582,7 @@ class LightGluePreTrainedModel(PreTrainedModel): config: LightGlueConfig base_model_prefix = "lightglue" main_input_name = "pixel_values" + input_modalities = "image" supports_gradient_checkpointing = False _supports_flash_attn = True _supports_sdpa = True diff --git a/src/transformers/models/llama4/modeling_llama4.py b/src/transformers/models/llama4/modeling_llama4.py index a974ed81ba2f..225291b73576 100644 --- a/src/transformers/models/llama4/modeling_llama4.py +++ b/src/transformers/models/llama4/modeling_llama4.py @@ -427,6 +427,7 @@ def forward( @auto_docstring class 
Llama4PreTrainedModel(PreTrainedModel): config: Llama4Config + input_modalities = ["image", "text"] supports_gradient_checkpointing = True _skip_keys_device_placement = ["past_key_values"] _supports_flash_attn = False @@ -467,6 +468,7 @@ def _init_weights(self, module): class Llama4TextModel(Llama4PreTrainedModel): _no_split_modules = ["Llama4TextDecoderLayer"] base_model_prefix = "model" + input_modalities = "text" config: Llama4TextConfig _can_record_outputs = { "attentions": Llama4TextAttention, @@ -1003,6 +1005,7 @@ def forward(self, hidden_states): class Llama4VisionModel(Llama4PreTrainedModel): base_model_prefix = "vision_model" + input_modalities = "image" _no_split_modules = ["Llama4VisionEncoderLayer"] config: Llama4VisionConfig diff --git a/src/transformers/models/llava/modeling_llava.py b/src/transformers/models/llava/modeling_llava.py index bc0bb0df7c7b..0ee351b03b54 100644 --- a/src/transformers/models/llava/modeling_llava.py +++ b/src/transformers/models/llava/modeling_llava.py @@ -111,6 +111,7 @@ def forward(self, image_features): class LlavaPreTrainedModel(PreTrainedModel): config: LlavaConfig base_model_prefix = "" + input_modalities = ["image", "text"] supports_gradient_checkpointing = True _skip_keys_device_placement = "past_key_values" diff --git a/src/transformers/models/llava_next/modeling_llava_next.py b/src/transformers/models/llava_next/modeling_llava_next.py index a75b4b798107..7e01bbb385f8 100644 --- a/src/transformers/models/llava_next/modeling_llava_next.py +++ b/src/transformers/models/llava_next/modeling_llava_next.py @@ -223,6 +223,7 @@ def forward(self, image_features): class LlavaNextPreTrainedModel(PreTrainedModel): config: LlavaNextConfig base_model_prefix = "" + input_modalities = ["image", "text"] supports_gradient_checkpointing = True _no_split_modules = ["LlamaDecoderLayer"] _skip_keys_device_placement = "past_key_values" diff --git a/src/transformers/models/llava_next_video/modeling_llava_next_video.py b/src/transformers/models/llava_next_video/modeling_llava_next_video.py index 9e3b15cea548..98b46e13f587 100644 --- a/src/transformers/models/llava_next_video/modeling_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modeling_llava_next_video.py @@ -164,6 +164,7 @@ def forward(self, image_features): class LlavaNextVideoPreTrainedModel(PreTrainedModel): config: LlavaNextVideoConfig base_model_prefix = "" + input_modalities = ["image", "video", "text"] supports_gradient_checkpointing = True _no_split_modules = ["LlamaDecoderLayer"] _skip_keys_device_placement = "past_key_values" diff --git a/src/transformers/models/llava_next_video/modular_llava_next_video.py b/src/transformers/models/llava_next_video/modular_llava_next_video.py index 2ba202f668e9..92a3f51f8a71 100644 --- a/src/transformers/models/llava_next_video/modular_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modular_llava_next_video.py @@ -25,6 +25,7 @@ LlavaNextModel, LlavaNextModelOutputWithPast, LlavaNextMultiModalProjector, + LlavaNextPreTrainedModel, TransformersKwargs, image_size_to_num_patches, ) @@ -258,6 +259,10 @@ class LlavaNextVideoMultiModalProjector(LlavaNextMultiModalProjector): pass +class LlavaNextVideoPreTrainedModel(LlavaNextPreTrainedModel): + input_modalities = ["image", "video", "text"] + + class LlavaNextVideoModel(LlavaNextModel): def __init__(self, config: LlavaNextVideoConfig, **super_kwargs): super().__init__(config, **super_kwargs) @@ -713,5 +718,5 @@ def prepare_inputs_for_generation( "LlavaNextVideoConfig", 
"LlavaNextVideoForConditionalGeneration", "LlavaNextVideoModel", - "LlavaNextVideoPreTrainedModel", # noqa: F822 + "LlavaNextVideoPreTrainedModel", ] diff --git a/src/transformers/models/llava_onevision/modeling_llava_onevision.py b/src/transformers/models/llava_onevision/modeling_llava_onevision.py index 727655374574..4484d4647da1 100644 --- a/src/transformers/models/llava_onevision/modeling_llava_onevision.py +++ b/src/transformers/models/llava_onevision/modeling_llava_onevision.py @@ -105,6 +105,7 @@ class LlavaOnevisionCausalLMOutputWithPast(ModelOutput): class LlavaOnevisionPreTrainedModel(PreTrainedModel): config: LlavaOnevisionConfig base_model_prefix = "" + input_modalities = ["image", "video", "text"] supports_gradient_checkpointing = True _no_split_modules = ["LlamaDecoderLayer"] _skip_keys_device_placement = "past_key_values" diff --git a/src/transformers/models/lxmert/modeling_lxmert.py b/src/transformers/models/lxmert/modeling_lxmert.py index c2183b6f41c8..08be81ae3c0e 100644 --- a/src/transformers/models/lxmert/modeling_lxmert.py +++ b/src/transformers/models/lxmert/modeling_lxmert.py @@ -679,6 +679,7 @@ def forward(self, sequence_output, pooled_output): class LxmertPreTrainedModel(PreTrainedModel): config: LxmertConfig base_model_prefix = "lxmert" + input_modalities = ["image", "text"] _supports_param_buffer_assignment = False def _init_weights(self, module): diff --git a/src/transformers/models/mask2former/modeling_mask2former.py b/src/transformers/models/mask2former/modeling_mask2former.py index 553700465f3c..278f977320ed 100644 --- a/src/transformers/models/mask2former/modeling_mask2former.py +++ b/src/transformers/models/mask2former/modeling_mask2former.py @@ -2100,6 +2100,7 @@ class Mask2FormerPreTrainedModel(PreTrainedModel): config: Mask2FormerConfig base_model_prefix = "model" main_input_name = "pixel_values" + input_modalities = "image" def _init_weights(self, module: nn.Module): xavier_std = self.config.init_xavier_std diff --git a/src/transformers/models/maskformer/modeling_maskformer.py b/src/transformers/models/maskformer/modeling_maskformer.py index b8ccbbb2fec4..bc961d2eb0ec 100644 --- a/src/transformers/models/maskformer/modeling_maskformer.py +++ b/src/transformers/models/maskformer/modeling_maskformer.py @@ -1434,6 +1434,7 @@ class MaskFormerPreTrainedModel(PreTrainedModel): config: MaskFormerConfig base_model_prefix = "model" main_input_name = "pixel_values" + input_modalities = "image" def _init_weights(self, module: nn.Module): xavier_std = self.config.init_xavier_std diff --git a/src/transformers/models/maskformer/modeling_maskformer_swin.py b/src/transformers/models/maskformer/modeling_maskformer_swin.py index 9d847e32624f..e46a6ec4f8cd 100644 --- a/src/transformers/models/maskformer/modeling_maskformer_swin.py +++ b/src/transformers/models/maskformer/modeling_maskformer_swin.py @@ -716,6 +716,7 @@ class MaskFormerSwinPreTrainedModel(PreTrainedModel): config: MaskFormerSwinConfig base_model_prefix = "model" main_input_name = "pixel_values" + input_modalities = "image" supports_gradient_checkpointing = True _no_split_modules = ["MaskFormerSwinStage"] diff --git a/src/transformers/models/metaclip_2/modeling_metaclip_2.py b/src/transformers/models/metaclip_2/modeling_metaclip_2.py index bdfa7661349a..8f3a61ae0d67 100644 --- a/src/transformers/models/metaclip_2/modeling_metaclip_2.py +++ b/src/transformers/models/metaclip_2/modeling_metaclip_2.py @@ -269,6 +269,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class 
MetaClip2PreTrainedModel(PreTrainedModel): config: MetaClip2Config base_model_prefix = "metaclip_2" + input_modalities = ["image", "text"] supports_gradient_checkpointing = True _supports_sdpa = True _supports_flash_attn = True @@ -567,6 +568,7 @@ class MetaClip2TextModel(MetaClip2PreTrainedModel): ```""" config: MetaClip2TextConfig + input_modalities = "text" _no_split_modules = ["MetaClip2TextEmbeddings", "MetaClip2EncoderLayer"] _supports_flash_attn = False # mask creation only accounts for sdpa/eager @@ -669,6 +671,7 @@ class MetaClip2TextModelWithProjection(MetaClip2PreTrainedModel): ```""" config: MetaClip2TextConfig + input_modalities = "text" _supports_flash_attn = False _no_split_modules = ["MetaClip2TextEmbeddings", "MetaClip2EncoderLayer"] @@ -1128,6 +1131,7 @@ class MetaClip2VisionModel(MetaClip2PreTrainedModel): config: MetaClip2VisionConfig main_input_name = "pixel_values" + input_modalities = "image" _no_split_modules = ["MetaClip2EncoderLayer"] def __init__(self, config: MetaClip2VisionConfig): @@ -1234,6 +1238,7 @@ class MetaClip2VisionModelWithProjection(MetaClip2PreTrainedModel): config: MetaClip2VisionConfig main_input_name = "pixel_values" + input_modalities = "image" def __init__(self, config: MetaClip2VisionConfig): super().__init__(config) @@ -1303,6 +1308,7 @@ def forward( ) class MetaClip2ForImageClassification(MetaClip2PreTrainedModel): main_input_name = "pixel_values" + input_modalities = "image" def __init__(self, config: MetaClip2Config) -> None: super().__init__(config) diff --git a/src/transformers/models/metaclip_2/modular_metaclip_2.py b/src/transformers/models/metaclip_2/modular_metaclip_2.py index cd42344bd406..7994c8b79b3c 100644 --- a/src/transformers/models/metaclip_2/modular_metaclip_2.py +++ b/src/transformers/models/metaclip_2/modular_metaclip_2.py @@ -223,6 +223,7 @@ class MetaClip2MLP(CLIPMLP): class MetaClip2PreTrainedModel(PreTrainedModel): config: MetaClip2Config base_model_prefix = "metaclip_2" + input_modalities = ["image", "text"] supports_gradient_checkpointing = True _supports_sdpa = True _supports_flash_attn = True diff --git a/src/transformers/models/mimi/modeling_mimi.py b/src/transformers/models/mimi/modeling_mimi.py index f22cad968247..ef4024e6d6b7 100644 --- a/src/transformers/models/mimi/modeling_mimi.py +++ b/src/transformers/models/mimi/modeling_mimi.py @@ -1361,6 +1361,7 @@ class MimiPreTrainedModel(PreTrainedModel): config: MimiConfig base_model_prefix = "mimi" main_input_name = "input_values" + input_modalities = "audio" supports_gradient_checkpointing = True _no_split_modules = ["MimiDecoderLayer"] _skip_keys_device_placement = "past_key_values" diff --git a/src/transformers/models/mistral3/modeling_mistral3.py b/src/transformers/models/mistral3/modeling_mistral3.py index 2c2a53a54352..b98efd38e824 100644 --- a/src/transformers/models/mistral3/modeling_mistral3.py +++ b/src/transformers/models/mistral3/modeling_mistral3.py @@ -177,6 +177,7 @@ class Mistral3ModelOutputWithPast(BaseModelOutputWithPast): class Mistral3PreTrainedModel(PreTrainedModel): config: Mistral3Config base_model_prefix = "" + input_modalities = ["image", "text"] supports_gradient_checkpointing = True _skip_keys_device_placement = "past_key_values" diff --git a/src/transformers/models/mlcd/modeling_mlcd.py b/src/transformers/models/mlcd/modeling_mlcd.py index 5379c5313726..f55b61f16c71 100644 --- a/src/transformers/models/mlcd/modeling_mlcd.py +++ b/src/transformers/models/mlcd/modeling_mlcd.py @@ -553,6 +553,7 @@ def _init_weights(self, module): class 
MLCDVisionModel(MLCDPreTrainedModel): config: MLCDVisionConfig main_input_name = "pixel_values" + input_modalities = "image" _no_split_modules = ["MLCDEncoderLayer"] def __init__(self, config: MLCDVisionConfig): diff --git a/src/transformers/models/mllama/modeling_mllama.py b/src/transformers/models/mllama/modeling_mllama.py index 6c601ff6a20e..6f18ab7798e5 100644 --- a/src/transformers/models/mllama/modeling_mllama.py +++ b/src/transformers/models/mllama/modeling_mllama.py @@ -758,6 +758,7 @@ def forward(self, x, position_ids): class MllamaPreTrainedModel(PreTrainedModel): config: MllamaConfig base_model_prefix = "" + input_modalities = ["image", "text"] supports_gradient_checkpointing = True _no_split_modules = [ "MllamaVisionEncoderLayer", @@ -943,6 +944,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( class MllamaVisionModel(MllamaPreTrainedModel): config: MllamaVisionConfig base_model_prefix = "vision_model" + input_modalities = "image" def __init__(self, config: MllamaVisionConfig): super().__init__(config) @@ -1140,6 +1142,7 @@ def forward( class MllamaTextModel(MllamaPreTrainedModel): config: MllamaTextConfig base_model_prefix = "language_model.model" + input_modalities = "text" def __init__(self, config: MllamaTextConfig): super().__init__(config) diff --git a/src/transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py b/src/transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py index b27d6ac42a3a..4f40fab19733 100644 --- a/src/transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +++ b/src/transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py @@ -504,6 +504,7 @@ class MMGroundingDinoPreTrainedModel(PreTrainedModel): config: MMGroundingDinoConfig base_model_prefix = "model" main_input_name = "pixel_values" + input_modalities = ["image", "text"] def _init_weights(self, module): std = self.config.init_std diff --git a/src/transformers/models/mobilenet_v1/modeling_mobilenet_v1.py b/src/transformers/models/mobilenet_v1/modeling_mobilenet_v1.py index f80c6977bf18..daaf5f6b72b7 100755 --- a/src/transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +++ b/src/transformers/models/mobilenet_v1/modeling_mobilenet_v1.py @@ -128,6 +128,7 @@ class MobileNetV1PreTrainedModel(PreTrainedModel): config: MobileNetV1Config base_model_prefix = "mobilenet_v1" main_input_name = "pixel_values" + input_modalities = "image" supports_gradient_checkpointing = False _no_split_modules = [] diff --git a/src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py b/src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py index 2d30da8f756d..66975cd71b40 100755 --- a/src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +++ b/src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py @@ -254,6 +254,7 @@ class MobileNetV2PreTrainedModel(PreTrainedModel): config: MobileNetV2Config base_model_prefix = "mobilenet_v2" main_input_name = "pixel_values" + input_modalities = "image" supports_gradient_checkpointing = False _no_split_modules = [] diff --git a/src/transformers/models/mobilevit/modeling_mobilevit.py b/src/transformers/models/mobilevit/modeling_mobilevit.py index db8b8cd58f5a..79eefa9b6e1a 100755 --- a/src/transformers/models/mobilevit/modeling_mobilevit.py +++ b/src/transformers/models/mobilevit/modeling_mobilevit.py @@ -623,6 +623,7 @@ class MobileViTPreTrainedModel(PreTrainedModel): config: MobileViTConfig base_model_prefix = "mobilevit" main_input_name = "pixel_values" + input_modalities = "image" 
supports_gradient_checkpointing = True _no_split_modules = ["MobileViTLayer"] diff --git a/src/transformers/models/mobilevitv2/modeling_mobilevitv2.py b/src/transformers/models/mobilevitv2/modeling_mobilevitv2.py index d842acf7b6e5..5d07ba460877 100644 --- a/src/transformers/models/mobilevitv2/modeling_mobilevitv2.py +++ b/src/transformers/models/mobilevitv2/modeling_mobilevitv2.py @@ -570,6 +570,7 @@ class MobileViTV2PreTrainedModel(PreTrainedModel): config: MobileViTV2Config base_model_prefix = "mobilevitv2" main_input_name = "pixel_values" + input_modalities = "image" supports_gradient_checkpointing = True _no_split_modules = ["MobileViTV2Layer"] diff --git a/src/transformers/models/moonshine/modeling_moonshine.py b/src/transformers/models/moonshine/modeling_moonshine.py index ed0f879aa102..fbe1548928e1 100644 --- a/src/transformers/models/moonshine/modeling_moonshine.py +++ b/src/transformers/models/moonshine/modeling_moonshine.py @@ -463,6 +463,7 @@ class MoonshinePreTrainedModel(PreTrainedModel): config: MoonshineConfig base_model_prefix = "model" main_input_name = "input_values" + input_modalities = "audio" supports_gradient_checkpointing = True _no_split_modules = ["MoonshineEncoderLayer", "MoonshineDecoderLayer"] _supports_flash_attn = True diff --git a/src/transformers/models/moonshine/modular_moonshine.py b/src/transformers/models/moonshine/modular_moonshine.py index cadc770dea83..85ed218e3960 100644 --- a/src/transformers/models/moonshine/modular_moonshine.py +++ b/src/transformers/models/moonshine/modular_moonshine.py @@ -495,6 +495,7 @@ class MoonshinePreTrainedModel(PreTrainedModel): config: MoonshineConfig base_model_prefix = "model" main_input_name = "input_values" + input_modalities = "audio" supports_gradient_checkpointing = True _no_split_modules = ["MoonshineEncoderLayer", "MoonshineDecoderLayer"] _supports_flash_attn = True diff --git a/src/transformers/models/moshi/modeling_moshi.py b/src/transformers/models/moshi/modeling_moshi.py index 7f685050930c..58a10ced34d8 100644 --- a/src/transformers/models/moshi/modeling_moshi.py +++ b/src/transformers/models/moshi/modeling_moshi.py @@ -805,6 +805,8 @@ def forward( class MoshiPreTrainedModel(PreTrainedModel): config: MoshiConfig base_model_prefix = "model" + input_modalities = ["audio", "text"] + output_modalities = ["audio", "text"] supports_gradient_checkpointing = True _no_split_modules = ["MoshiDecoderLayer", "MimiTransformerLayer"] _supports_flash_attn = True @@ -1459,6 +1461,8 @@ def _prepare_4d_causal_attention_mask_with_cache_position( """ ) class MoshiForCausalLM(MoshiPreTrainedModel, GenerationMixin): + input_modalities = "text" + output_modalities = "text" _tied_weights_keys = ["model.embed_tokens.weight", "lm_head.weight"] # Copied from transformers.models.gemma.modeling_gemma.GemmaForCausalLM.__init__ with Gemma->Moshi diff --git a/src/transformers/models/musicgen/modeling_musicgen.py b/src/transformers/models/musicgen/modeling_musicgen.py index 7326ede89e71..e80ed4784ee6 100644 --- a/src/transformers/models/musicgen/modeling_musicgen.py +++ b/src/transformers/models/musicgen/modeling_musicgen.py @@ -409,6 +409,7 @@ def forward( class MusicgenPreTrainedModel(PreTrainedModel): config: MusicgenDecoderConfig base_model_prefix = "model" + output_modalities = "audio" supports_gradient_checkpointing = True _no_split_modules = ["MusicgenDecoderLayer", "MusicgenAttention"] _supports_flash_attn = True diff --git a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py 
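The same string-or-list convention applies to output_modalities, set above as "audio" for the Musicgen decoder and as ["audio", "text"] on the Moshi base class (narrowed to "text" on MoshiForCausalLM). Below is a hedged sketch of how a caller might map declared output modalities to post-processing steps; the class and helper names are assumptions for illustration, not APIs introduced by this diff.

from typing import Union


class DemoMusicDecoder:
    # string form, like the Musicgen decoder above
    output_modalities: Union[str, list[str]] = "audio"


class DemoSpeechDialogueModel:
    # list form, like the Moshi base class above
    output_modalities: Union[str, list[str]] = ["audio", "text"]


def planned_postprocessing(model_cls) -> list[str]:
    """Map each declared output modality to the decoding step a caller would schedule."""
    declared = model_cls.output_modalities
    declared = [declared] if isinstance(declared, str) else list(declared)
    steps = {"text": "detokenize", "audio": "decode audio codes", "image": "decode image tokens"}
    return [steps[m] for m in declared if m in steps]


print(planned_postprocessing(DemoMusicDecoder))         # ['decode audio codes']
print(planned_postprocessing(DemoSpeechDialogueModel))  # ['decode audio codes', 'detokenize']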
b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py index 72466d743fd3..a4e92dbf4144 100644 --- a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py +++ b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py @@ -380,6 +380,7 @@ def forward( class MusicgenMelodyPreTrainedModel(PreTrainedModel): config: MusicgenMelodyDecoderConfig base_model_prefix = "model" + output_modalities = "audio" supports_gradient_checkpointing = True _no_split_modules = ["MusicgenMelodyDecoderLayer", "MusicgenMelodyAttention"] _supports_flash_attn = True diff --git a/src/transformers/models/omdet_turbo/modeling_omdet_turbo.py b/src/transformers/models/omdet_turbo/modeling_omdet_turbo.py index 350cf8af1ab7..3c552a4b5cb5 100644 --- a/src/transformers/models/omdet_turbo/modeling_omdet_turbo.py +++ b/src/transformers/models/omdet_turbo/modeling_omdet_turbo.py @@ -985,6 +985,7 @@ class OmDetTurboPreTrainedModel(PreTrainedModel): config: OmDetTurboConfig base_model_prefix = "model" main_input_name = "pixel_values" + input_modalities = ["image", "text"] def _init_weights(self, module): def linear_init_(module_to_init): diff --git a/src/transformers/models/oneformer/modeling_oneformer.py b/src/transformers/models/oneformer/modeling_oneformer.py index 51c041d7b698..a6a15275ea58 100644 --- a/src/transformers/models/oneformer/modeling_oneformer.py +++ b/src/transformers/models/oneformer/modeling_oneformer.py @@ -2768,6 +2768,7 @@ class OneFormerPreTrainedModel(PreTrainedModel): config: OneFormerConfig base_model_prefix = "model" main_input_name = "pixel_values" + input_modalities = "image" def _init_weights(self, module: nn.Module): xavier_std = self.config.init_xavier_std diff --git a/src/transformers/models/ovis2/modeling_ovis2.py b/src/transformers/models/ovis2/modeling_ovis2.py index 75ff19ab9d14..146a91485606 100644 --- a/src/transformers/models/ovis2/modeling_ovis2.py +++ b/src/transformers/models/ovis2/modeling_ovis2.py @@ -417,6 +417,7 @@ def forward(self, visual_tokens: torch.Tensor) -> torch.Tensor: class Ovis2PreTrainedModel(PreTrainedModel): config: Ovis2Config base_model_prefix = "model" + input_modalities = ["image", "text"] supports_gradient_checkpointing = True _no_split_modules = ["Ovis2VisionAttention"] _skip_keys_device_placement = "past_key_values" diff --git a/src/transformers/models/ovis2/modular_ovis2.py b/src/transformers/models/ovis2/modular_ovis2.py index 09ce53703a15..f6277dc91a0a 100644 --- a/src/transformers/models/ovis2/modular_ovis2.py +++ b/src/transformers/models/ovis2/modular_ovis2.py @@ -147,6 +147,7 @@ def forward(self, visual_tokens: torch.Tensor) -> torch.Tensor: class Ovis2PreTrainedModel(PreTrainedModel): config: Ovis2Config base_model_prefix = "model" + input_modalities = ["image", "text"] supports_gradient_checkpointing = True _no_split_modules = ["Ovis2VisionAttention"] _skip_keys_device_placement = "past_key_values" diff --git a/src/transformers/models/owlv2/modeling_owlv2.py b/src/transformers/models/owlv2/modeling_owlv2.py index 715df44f01f0..391470ccb1de 100644 --- a/src/transformers/models/owlv2/modeling_owlv2.py +++ b/src/transformers/models/owlv2/modeling_owlv2.py @@ -563,6 +563,7 @@ def forward( class Owlv2PreTrainedModel(PreTrainedModel): config: Owlv2Config base_model_prefix = "owlv2" + input_modalities = ["image", "text"] supports_gradient_checkpointing = True _no_split_modules = ["Owlv2EncoderLayer"] @@ -768,6 +769,7 @@ def forward( # Copied from transformers.models.owlvit.modeling_owlvit.OwlViTTextModel with 
google/owlvit-base-patch32->google/owlv2-base-patch16, OWLVIT->OWLV2,OwlViT->Owlv2 class Owlv2TextModel(Owlv2PreTrainedModel): config: Owlv2TextConfig + input_modalities = "text" def __init__(self, config: Owlv2TextConfig): super().__init__(config) @@ -880,6 +882,7 @@ def forward( class Owlv2VisionModel(Owlv2PreTrainedModel): config: Owlv2VisionConfig main_input_name = "pixel_values" + input_modalities = "image" def __init__(self, config: Owlv2VisionConfig): super().__init__(config) diff --git a/src/transformers/models/owlvit/modeling_owlvit.py b/src/transformers/models/owlvit/modeling_owlvit.py index 3971b1376d9c..0eb4ddbcd445 100644 --- a/src/transformers/models/owlvit/modeling_owlvit.py +++ b/src/transformers/models/owlvit/modeling_owlvit.py @@ -550,6 +550,7 @@ def forward( class OwlViTPreTrainedModel(PreTrainedModel): config: OwlViTConfig base_model_prefix = "owlvit" + input_modalities = ["image", "text"] supports_gradient_checkpointing = True _no_split_modules = ["OwlViTEncoderLayer"] @@ -752,6 +753,7 @@ def forward( class OwlViTTextModel(OwlViTPreTrainedModel): config: OwlViTTextConfig + input_modalities = "text" def __init__(self, config: OwlViTTextConfig): super().__init__(config) @@ -862,6 +864,7 @@ def forward( class OwlViTVisionModel(OwlViTPreTrainedModel): config: OwlViTVisionConfig main_input_name = "pixel_values" + input_modalities = "image" def __init__(self, config: OwlViTVisionConfig): super().__init__(config) diff --git a/src/transformers/models/paligemma/modeling_paligemma.py b/src/transformers/models/paligemma/modeling_paligemma.py index 2ffd2c1490cc..9a57747cabfc 100644 --- a/src/transformers/models/paligemma/modeling_paligemma.py +++ b/src/transformers/models/paligemma/modeling_paligemma.py @@ -206,6 +206,7 @@ def create_causal_mask_mapping( class PaliGemmaPreTrainedModel(PreTrainedModel): config: PaliGemmaConfig base_model_prefix = "" + input_modalities = ["image", "text"] supports_gradient_checkpointing = True _no_split_modules = ["PaliGemmaMultiModalProjector"] _skip_keys_device_placement = "past_key_values" diff --git a/src/transformers/models/parakeet/modeling_parakeet.py b/src/transformers/models/parakeet/modeling_parakeet.py index 9ee029de30f2..3b21980199fb 100644 --- a/src/transformers/models/parakeet/modeling_parakeet.py +++ b/src/transformers/models/parakeet/modeling_parakeet.py @@ -419,6 +419,7 @@ class ParakeetPreTrainedModel(PreTrainedModel): config: ParakeetCTCConfig base_model_prefix = "model" main_input_name = "input_features" + input_modalities = "audio" supports_gradient_checkpointing = True _no_split_modules = ["ParakeetEncoderBlock"] _supports_flat_attention_mask = True diff --git a/src/transformers/models/parakeet/modular_parakeet.py b/src/transformers/models/parakeet/modular_parakeet.py index dc9c10c5dfb1..47773a373c5a 100644 --- a/src/transformers/models/parakeet/modular_parakeet.py +++ b/src/transformers/models/parakeet/modular_parakeet.py @@ -303,6 +303,7 @@ class ParakeetPreTrainedModel(PreTrainedModel): config: ParakeetCTCConfig base_model_prefix = "model" main_input_name = "input_features" + input_modalities = "audio" supports_gradient_checkpointing = True _no_split_modules = ["ParakeetEncoderBlock"] _supports_flat_attention_mask = True diff --git a/src/transformers/models/patchtsmixer/modeling_patchtsmixer.py b/src/transformers/models/patchtsmixer/modeling_patchtsmixer.py index dd59ee37b203..67c4ba89091e 100644 --- a/src/transformers/models/patchtsmixer/modeling_patchtsmixer.py +++ 
b/src/transformers/models/patchtsmixer/modeling_patchtsmixer.py @@ -678,6 +678,7 @@ class PatchTSMixerPreTrainedModel(PreTrainedModel): config: PatchTSMixerConfig base_model_prefix = "model" main_input_name = "past_values" + input_modalities = "time" supports_gradient_checkpointing = False def _init_weights(self, module): diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 055ea0cc2203..4a16d34b8795 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -548,6 +548,7 @@ class PatchTSTPreTrainedModel(PreTrainedModel): config: PatchTSTConfig base_model_prefix = "model" main_input_name = "past_values" + input_modalities = "time" supports_gradient_checkpointing = False def _init_weights(self, module: nn.Module): diff --git a/src/transformers/models/perceiver/modeling_perceiver.py b/src/transformers/models/perceiver/modeling_perceiver.py index ddab1d412c32..f44489f50bfb 100755 --- a/src/transformers/models/perceiver/modeling_perceiver.py +++ b/src/transformers/models/perceiver/modeling_perceiver.py @@ -548,6 +548,7 @@ class PerceiverPreTrainedModel(PreTrainedModel): config: PerceiverConfig base_model_prefix = "perceiver" main_input_name = "inputs" + input_modalities = "image" # technically can be anything, but the HF implementation only has an image processor def _init_weights(self, module): """Initialize the weights""" diff --git a/src/transformers/models/perception_lm/modeling_perception_lm.py b/src/transformers/models/perception_lm/modeling_perception_lm.py index 074e91e14e88..9fb7ede3e9f8 100644 --- a/src/transformers/models/perception_lm/modeling_perception_lm.py +++ b/src/transformers/models/perception_lm/modeling_perception_lm.py @@ -90,6 +90,7 @@ def forward(self, features): class PerceptionLMPreTrainedModel(PreTrainedModel): config: PerceptionLMConfig base_model_prefix = "model" + input_modalities = ["image", "text"] supports_gradient_checkpointing = True _skip_keys_device_placement = "past_key_values" diff --git a/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py index d07200fc01f1..8c21832fccdb 100644 --- a/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +++ b/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py @@ -308,6 +308,7 @@ def default_flax_embed_init(tensor): class Phi4MultimodalVisionPreTrainedModel(PreTrainedModel): config: Phi4MultimodalVisionConfig base_model_prefix = "phi4_vision" + input_modalities = "image" supports_gradient_checkpointing = True _no_split_modules = ["Phi4MultimodalVisionEncoderLayer"] @@ -931,6 +932,7 @@ def forward(self, x): @auto_docstring class Phi4MultimodalAudioPreTrainedModel(PreTrainedModel): config: Phi4MultimodalAudioConfig + input_modalities = "audio" supports_gradient_checkpointing = True _no_split_modules = ["Phi4MultimodalAudioConformerEncoderLayer"] _supports_flash_attn = True @@ -1531,6 +1533,7 @@ class Phi4MultimodalPreTrainedModel(PreTrainedModel): "attentions": Phi4MultimodalAttention, } _version = "0.0.5" + input_modalities = ["image", "audio", "text"] def _init_weights(self, module): super()._init_weights(module) diff --git a/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py index 486d701a4311..9d9bc59d8fdb 100644 --- a/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py +++
b/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py @@ -537,6 +537,7 @@ def __init__(self, config: Phi4MultimodalVisionConfig): class Phi4MultimodalVisionPreTrainedModel(SiglipPreTrainedModel): config: Phi4MultimodalVisionConfig base_model_prefix = "phi4_vision" + input_modalities = "image" supports_gradient_checkpointing = True _no_split_modules = ["Phi4MultimodalVisionEncoderLayer"] @@ -1115,6 +1116,7 @@ def forward(self, x): @auto_docstring class Phi4MultimodalAudioPreTrainedModel(PreTrainedModel): config: Phi4MultimodalAudioConfig + input_modalities = "audio" supports_gradient_checkpointing = True _no_split_modules = ["Phi4MultimodalAudioConformerEncoderLayer"] _supports_flash_attn = True @@ -1445,6 +1447,8 @@ class Phi4MultimodalRotaryEmbedding(Phi3RotaryEmbedding): class Phi4MultimodalPreTrainedModel(Phi3PreTrainedModel): + input_modalities = ["image", "audio", "text"] + def _init_weights(self, module): PreTrainedModel._init_weights(self, module) if isinstance(module, Phi4MultimodalImageEmbedding): diff --git a/src/transformers/models/pix2struct/modeling_pix2struct.py b/src/transformers/models/pix2struct/modeling_pix2struct.py index dfb95b5ccd5a..6c56af9a9cce 100644 --- a/src/transformers/models/pix2struct/modeling_pix2struct.py +++ b/src/transformers/models/pix2struct/modeling_pix2struct.py @@ -339,6 +339,7 @@ def forward( @auto_docstring class Pix2StructPreTrainedModel(PreTrainedModel): config: Pix2StructConfig + input_modalities = ["image", "text"] _can_compile_fullgraph = False @@ -463,6 +464,7 @@ def _shift_right(self, input_ids): class Pix2StructVisionModel(Pix2StructPreTrainedModel): config: Pix2StructVisionConfig main_input_name = "flattened_patches" + input_modalities = "image" supports_gradient_checkpointing = True _no_split_modules = ["Pix2StructVisionLayer"] @@ -982,6 +984,7 @@ def forward( ) class Pix2StructTextModel(Pix2StructPreTrainedModel): config: Pix2StructTextConfig + input_modalities = "text" _no_split_modules = ["Pix2StructTextBlock"] _tied_weights_keys = ["lm_head.weight"] supports_gradient_checkpointing = True diff --git a/src/transformers/models/pixtral/modeling_pixtral.py b/src/transformers/models/pixtral/modeling_pixtral.py index 79bf0ee6bbda..7f46ab669eca 100644 --- a/src/transformers/models/pixtral/modeling_pixtral.py +++ b/src/transformers/models/pixtral/modeling_pixtral.py @@ -398,6 +398,7 @@ class PixtralPreTrainedModel(PreTrainedModel): config: PixtralVisionConfig base_model_prefix = "model" main_input_name = "pixel_values" + input_modalities = "image" supports_gradient_checkpointing = True _supports_attention_backend = True _supports_flash_attn = True diff --git a/src/transformers/models/poolformer/modeling_poolformer.py b/src/transformers/models/poolformer/modeling_poolformer.py index b7ee51991f94..a32b6dde21b5 100755 --- a/src/transformers/models/poolformer/modeling_poolformer.py +++ b/src/transformers/models/poolformer/modeling_poolformer.py @@ -242,6 +242,7 @@ class PoolFormerPreTrainedModel(PreTrainedModel): config: PoolFormerConfig base_model_prefix = "poolformer" main_input_name = "pixel_values" + input_modalities = "image" _no_split_modules = ["PoolFormerLayer"] def _init_weights(self, module): diff --git a/src/transformers/models/pop2piano/modeling_pop2piano.py b/src/transformers/models/pop2piano/modeling_pop2piano.py index 94c2a7515a44..3aaddec65f66 100644 --- a/src/transformers/models/pop2piano/modeling_pop2piano.py +++ b/src/transformers/models/pop2piano/modeling_pop2piano.py @@ -566,6 +566,7 @@ def forward( class 
Pop2PianoPreTrainedModel(PreTrainedModel): config: Pop2PianoConfig base_model_prefix = "transformer" + output_modalities = "audio" supports_gradient_checkpointing = True _can_compile_fullgraph = False diff --git a/src/transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py b/src/transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py index 8f2dd052474d..cf9b260a0fa3 100644 --- a/src/transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +++ b/src/transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py @@ -243,6 +243,7 @@ class PromptDepthAnythingPreTrainedModel(PreTrainedModel): config: PromptDepthAnythingConfig base_model_prefix = "prompt_depth_anything" main_input_name = "pixel_values" + input_modalities = "image" supports_gradient_checkpointing = True diff --git a/src/transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py b/src/transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py index 988143000ca1..33dfab832e71 100644 --- a/src/transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +++ b/src/transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py @@ -162,6 +162,7 @@ class PromptDepthAnythingPreTrainedModel(PreTrainedModel): config: PromptDepthAnythingConfig base_model_prefix = "prompt_depth_anything" main_input_name = "pixel_values" + input_modalities = "image" supports_gradient_checkpointing = True diff --git a/src/transformers/models/pvt/modeling_pvt.py b/src/transformers/models/pvt/modeling_pvt.py index e77f0d5d748a..12c67455153c 100755 --- a/src/transformers/models/pvt/modeling_pvt.py +++ b/src/transformers/models/pvt/modeling_pvt.py @@ -438,6 +438,7 @@ class PvtPreTrainedModel(PreTrainedModel): config: PvtConfig base_model_prefix = "pvt" main_input_name = "pixel_values" + input_modalities = "image" _no_split_modules = [] def _init_weights(self, module: nn.Module) -> None: diff --git a/src/transformers/models/pvt_v2/modeling_pvt_v2.py b/src/transformers/models/pvt_v2/modeling_pvt_v2.py index a5c2f1e97d8d..6bc9e19c1b55 100644 --- a/src/transformers/models/pvt_v2/modeling_pvt_v2.py +++ b/src/transformers/models/pvt_v2/modeling_pvt_v2.py @@ -384,6 +384,7 @@ class PvtV2PreTrainedModel(PreTrainedModel): config: PvtV2Config base_model_prefix = "pvt_v2" main_input_name = "pixel_values" + input_modalities = "image" supports_gradient_checkpointing = True def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: diff --git a/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py index c8b99164730e..dc0b80327116 100644 --- a/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py @@ -64,6 +64,7 @@ class Qwen2_5OmniPreTrainedModel(PreTrainedModel): config: Qwen2_5OmniConfig base_model_prefix = "model" + input_modalities = ["image", "video", "audio", "text"] supports_gradient_checkpointing = True _no_split_modules = ["Qwen2_5OmniDecoderLayer", "Qwen2_5OmniVisionBlock"] _skip_keys_device_placement = "past_key_values" @@ -74,6 +75,8 @@ class Qwen2_5OmniPreTrainedModel(PreTrainedModel): class Qwen2_5OmniPreTrainedModelForConditionalGeneration(Qwen2_5OmniPreTrainedModel): + input_modalities = ["image", "video", "audio", "text"] + def _prepare_4d_causal_attention_mask_with_cache_position( self, attention_mask: torch.Tensor, @@ -705,6 +708,7 @@ def forward(self, seqlen: 
int): class Qwen2_5OmniAudioEncoder(Qwen2_5OmniPreTrainedModel): config: Qwen2_5OmniAudioEncoderConfig main_input_name = "input_features" + input_modalities = "audio" _no_split_modules = ["Qwen2_5OmniAudioEncoderLayer"] _supports_sdpa = True @@ -1071,6 +1075,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class Qwen2_5OmniVisionEncoder(Qwen2_5OmniPreTrainedModel): config: Qwen2_5OmniVisionEncoderConfig _no_split_modules = ["Qwen2_5OmniVisionBlock"] + input_modalities = ["image", "video"] def __init__(self, config: Qwen2_5OmniVisionEncoderConfig, *inputs, **kwargs) -> None: super().__init__(config, *inputs, **kwargs) @@ -1500,6 +1505,7 @@ def forward( @auto_docstring class Qwen2_5OmniThinkerTextModel(Qwen2_5OmniPreTrainedModel): config: Qwen2_5OmniTextConfig + input_modalities = "text" _no_split_modules = ["Qwen2_5OmniDecoderLayer"] def __init__(self, config: Qwen2_5OmniTextConfig): @@ -2076,6 +2082,8 @@ class Qwen2_5OmniTalkerCausalLMOutputWithPast(ModelOutput): @auto_docstring class Qwen2_5OmniTalkerModel(Qwen2_5OmniPreTrainedModel): config: Qwen2_5OmniTalkerConfig + input_modalities = ["image", "video", "audio", "text"] + _no_split_modules = ["Qwen2_5OmniTalkerDecoderLayer"] def __init__(self, config: Qwen2_5OmniTalkerConfig): @@ -2234,6 +2242,7 @@ def forward( class Qwen2_5OmniTalkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForConditionalGeneration, GenerationMixin): config: Qwen2_5OmniTalkerConfig base_model_prefix = "talker" + output_modalities = "audio" def __init__(self, config: Qwen2_5OmniTalkerConfig): super().__init__(config) @@ -3338,6 +3347,7 @@ def forward(self, hidden_states): ) class Qwen2_5OmniToken2WavBigVGANModel(Qwen2_5OmniPreTrainedModel): config: Qwen2_5OmniBigVGANConfig + input_modalities = "audio" def __init__(self, config: Qwen2_5OmniBigVGANConfig): super().__init__(config) @@ -3473,6 +3483,7 @@ def integrate(self, time_points): ) class Qwen2_5OmniToken2WavDiTModel(Qwen2_5OmniPreTrainedModel): config: Qwen2_5OmniDiTConfig + input_modalities = "audio" _no_split_modules = ["DiTDecoderLayer"] def __init__(self, config: Qwen2_5OmniDiTConfig): @@ -3629,6 +3640,7 @@ def ode_function(time_step, hidden_states): class Qwen2_5OmniToken2WavModel(Qwen2_5OmniPreTrainedModel): config: Qwen2_5OmniToken2WavConfig base_model_prefix = "model" + input_modalities = "audio" _no_split_modules = ["Qwen2_5OmniToken2WavDiTModel", "Qwen2_5OmniToken2WavBigVGANModel"] def __init__(self, config: Qwen2_5OmniToken2WavConfig): @@ -3696,6 +3708,7 @@ def forward( ) class Qwen2_5OmniForConditionalGeneration(Qwen2_5OmniPreTrainedModel, GenerationMixin): config: Qwen2_5OmniConfig + output_modalities = ["audio", "text"] _no_split_modules = [ "Qwen2_5OmniTalkerForConditionalGeneration", "Qwen2_5OmniToken2WavModel", diff --git a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py index 6620765bc83a..6c0f17ff3557 100644 --- a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py @@ -1124,10 +1124,13 @@ def get_text_config(self, *args, **kwargs): class Qwen2_5OmniPreTrainedModel(Qwen2_5_VLPreTrainedModel): config: Qwen2_5OmniConfig + input_modalities = ["image", "video", "audio", "text"] _can_compile_fullgraph = False class Qwen2_5OmniPreTrainedModelForConditionalGeneration(Qwen2_5OmniPreTrainedModel): + input_modalities = ["image", "video", "audio", "text"] + def _prepare_4d_causal_attention_mask_with_cache_position( self, attention_mask: 
torch.Tensor, @@ -1704,6 +1707,7 @@ def forward(self, seqlen: int): class Qwen2_5OmniAudioEncoder(Qwen2_5OmniPreTrainedModel): config: Qwen2_5OmniAudioEncoderConfig main_input_name = "input_features" + input_modalities = "audio" _no_split_modules = ["Qwen2_5OmniAudioEncoderLayer"] _supports_sdpa = True @@ -1989,6 +1993,7 @@ def forward( class Qwen2_5OmniVisionEncoder(Qwen2_5_VisionTransformerPretrainedModel): config: Qwen2_5OmniVisionEncoderConfig + input_modalities = ["image", "video"] _no_split_modules = ["Qwen2_5OmniVisionBlock"] def __init__(self, config: Qwen2_5OmniVisionEncoderConfig, *inputs, **kwargs) -> None: @@ -2524,6 +2529,8 @@ class Qwen2_5OmniTalkerCausalLMOutputWithPast(ModelOutput): class Qwen2_5OmniTalkerModel(Qwen2_5_VLTextModel): config: Qwen2_5OmniTalkerConfig + input_modalities = ["image", "video", "audio", "text"] + _no_split_modules = ["Qwen2_5OmniTalkerDecoderLayer"] def __init__(self, config: Qwen2_5OmniTalkerConfig): @@ -2534,6 +2541,7 @@ def __init__(self, config: Qwen2_5OmniTalkerConfig): class Qwen2_5OmniTalkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForConditionalGeneration, GenerationMixin): config: Qwen2_5OmniTalkerConfig base_model_prefix = "talker" + output_modalities = "audio" def __init__(self, config: Qwen2_5OmniTalkerConfig): super().__init__(config) @@ -3638,6 +3646,7 @@ def forward(self, hidden_states): ) class Qwen2_5OmniToken2WavBigVGANModel(Qwen2_5OmniPreTrainedModel): config: Qwen2_5OmniBigVGANConfig + input_modalities = "audio" def __init__(self, config: Qwen2_5OmniBigVGANConfig): super().__init__(config) @@ -3773,6 +3782,7 @@ def integrate(self, time_points): ) class Qwen2_5OmniToken2WavDiTModel(Qwen2_5OmniPreTrainedModel): config: Qwen2_5OmniDiTConfig + input_modalities = "audio" _no_split_modules = ["DiTDecoderLayer"] def __init__(self, config: Qwen2_5OmniDiTConfig): @@ -3929,6 +3939,7 @@ def ode_function(time_step, hidden_states): class Qwen2_5OmniToken2WavModel(Qwen2_5OmniPreTrainedModel): config: Qwen2_5OmniToken2WavConfig base_model_prefix = "model" + input_modalities = "audio" _no_split_modules = ["Qwen2_5OmniToken2WavDiTModel", "Qwen2_5OmniToken2WavBigVGANModel"] def __init__(self, config: Qwen2_5OmniToken2WavConfig): @@ -3996,6 +4007,7 @@ def forward( ) class Qwen2_5OmniForConditionalGeneration(Qwen2_5OmniPreTrainedModel, GenerationMixin): config: Qwen2_5OmniConfig + output_modalities = ["audio", "text"] _no_split_modules = [ "Qwen2_5OmniTalkerForConditionalGeneration", "Qwen2_5OmniToken2WavModel", diff --git a/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py index 76fdae589618..d805e34d6058 100644 --- a/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py @@ -293,6 +293,7 @@ def forward( class Qwen2_5_VLPreTrainedModel(PreTrainedModel): config: Qwen2_5_VLConfig base_model_prefix = "model" + input_modalities = ["image", "video", "text"] supports_gradient_checkpointing = True _no_split_modules = ["Qwen2_5_VLDecoderLayer", "Qwen2_5_VLVisionBlock"] _skip_keys_device_placement = "past_key_values" @@ -768,6 +769,7 @@ def forward( @auto_docstring class Qwen2_5_VLTextModel(Qwen2_5_VLPreTrainedModel): config: Qwen2_5_VLTextConfig + input_modalities = "text" def __init__(self, config: Qwen2_5_VLTextConfig): super().__init__(config) diff --git a/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py b/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py index 226025c259bb..9208ee1930b5 
100644 --- a/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py +++ b/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py @@ -248,6 +248,7 @@ def forward( class Qwen2AudioPreTrainedModel(PreTrainedModel): config: Qwen2AudioConfig base_model_prefix = "model" + input_modalities = ["audio", "text"] supports_gradient_checkpointing = True _no_split_modules = ["Qwen2AudioAttention"] _skip_keys_device_placement = "past_key_values" @@ -294,6 +295,7 @@ class Qwen2AudioEncoder(Qwen2AudioPreTrainedModel): # Ignore copy config: Qwen2AudioEncoderConfig main_input_name = "input_features" + input_modalities = "audio" _no_split_modules = ["Qwen2AudioEncoderLayer"] def __init__(self, config: Qwen2AudioEncoderConfig): diff --git a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py index 269f37492ad6..3ccb0addd0c9 100644 --- a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py @@ -629,6 +629,7 @@ def forward( class Qwen2VLPreTrainedModel(PreTrainedModel): config: Qwen2VLConfig base_model_prefix = "model" + input_modalities = ["image", "video", "text"] supports_gradient_checkpointing = True _no_split_modules = ["Qwen2VLDecoderLayer", "Qwen2VLVisionBlock"] _skip_keys_device_placement = "past_key_values" @@ -642,6 +643,7 @@ class Qwen2VLPreTrainedModel(PreTrainedModel): @auto_docstring class Qwen2VisionTransformerPretrainedModel(Qwen2VLPreTrainedModel): config: Qwen2VLVisionConfig + input_modalities = ["image", "video"] _no_split_modules = ["Qwen2VLVisionBlock"] def __init__(self, config) -> None: @@ -739,6 +741,7 @@ def forward( @auto_docstring class Qwen2VLTextModel(Qwen2VLPreTrainedModel): config: Qwen2VLTextConfig + input_modalities = "text" def __init__(self, config: Qwen2VLTextConfig): super().__init__(config) diff --git a/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py index 796065d97561..734ef0a1d865 100644 --- a/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py @@ -67,6 +67,7 @@ class Qwen3OmniMoePreTrainedModel(PreTrainedModel): config: Qwen3OmniMoeConfig base_model_prefix = "model" + input_modalities = ["image", "video", "audio", "text"] supports_gradient_checkpointing = True _no_split_modules = ["Qwen3OmniMoeDecoderLayer", "Qwen3OmniMoeVisionBlock"] _skip_keys_device_placement = "past_key_values" @@ -88,6 +89,8 @@ def _get_feat_extract_output_lengths(input_lengths): class Qwen3OmniMoePreTrainedModelForConditionalGeneration(Qwen3OmniMoePreTrainedModel): + input_modalities = ["image", "video", "audio", "text"] + def _prepare_4d_causal_attention_mask_with_cache_position( self, attention_mask: torch.Tensor, @@ -634,6 +637,7 @@ def forward(self, seqlen: int): class Qwen3OmniMoeAudioEncoder(Qwen3OmniMoePreTrainedModel): config: Qwen3OmniMoeAudioEncoderConfig main_input_name = "input_features" + input_modalities = "audio" _no_split_modules = ["Qwen3OmniMoeAudioEncoderLayer"] _supports_sdpa = True @@ -3643,6 +3647,9 @@ def forward(self, hidden): class Qwen3OmniMoeCode2Wav(Qwen3OmniMoePreTrainedModel): + input_modalities = "audio" + output_modalities = "audio" + def __init__(self, config: Qwen3OmniMoeCode2WavConfig): super().__init__(config) self.total_upsample = np.prod(config.upsample_rates + config.upsampling_ratios) diff --git a/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py 
b/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py index 504cbb2f3689..e408916787cf 100644 --- a/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py @@ -2220,6 +2220,9 @@ def forward(self, hidden): class Qwen3OmniMoeCode2Wav(Qwen3OmniMoePreTrainedModel): + input_modalities = "audio" + output_modalities = "audio" + def __init__(self, config: Qwen3OmniMoeCode2WavConfig): super().__init__(config) self.total_upsample = np.prod(config.upsample_rates + config.upsampling_ratios) diff --git a/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py b/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py index 6fa91b03cea8..4794dd38dcbb 100644 --- a/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py +++ b/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py @@ -547,6 +547,7 @@ class Qwen3VLModelOutputWithPast(ModelOutput): class Qwen3VLPreTrainedModel(PreTrainedModel): config: Qwen3VLConfig base_model_prefix = "model" + input_modalities = ["image", "video", "text"] supports_gradient_checkpointing = True _no_split_modules = ["Qwen3VLTextDecoderLayer", "Qwen3VLVisionBlock"] _skip_keys_device_placement = "past_key_values" diff --git a/src/transformers/models/resnet/modeling_resnet.py b/src/transformers/models/resnet/modeling_resnet.py index 59a509fe03cd..801907aa1e63 100644 --- a/src/transformers/models/resnet/modeling_resnet.py +++ b/src/transformers/models/resnet/modeling_resnet.py @@ -247,6 +247,7 @@ class ResNetPreTrainedModel(PreTrainedModel): config: ResNetConfig base_model_prefix = "resnet" main_input_name = "pixel_values" + input_modalities = "image" _no_split_modules = ["ResNetConvLayer", "ResNetShortCut"] def _init_weights(self, module): diff --git a/src/transformers/models/rt_detr/modeling_rt_detr.py b/src/transformers/models/rt_detr/modeling_rt_detr.py index 1d4b64496969..b056669764a0 100644 --- a/src/transformers/models/rt_detr/modeling_rt_detr.py +++ b/src/transformers/models/rt_detr/modeling_rt_detr.py @@ -1007,6 +1007,7 @@ class RTDetrPreTrainedModel(PreTrainedModel): config: RTDetrConfig base_model_prefix = "rt_detr" main_input_name = "pixel_values" + input_modalities = "image" _no_split_modules = [r"RTDetrHybridEncoder", r"RTDetrDecoderLayer"] def _init_weights(self, module): diff --git a/src/transformers/models/rt_detr/modeling_rt_detr_resnet.py b/src/transformers/models/rt_detr/modeling_rt_detr_resnet.py index 21781dc3573f..b7e56abc170c 100644 --- a/src/transformers/models/rt_detr/modeling_rt_detr_resnet.py +++ b/src/transformers/models/rt_detr/modeling_rt_detr_resnet.py @@ -300,6 +300,7 @@ class RTDetrResNetPreTrainedModel(PreTrainedModel): config: RTDetrResNetConfig base_model_prefix = "resnet" main_input_name = "pixel_values" + input_modalities = "image" _no_split_modules = ["RTDetrResNetConvLayer", "RTDetrResNetShortCut"] def _init_weights(self, module): diff --git a/src/transformers/models/rt_detr_v2/modeling_rt_detr_v2.py b/src/transformers/models/rt_detr_v2/modeling_rt_detr_v2.py index 548a9378a2c0..6813fa465d5b 100644 --- a/src/transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +++ b/src/transformers/models/rt_detr_v2/modeling_rt_detr_v2.py @@ -454,6 +454,7 @@ class RTDetrV2PreTrainedModel(PreTrainedModel): config: RTDetrV2Config base_model_prefix = "rt_detr_v2" main_input_name = "pixel_values" + input_modalities = "image" _no_split_modules = [r"RTDetrV2HybridEncoder", r"RTDetrV2DecoderLayer"] def _init_weights(self, module): diff --git 
a/src/transformers/models/sam/modeling_sam.py b/src/transformers/models/sam/modeling_sam.py index 034f7485f426..dfa5e426fe74 100644 --- a/src/transformers/models/sam/modeling_sam.py +++ b/src/transformers/models/sam/modeling_sam.py @@ -1008,6 +1008,7 @@ class SamPreTrainedModel(PreTrainedModel): config: SamConfig base_model_prefix = "sam" main_input_name = "pixel_values" + input_modalities = "image" _no_split_modules = ["SamVisionAttention"] supports_gradient_checkpointing = True _supports_sdpa = True @@ -1110,6 +1111,7 @@ def forward( """ ) class SamModel(SamPreTrainedModel): + input_modalities = ["image", "text"] _tied_weights_keys = ["prompt_encoder.shared_embedding.positional_embedding"] # need to be ignored, as it's a buffer and will not be correctly detected as tied weight _keys_to_ignore_on_load_missing = ["prompt_encoder.shared_embedding.positional_embedding"] diff --git a/src/transformers/models/sam2/modeling_sam2.py b/src/transformers/models/sam2/modeling_sam2.py index a1f5489fed0f..122b16043488 100644 --- a/src/transformers/models/sam2/modeling_sam2.py +++ b/src/transformers/models/sam2/modeling_sam2.py @@ -550,6 +550,7 @@ class Sam2PreTrainedModel(PreTrainedModel): config_class = Sam2Config base_model_prefix = "sam2" main_input_name = "pixel_values" + input_modalities = "image" _supports_sdpa = True _supports_flash_attn_2 = True _supports_attention_backend = True @@ -1275,6 +1276,7 @@ def _dynamic_multimask_via_stability(self, all_mask_logits, all_iou_scores): """ ) class Sam2Model(Sam2PreTrainedModel): + input_modalities = ["image", "text"] _tied_weights_keys = ["prompt_encoder.shared_embedding.positional_embedding"] # need to be ignored, as it's a buffer and will not be correctly detected as tied weight _keys_to_ignore_on_load_missing = ["prompt_encoder.shared_embedding.positional_embedding"] diff --git a/src/transformers/models/sam2/modular_sam2.py b/src/transformers/models/sam2/modular_sam2.py index e6058db272fe..dbe7c1c11804 100644 --- a/src/transformers/models/sam2/modular_sam2.py +++ b/src/transformers/models/sam2/modular_sam2.py @@ -671,6 +671,7 @@ class Sam2PreTrainedModel(PreTrainedModel): config_class = Sam2Config base_model_prefix = "sam2" main_input_name = "pixel_values" + input_modalities = "image" _supports_sdpa = True _supports_flash_attn_2 = True _supports_attention_backend = True diff --git a/src/transformers/models/sam2_video/modeling_sam2_video.py b/src/transformers/models/sam2_video/modeling_sam2_video.py index 79d5b015f889..8f68297f238c 100644 --- a/src/transformers/models/sam2_video/modeling_sam2_video.py +++ b/src/transformers/models/sam2_video/modeling_sam2_video.py @@ -661,6 +661,7 @@ class Sam2VideoPreTrainedModel(PreTrainedModel): config_class = Sam2VideoConfig base_model_prefix = "sam2_video" main_input_name = "pixel_values" + input_modalities = "video" _supports_sdpa = True _supports_flash_attn_2 = True _supports_attention_backend = True @@ -1558,6 +1559,7 @@ def get_1d_sine_pe(pos_inds, dim, temperature=10000): @auto_docstring class Sam2VideoModel(Sam2VideoPreTrainedModel): + input_modalities = ["video", "text"] _tied_weights_keys = ["prompt_encoder.shared_embedding.positional_embedding"] # need to be ignored, as it's a buffer and will not be correctly detected as tied weight _keys_to_ignore_on_load_missing = ["prompt_encoder.shared_embedding.positional_embedding"] diff --git a/src/transformers/models/sam2_video/modular_sam2_video.py b/src/transformers/models/sam2_video/modular_sam2_video.py index 091844f0aa1c..865cbcaaba44 100644 --- 
a/src/transformers/models/sam2_video/modular_sam2_video.py +++ b/src/transformers/models/sam2_video/modular_sam2_video.py @@ -986,6 +986,7 @@ class Sam2VideoPreTrainedModel(PreTrainedModel): config_class = Sam2VideoConfig base_model_prefix = "sam2_video" main_input_name = "pixel_values" + input_modalities = "video" _supports_sdpa = True _supports_flash_attn_2 = True _supports_attention_backend = True @@ -1447,6 +1448,7 @@ def get_1d_sine_pe(pos_inds, dim, temperature=10000): @auto_docstring class Sam2VideoModel(Sam2Model): + input_modalities = ["video", "text"] _tied_weights_keys = ["prompt_encoder.shared_embedding.positional_embedding"] # need to be ignored, as it's a buffer and will not be correctly detected as tied weight _keys_to_ignore_on_load_missing = ["prompt_encoder.shared_embedding.positional_embedding"] diff --git a/src/transformers/models/sam_hq/modeling_sam_hq.py b/src/transformers/models/sam_hq/modeling_sam_hq.py index 5cc322184faa..20e39a0dec5f 100644 --- a/src/transformers/models/sam_hq/modeling_sam_hq.py +++ b/src/transformers/models/sam_hq/modeling_sam_hq.py @@ -423,6 +423,7 @@ class SamHQPreTrainedModel(PreTrainedModel): config: SamHQConfig base_model_prefix = "sam_hq" main_input_name = "pixel_values" + input_modalities = "image" _no_split_modules = ["SamHQVisionAttention"] supports_gradient_checkpointing = True _supports_sdpa = True @@ -1233,6 +1234,7 @@ def forward( """ ) class SamHQModel(SamHQPreTrainedModel): + input_modalities = ["image", "text"] _tied_weights_keys = ["prompt_encoder.shared_embedding.positional_embedding"] _keys_to_ignore_on_load_missing = ["prompt_encoder.shared_embedding.positional_embedding"] _can_record_outputs = {"mask_decoder_attentions": OutputRecorder(SamHQTwoWayAttentionBlock, index=2)} diff --git a/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py b/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py index 44bfff010b31..5926473b718c 100755 --- a/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py +++ b/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py @@ -1452,6 +1452,7 @@ def compute_last_hidden_states_per_sample( ) class SeamlessM4TSpeechEncoder(SeamlessM4TPreTrainedModel): main_input_name = "input_features" + input_modalities = "audio" def __init__(self, config: SeamlessM4TConfig): super().__init__(config) @@ -2291,6 +2292,7 @@ def forward(self, input_embeds: torch.FloatTensor) -> torch.FloatTensor: class SeamlessM4TCodeHifiGan(PreTrainedModel): config: SeamlessM4TConfig main_input_name = "input_embeds" + input_modalities = "audio" _no_split_modules = [] def __init__(self, config): @@ -2715,6 +2717,7 @@ def generate( """ ) class SeamlessM4TForSpeechToText(SeamlessM4TPreTrainedModel, GenerationMixin): + input_modalities = "audio" _keys_to_ignore_on_load_missing = ["text_encoder", "t2u_model", "vocoder"] main_input_name = "input_features" @@ -2976,6 +2979,7 @@ def generate( """ ) class SeamlessM4TForTextToSpeech(SeamlessM4TPreTrainedModel, GenerationMixin): + output_modalities = "audio" _keys_to_ignore_on_load_missing = ["speech_encoder"] main_input_name = "input_ids" @@ -3299,6 +3303,8 @@ def generate( """ ) class SeamlessM4TForSpeechToSpeech(SeamlessM4TPreTrainedModel, GenerationMixin): + input_modalities = "audio" + output_modalities = "audio" _keys_to_ignore_on_load_missing = ["text_encoder"] main_input_name = "input_features" @@ -3630,6 +3636,8 @@ def generate( """ ) class SeamlessM4TModel(SeamlessM4TPreTrainedModel, GenerationMixin): + input_modalities = ["audio", "text"] + 
output_modalities = ["audio", "text"] _tied_weights_keys = [ "lm_head.weight", "text_encoder.embed_tokens.weight", diff --git a/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py b/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py index 27146297884c..841892d90ece 100644 --- a/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +++ b/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py @@ -1492,6 +1492,7 @@ def _hard_upsample(self, hidden_states, durations): # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TSpeechEncoder with SeamlessM4T->SeamlessM4Tv2 class SeamlessM4Tv2SpeechEncoder(SeamlessM4Tv2PreTrainedModel): main_input_name = "input_features" + input_modalities = "audio" def __init__(self, config: SeamlessM4Tv2Config): super().__init__(config) @@ -2488,6 +2489,7 @@ def forward(self, input_embeds: torch.FloatTensor) -> torch.FloatTensor: class SeamlessM4Tv2CodeHifiGan(PreTrainedModel): config: SeamlessM4Tv2Config main_input_name = "input_embeds" + input_modalities = "audio" _no_split_modules = [] def __init__(self, config): @@ -2922,6 +2924,7 @@ def generate( """ ) class SeamlessM4Tv2ForSpeechToText(SeamlessM4Tv2PreTrainedModel, GenerationMixin): + input_modalities = "audio" _keys_to_ignore_on_load_missing = ["text_encoder", "t2u_model", "vocoder"] main_input_name = "input_features" @@ -3191,6 +3194,7 @@ def generate( """ ) class SeamlessM4Tv2ForTextToSpeech(SeamlessM4Tv2PreTrainedModel, GenerationMixin): + output_modalities = "audio" _keys_to_ignore_on_load_missing = ["speech_encoder"] main_input_name = "input_ids" @@ -3552,6 +3556,8 @@ def generate( """ ) class SeamlessM4Tv2ForSpeechToSpeech(SeamlessM4Tv2PreTrainedModel, GenerationMixin): + input_modalities = "audio" + output_modalities = "audio" _keys_to_ignore_on_load_missing = ["text_encoder"] main_input_name = "input_features" @@ -3920,6 +3926,8 @@ def generate( """ ) class SeamlessM4Tv2Model(SeamlessM4Tv2PreTrainedModel, GenerationMixin): + input_modalities = ["audio", "text"] + output_modalities = ["audio", "text"] _tied_weights_keys = [ "lm_head.weight", "text_encoder.embed_tokens.weight", diff --git a/src/transformers/models/segformer/modeling_segformer.py b/src/transformers/models/segformer/modeling_segformer.py index 0b06c7c39e09..0bbf51563735 100755 --- a/src/transformers/models/segformer/modeling_segformer.py +++ b/src/transformers/models/segformer/modeling_segformer.py @@ -432,6 +432,7 @@ class SegformerPreTrainedModel(PreTrainedModel): config: SegformerConfig base_model_prefix = "segformer" main_input_name = "pixel_values" + input_modalities = "image" def _init_weights(self, module): """Initialize the weights""" diff --git a/src/transformers/models/seggpt/modeling_seggpt.py b/src/transformers/models/seggpt/modeling_seggpt.py index 1fde52bae079..73ba1ba10c5f 100644 --- a/src/transformers/models/seggpt/modeling_seggpt.py +++ b/src/transformers/models/seggpt/modeling_seggpt.py @@ -591,6 +591,7 @@ class SegGptPreTrainedModel(PreTrainedModel): config: SegGptConfig base_model_prefix = "model" main_input_name = "pixel_values" + input_modalities = "image" supports_gradient_checkpointing = True _no_split_modules = ["SegGptEmbeddings", "SegGptLayer"] diff --git a/src/transformers/models/sew/modeling_sew.py b/src/transformers/models/sew/modeling_sew.py index 4a4cd9587979..2bfa4e98a083 100644 --- a/src/transformers/models/sew/modeling_sew.py +++ b/src/transformers/models/sew/modeling_sew.py @@ -508,6 +508,7 @@ class 
SEWPreTrainedModel(PreTrainedModel): config: SEWConfig base_model_prefix = "sew" main_input_name = "input_values" + input_modalities = "audio" supports_gradient_checkpointing = True _supports_flash_attn = True _supports_sdpa = True diff --git a/src/transformers/models/sew/modular_sew.py b/src/transformers/models/sew/modular_sew.py index 4cf1464805a0..8a2cfc3a2689 100644 --- a/src/transformers/models/sew/modular_sew.py +++ b/src/transformers/models/sew/modular_sew.py @@ -249,6 +249,7 @@ class SEWPreTrainedModel(PreTrainedModel): config: SEWConfig base_model_prefix = "sew" main_input_name = "input_values" + input_modalities = "audio" supports_gradient_checkpointing = True _supports_flash_attn = True _supports_sdpa = True diff --git a/src/transformers/models/sew_d/modeling_sew_d.py b/src/transformers/models/sew_d/modeling_sew_d.py index 99253578db5f..7dda40514663 100644 --- a/src/transformers/models/sew_d/modeling_sew_d.py +++ b/src/transformers/models/sew_d/modeling_sew_d.py @@ -1184,6 +1184,7 @@ class SEWDPreTrainedModel(PreTrainedModel): config: SEWDConfig base_model_prefix = "sew-d" main_input_name = "input_values" + input_modalities = "audio" supports_gradient_checkpointing = True def _init_weights(self, module): diff --git a/src/transformers/models/shieldgemma2/modeling_shieldgemma2.py b/src/transformers/models/shieldgemma2/modeling_shieldgemma2.py index 5e12b0129ab6..36fd972de140 100644 --- a/src/transformers/models/shieldgemma2/modeling_shieldgemma2.py +++ b/src/transformers/models/shieldgemma2/modeling_shieldgemma2.py @@ -44,6 +44,7 @@ class ShieldGemma2ImageClassifierOutputWithNoAttention(ImageClassifierOutputWith @auto_docstring class ShieldGemma2ForImageClassification(PreTrainedModel): config: ShieldGemma2Config + input_modalities = ["image", "text"] _checkpoint_conversion_mapping = { "model.language_model.model": "model.model.language_model", "model.vision_tower": "model.model.vision_tower", diff --git a/src/transformers/models/siglip/modeling_siglip.py b/src/transformers/models/siglip/modeling_siglip.py index 072ec9721f04..aace5e839e13 100644 --- a/src/transformers/models/siglip/modeling_siglip.py +++ b/src/transformers/models/siglip/modeling_siglip.py @@ -465,6 +465,7 @@ def forward( class SiglipPreTrainedModel(PreTrainedModel): config: SiglipConfig base_model_prefix = "siglip" + input_modalities = ["image", "text"] supports_gradient_checkpointing = True _no_split_modules = [ @@ -628,6 +629,7 @@ def forward( ) class SiglipTextModel(SiglipPreTrainedModel): config: SiglipTextConfig + input_modalities = "text" def __init__(self, config: SiglipTextConfig): super().__init__(config) @@ -745,6 +747,7 @@ def forward(self, hidden_state): class SiglipVisionModel(SiglipPreTrainedModel): config: SiglipVisionConfig main_input_name = "pixel_values" + input_modalities = "image" def __init__(self, config: SiglipVisionConfig): super().__init__(config) @@ -1004,6 +1007,7 @@ def forward( ) class SiglipForImageClassification(SiglipPreTrainedModel): main_input_name = "pixel_values" + input_modalities = "image" def __init__(self, config: SiglipConfig) -> None: super().__init__(config) diff --git a/src/transformers/models/siglip2/modeling_siglip2.py b/src/transformers/models/siglip2/modeling_siglip2.py index 4fbaf32dff90..d2273d557d15 100644 --- a/src/transformers/models/siglip2/modeling_siglip2.py +++ b/src/transformers/models/siglip2/modeling_siglip2.py @@ -540,6 +540,7 @@ def default_flax_embed_init(tensor): class Siglip2PreTrainedModel(PreTrainedModel): config: Siglip2Config base_model_prefix = 
"siglip2" + input_modalities = ["image", "text"] supports_gradient_checkpointing = True _no_split_modules = [ @@ -708,6 +709,7 @@ def forward( ) class Siglip2TextModel(Siglip2PreTrainedModel): config: Siglip2TextConfig + input_modalities = "text" def __init__(self, config: Siglip2TextConfig): super().__init__(config) @@ -794,6 +796,7 @@ def forward(self, hidden_state: torch.Tensor, attention_mask: Optional[torch.Ten class Siglip2VisionModel(Siglip2PreTrainedModel): config: Siglip2VisionConfig main_input_name = "pixel_values" + input_modalities = "image" def __init__(self, config: Siglip2VisionConfig): super().__init__(config) @@ -1083,6 +1086,7 @@ def forward( ) class Siglip2ForImageClassification(Siglip2PreTrainedModel): main_input_name = "pixel_values" + input_modalities = "image" def __init__(self, config: Siglip2Config) -> None: super().__init__(config) diff --git a/src/transformers/models/smolvlm/modeling_smolvlm.py b/src/transformers/models/smolvlm/modeling_smolvlm.py index 5ff2b041dd2d..ca21b0ca0c9b 100644 --- a/src/transformers/models/smolvlm/modeling_smolvlm.py +++ b/src/transformers/models/smolvlm/modeling_smolvlm.py @@ -72,6 +72,7 @@ def extra_repr(self): class SmolVLMPreTrainedModel(PreTrainedModel): config: SmolVLMConfig base_model_prefix = "model" + input_modalities = ["image", "text"] supports_gradient_checkpointing = True _no_split_modules = ["SmolVLMVisionAttention", "SmolVLMDecoderLayer"] _skip_keys_device_placement = "past_key_values" @@ -340,6 +341,7 @@ def forward( ) class SmolVLMVisionTransformer(SmolVLMPreTrainedModel): config: SmolVLMVisionConfig + input_modalities = "image" _supports_sdpa = True _supports_flash_attn = True _supports_flex_attn = True diff --git a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py index 2fb5160ed90b..5db0e2ff2605 100644 --- a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +++ b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py @@ -65,6 +65,7 @@ class SpeechEncoderDecoderModel(PreTrainedModel, GenerationMixin): config: SpeechEncoderDecoderConfig base_model_prefix = "speech_encoder_decoder" main_input_name = "inputs" + input_modalities = "audio" supports_gradient_checkpointing = True _supports_param_buffer_assignment = False _supports_flash_attn = True diff --git a/src/transformers/models/speech_to_text/modeling_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_speech_to_text.py index 43b6511b2314..188f68ef83c9 100755 --- a/src/transformers/models/speech_to_text/modeling_speech_to_text.py +++ b/src/transformers/models/speech_to_text/modeling_speech_to_text.py @@ -1119,6 +1119,7 @@ def forward( """ ) class Speech2TextForConditionalGeneration(Speech2TextPreTrainedModel, GenerationMixin): + input_modalities = ["audio", "text"] base_model_prefix = "model" _tied_weights_keys = ["lm_head.weight"] diff --git a/src/transformers/models/speecht5/modeling_speecht5.py b/src/transformers/models/speecht5/modeling_speecht5.py index b6f74b527ffb..fe7ac2da65b6 100644 --- a/src/transformers/models/speecht5/modeling_speecht5.py +++ b/src/transformers/models/speecht5/modeling_speecht5.py @@ -1177,6 +1177,7 @@ class SpeechT5PreTrainedModel(PreTrainedModel): config: SpeechT5Config base_model_prefix = "speecht5" main_input_name = "input_values" + input_modalities = "audio" supports_gradient_checkpointing = True def _init_weights(self, module: nn.Module): @@ 
-2331,6 +2332,8 @@ def _generate_speech( """ ) class SpeechT5ForTextToSpeech(SpeechT5PreTrainedModel): + input_modalities = "text" + output_modalities = "audio" main_input_name = "input_ids" def __init__(self, config: SpeechT5Config): @@ -2685,6 +2688,8 @@ def generate_speech( """ ) class SpeechT5ForSpeechToSpeech(SpeechT5PreTrainedModel): + output_modalities = ["audio", "text"] + def __init__(self, config: SpeechT5Config): super().__init__(config) diff --git a/src/transformers/models/superglue/modeling_superglue.py b/src/transformers/models/superglue/modeling_superglue.py index 0f2f86799b7f..80c07a373392 100644 --- a/src/transformers/models/superglue/modeling_superglue.py +++ b/src/transformers/models/superglue/modeling_superglue.py @@ -513,6 +513,7 @@ class SuperGluePreTrainedModel(PreTrainedModel): config: SuperGlueConfig base_model_prefix = "superglue" main_input_name = "pixel_values" + input_modalities = "image" def _init_weights(self, module: nn.Module) -> None: """Initialize the weights""" diff --git a/src/transformers/models/superpoint/modeling_superpoint.py b/src/transformers/models/superpoint/modeling_superpoint.py index f75cc6f9bb8f..c211705aaefd 100644 --- a/src/transformers/models/superpoint/modeling_superpoint.py +++ b/src/transformers/models/superpoint/modeling_superpoint.py @@ -325,6 +325,7 @@ class SuperPointPreTrainedModel(PreTrainedModel): config: SuperPointConfig base_model_prefix = "superpoint" main_input_name = "pixel_values" + input_modalities = "image" supports_gradient_checkpointing = False def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: diff --git a/src/transformers/models/swiftformer/modeling_swiftformer.py b/src/transformers/models/swiftformer/modeling_swiftformer.py index 7ecd94a8fd52..9eed87cd4166 100644 --- a/src/transformers/models/swiftformer/modeling_swiftformer.py +++ b/src/transformers/models/swiftformer/modeling_swiftformer.py @@ -384,6 +384,7 @@ class SwiftFormerPreTrainedModel(PreTrainedModel): config: SwiftFormerConfig base_model_prefix = "swiftformer" main_input_name = "pixel_values" + input_modalities = "image" supports_gradient_checkpointing = True _no_split_modules = ["SwiftFormerEncoderBlock"] diff --git a/src/transformers/models/swin/modeling_swin.py b/src/transformers/models/swin/modeling_swin.py index c9fdc0d7d044..7d8a85ba79ea 100644 --- a/src/transformers/models/swin/modeling_swin.py +++ b/src/transformers/models/swin/modeling_swin.py @@ -826,6 +826,7 @@ class SwinPreTrainedModel(PreTrainedModel): config: SwinConfig base_model_prefix = "swin" main_input_name = "pixel_values" + input_modalities = "image" supports_gradient_checkpointing = True _no_split_modules = ["SwinStage"] diff --git a/src/transformers/models/swin2sr/modeling_swin2sr.py b/src/transformers/models/swin2sr/modeling_swin2sr.py index 4dfb5b4f743e..a5ce9fcb1c9b 100644 --- a/src/transformers/models/swin2sr/modeling_swin2sr.py +++ b/src/transformers/models/swin2sr/modeling_swin2sr.py @@ -707,6 +707,7 @@ class Swin2SRPreTrainedModel(PreTrainedModel): config: Swin2SRConfig base_model_prefix = "swin2sr" main_input_name = "pixel_values" + input_modalities = "image" supports_gradient_checkpointing = True def _init_weights(self, module): diff --git a/src/transformers/models/swinv2/modeling_swinv2.py b/src/transformers/models/swinv2/modeling_swinv2.py index 33be714f96b3..7af4cfeb2ae6 100644 --- a/src/transformers/models/swinv2/modeling_swinv2.py +++ b/src/transformers/models/swinv2/modeling_swinv2.py @@ -901,6 +901,7 @@ class 
Swinv2PreTrainedModel(PreTrainedModel): config: Swinv2Config base_model_prefix = "swinv2" main_input_name = "pixel_values" + input_modalities = "image" supports_gradient_checkpointing = True _no_split_modules = ["Swinv2Stage"] diff --git a/src/transformers/models/table_transformer/modeling_table_transformer.py b/src/transformers/models/table_transformer/modeling_table_transformer.py index 4ab85689ab15..90e687b14ffd 100644 --- a/src/transformers/models/table_transformer/modeling_table_transformer.py +++ b/src/transformers/models/table_transformer/modeling_table_transformer.py @@ -687,6 +687,7 @@ class TableTransformerPreTrainedModel(PreTrainedModel): config: TableTransformerConfig base_model_prefix = "model" main_input_name = "pixel_values" + input_modalities = "image" _no_split_modules = [ r"TableTransformerConvEncoder", r"TableTransformerEncoderLayer", diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 5a7c6fac3e10..5f8c71a250cf 100644 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -615,6 +615,7 @@ class TimeSeriesTransformerPreTrainedModel(PreTrainedModel): config: TimeSeriesTransformerConfig base_model_prefix = "model" main_input_name = "past_values" + input_modalities = "time" supports_gradient_checkpointing = True # TODO: tests would need a rewrite to check for correct implementation # Current tests always assume certain inputs to be passed diff --git a/src/transformers/models/timesfm/modeling_timesfm.py b/src/transformers/models/timesfm/modeling_timesfm.py index dd6f352376ff..f3396482c96e 100644 --- a/src/transformers/models/timesfm/modeling_timesfm.py +++ b/src/transformers/models/timesfm/modeling_timesfm.py @@ -303,6 +303,7 @@ class TimesFmPreTrainedModel(PreTrainedModel): base_model_prefix = "timesfm" _no_split_modules = ["TimesFmDecoderLayer"] main_input_name = "past_values" + input_modalities = "time" _supports_sdpa = True def _init_weights(self, module): diff --git a/src/transformers/models/timesfm/modular_timesfm.py b/src/transformers/models/timesfm/modular_timesfm.py index b82816e7c737..2e48fcf3fc8e 100644 --- a/src/transformers/models/timesfm/modular_timesfm.py +++ b/src/transformers/models/timesfm/modular_timesfm.py @@ -259,6 +259,7 @@ class TimesFmPreTrainedModel(PreTrainedModel): base_model_prefix = "timesfm" _no_split_modules = ["TimesFmDecoderLayer"] main_input_name = "past_values" + input_modalities = "time" _supports_sdpa = True def _init_weights(self, module): diff --git a/src/transformers/models/timesformer/modeling_timesformer.py b/src/transformers/models/timesformer/modeling_timesformer.py index 1c125dac4f32..18ed043e6915 100644 --- a/src/transformers/models/timesformer/modeling_timesformer.py +++ b/src/transformers/models/timesformer/modeling_timesformer.py @@ -451,6 +451,7 @@ class TimesformerPreTrainedModel(PreTrainedModel): config: TimesformerConfig base_model_prefix = "timesformer" main_input_name = "pixel_values" + input_modalities = "image" supports_gradient_checkpointing = True _no_split_modules = ["TimesformerLayer"] diff --git a/src/transformers/models/timm_backbone/modeling_timm_backbone.py b/src/transformers/models/timm_backbone/modeling_timm_backbone.py index 4959f76d588a..d446fc96f71b 100644 --- a/src/transformers/models/timm_backbone/modeling_timm_backbone.py +++ 
b/src/transformers/models/timm_backbone/modeling_timm_backbone.py @@ -39,6 +39,7 @@ class TimmBackbone(PreTrainedModel, BackboneMixin): """ main_input_name = "pixel_values" + input_modalities = "image" supports_gradient_checkpointing = False config: TimmBackboneConfig diff --git a/src/transformers/models/timm_wrapper/modeling_timm_wrapper.py b/src/transformers/models/timm_wrapper/modeling_timm_wrapper.py index d388ff05297f..970349054697 100644 --- a/src/transformers/models/timm_wrapper/modeling_timm_wrapper.py +++ b/src/transformers/models/timm_wrapper/modeling_timm_wrapper.py @@ -80,6 +80,7 @@ def _create_timm_model_with_error_handling(config: "TimmWrapperConfig", **model_ @auto_docstring class TimmWrapperPreTrainedModel(PreTrainedModel): main_input_name = "pixel_values" + input_modalities = "image" config: TimmWrapperConfig _no_split_modules = [] model_tags = ["timm"] diff --git a/src/transformers/models/tvp/modeling_tvp.py b/src/transformers/models/tvp/modeling_tvp.py index eb6e3da17b38..02e4d339ff2b 100644 --- a/src/transformers/models/tvp/modeling_tvp.py +++ b/src/transformers/models/tvp/modeling_tvp.py @@ -544,6 +544,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class TvpPreTrainedModel(PreTrainedModel): config: TvpConfig base_model_prefix = "model" + input_modalities = ["video", "text"] supports_gradient_checkpointing = True def _init_weights(self, module: nn.Module): diff --git a/src/transformers/models/udop/modeling_udop.py b/src/transformers/models/udop/modeling_udop.py index 58de9c10b117..9b41195b8169 100644 --- a/src/transformers/models/udop/modeling_udop.py +++ b/src/transformers/models/udop/modeling_udop.py @@ -253,6 +253,7 @@ def forward(self, pixel_values): class UdopPreTrainedModel(PreTrainedModel): config: UdopConfig base_model_prefix = "transformer" + input_modalities = ["image", "text"] supports_gradient_checkpointing = True _can_compile_fullgraph = False diff --git a/src/transformers/models/unispeech/modeling_unispeech.py b/src/transformers/models/unispeech/modeling_unispeech.py index 2359ad1c9512..8d3ea75ca2b7 100755 --- a/src/transformers/models/unispeech/modeling_unispeech.py +++ b/src/transformers/models/unispeech/modeling_unispeech.py @@ -772,6 +772,7 @@ class UniSpeechPreTrainedModel(PreTrainedModel): config: UniSpeechConfig base_model_prefix = "unispeech" main_input_name = "input_values" + input_modalities = "audio" supports_gradient_checkpointing = True _supports_flash_attn = True _supports_sdpa = True diff --git a/src/transformers/models/unispeech/modular_unispeech.py b/src/transformers/models/unispeech/modular_unispeech.py index 900079b7bb9b..534490235db1 100644 --- a/src/transformers/models/unispeech/modular_unispeech.py +++ b/src/transformers/models/unispeech/modular_unispeech.py @@ -141,6 +141,7 @@ class UniSpeechPreTrainedModel(PreTrainedModel): config: UniSpeechConfig base_model_prefix = "unispeech" main_input_name = "input_values" + input_modalities = "audio" supports_gradient_checkpointing = True _supports_flash_attn = True _supports_sdpa = True diff --git a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py index f880c960556b..086a5daecfba 100755 --- a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py +++ b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py @@ -777,6 +777,7 @@ class UniSpeechSatPreTrainedModel(PreTrainedModel): config: UniSpeechSatConfig base_model_prefix = "unispeech_sat" main_input_name = "input_values" + 
input_modalities = "audio" supports_gradient_checkpointing = True _supports_flash_attn = True _supports_sdpa = True diff --git a/src/transformers/models/unispeech_sat/modular_unispeech_sat.py b/src/transformers/models/unispeech_sat/modular_unispeech_sat.py index 3e1d99939215..e209c7c18ea3 100644 --- a/src/transformers/models/unispeech_sat/modular_unispeech_sat.py +++ b/src/transformers/models/unispeech_sat/modular_unispeech_sat.py @@ -153,6 +153,7 @@ class UniSpeechSatPreTrainedModel(PreTrainedModel): config: UniSpeechSatConfig base_model_prefix = "unispeech_sat" main_input_name = "input_values" + input_modalities = "audio" supports_gradient_checkpointing = True _supports_flash_attn = True _supports_sdpa = True diff --git a/src/transformers/models/univnet/modeling_univnet.py b/src/transformers/models/univnet/modeling_univnet.py index e7595ff38f8a..048d68e7276a 100644 --- a/src/transformers/models/univnet/modeling_univnet.py +++ b/src/transformers/models/univnet/modeling_univnet.py @@ -427,6 +427,7 @@ def remove_weight_norm(self): class UnivNetModel(PreTrainedModel): config: UnivNetConfig main_input_name = "input_features" + input_modalities = "audio" def __init__(self, config: UnivNetConfig): super().__init__(config) diff --git a/src/transformers/models/upernet/modeling_upernet.py b/src/transformers/models/upernet/modeling_upernet.py index 36dc90c30adb..5c9521766379 100644 --- a/src/transformers/models/upernet/modeling_upernet.py +++ b/src/transformers/models/upernet/modeling_upernet.py @@ -269,6 +269,7 @@ def forward(self, encoder_hidden_states: torch.Tensor) -> torch.Tensor: class UperNetPreTrainedModel(PreTrainedModel): config: UperNetConfig main_input_name = "pixel_values" + input_modalities = "image" _no_split_modules = [] def _init_weights(self, module): diff --git a/src/transformers/models/video_llava/modeling_video_llava.py b/src/transformers/models/video_llava/modeling_video_llava.py index 2db424455087..3f874c2e9353 100644 --- a/src/transformers/models/video_llava/modeling_video_llava.py +++ b/src/transformers/models/video_llava/modeling_video_llava.py @@ -125,6 +125,7 @@ def forward(self, image_features): class VideoLlavaPreTrainedModel(PreTrainedModel): config: VideoLlavaConfig base_model_prefix = "" + input_modalities = ["image", "video", "text"] supports_gradient_checkpointing = True _no_split_modules = ["VideoLlavaVisionAttention"] _skip_keys_device_placement = "past_key_values" diff --git a/src/transformers/models/videomae/modeling_videomae.py b/src/transformers/models/videomae/modeling_videomae.py index 497e4f47b9e1..5cc221d8b1b5 100755 --- a/src/transformers/models/videomae/modeling_videomae.py +++ b/src/transformers/models/videomae/modeling_videomae.py @@ -400,6 +400,7 @@ class VideoMAEPreTrainedModel(PreTrainedModel): config: VideoMAEConfig base_model_prefix = "videomae" main_input_name = "pixel_values" + input_modalities = "video" supports_gradient_checkpointing = True _no_split_modules = ["VideoMAEEmbeddings", "VideoMAELayer"] _supports_sdpa = True diff --git a/src/transformers/models/vilt/modeling_vilt.py b/src/transformers/models/vilt/modeling_vilt.py index 386883969916..9ab4a6f91b3d 100755 --- a/src/transformers/models/vilt/modeling_vilt.py +++ b/src/transformers/models/vilt/modeling_vilt.py @@ -532,6 +532,7 @@ def forward( class ViltPreTrainedModel(PreTrainedModel): config: ViltConfig base_model_prefix = "vilt" + input_modalities = ["image", "text"] supports_gradient_checkpointing = True _no_split_modules = ["ViltEmbeddings", "ViltSelfAttention"] diff --git 
a/src/transformers/models/vipllava/modeling_vipllava.py b/src/transformers/models/vipllava/modeling_vipllava.py index f9a376120d63..16606f8ccf4d 100644 --- a/src/transformers/models/vipllava/modeling_vipllava.py +++ b/src/transformers/models/vipllava/modeling_vipllava.py @@ -114,6 +114,7 @@ def forward(self, hidden_states): class VipLlavaPreTrainedModel(PreTrainedModel): config: VipLlavaConfig base_model_prefix = "" + input_modalities = ["image", "text"] supports_gradient_checkpointing = True _skip_keys_device_placement = "past_key_values" diff --git a/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py index 62e44a365f89..ff748b0939da 100644 --- a/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +++ b/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py @@ -64,6 +64,7 @@ class VisionEncoderDecoderModel(PreTrainedModel, GenerationMixin): config: VisionEncoderDecoderConfig base_model_prefix = "vision_encoder_decoder" main_input_name = "pixel_values" + input_modalities = ["image", "text"] supports_gradient_checkpointing = True _supports_param_buffer_assignment = False _supports_flash_attn = True diff --git a/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py index 50d0c433cfce..0f7f86bb1458 100755 --- a/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +++ b/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py @@ -47,6 +47,7 @@ def clip_loss(similarity: torch.Tensor) -> torch.Tensor: class VisionTextDualEncoderModel(PreTrainedModel): config: VisionTextDualEncoderConfig base_model_prefix = "vision_text_dual_encoder" + input_modalities = ["image", "text"] _supports_flash_attn = True _supports_sdpa = True diff --git a/src/transformers/models/visual_bert/modeling_visual_bert.py b/src/transformers/models/visual_bert/modeling_visual_bert.py index 4ee8d2701738..888ca0b2cb0c 100755 --- a/src/transformers/models/visual_bert/modeling_visual_bert.py +++ b/src/transformers/models/visual_bert/modeling_visual_bert.py @@ -483,6 +483,7 @@ def forward(self, sequence_output, pooled_output): class VisualBertPreTrainedModel(PreTrainedModel): config: VisualBertConfig base_model_prefix = "visual_bert" + input_modalities = ["image", "text"] supports_gradient_checkpointing = True def _init_weights(self, module): diff --git a/src/transformers/models/vit/modeling_vit.py b/src/transformers/models/vit/modeling_vit.py index 0aad13ca896a..b071f664c569 100644 --- a/src/transformers/models/vit/modeling_vit.py +++ b/src/transformers/models/vit/modeling_vit.py @@ -372,6 +372,7 @@ class ViTPreTrainedModel(PreTrainedModel): config: ViTConfig base_model_prefix = "vit" main_input_name = "pixel_values" + input_modalities = "image" supports_gradient_checkpointing = True _no_split_modules = ["ViTEmbeddings", "ViTLayer"] _supports_sdpa = True diff --git a/src/transformers/models/vit_mae/modeling_vit_mae.py b/src/transformers/models/vit_mae/modeling_vit_mae.py index 98e1579a1ca1..db9ec96390e3 100755 --- a/src/transformers/models/vit_mae/modeling_vit_mae.py +++ b/src/transformers/models/vit_mae/modeling_vit_mae.py @@ -539,6 +539,7 @@ class ViTMAEPreTrainedModel(PreTrainedModel): config: ViTMAEConfig base_model_prefix = "vit" main_input_name = "pixel_values" + 
input_modalities = "image" supports_gradient_checkpointing = True _supports_sdpa = True _supports_flash_attn = True diff --git a/src/transformers/models/vit_msn/modeling_vit_msn.py b/src/transformers/models/vit_msn/modeling_vit_msn.py index c7a409578a1a..aa6af5bbbbd2 100644 --- a/src/transformers/models/vit_msn/modeling_vit_msn.py +++ b/src/transformers/models/vit_msn/modeling_vit_msn.py @@ -376,6 +376,7 @@ class ViTMSNPreTrainedModel(PreTrainedModel): config: ViTMSNConfig base_model_prefix = "vit" main_input_name = "pixel_values" + input_modalities = "image" supports_gradient_checkpointing = True _no_split_modules = ["ViTMSNAttention", "ViTMSNSdpaAttention"] _supports_sdpa = True diff --git a/src/transformers/models/vitdet/modeling_vitdet.py b/src/transformers/models/vitdet/modeling_vitdet.py index d5b38e0c48ae..6505c57f6802 100644 --- a/src/transformers/models/vitdet/modeling_vitdet.py +++ b/src/transformers/models/vitdet/modeling_vitdet.py @@ -589,6 +589,7 @@ class VitDetPreTrainedModel(PreTrainedModel): config: VitDetConfig base_model_prefix = "vitdet" main_input_name = "pixel_values" + input_modalities = "image" supports_gradient_checkpointing = True _no_split_modules = [] diff --git a/src/transformers/models/vitmatte/modeling_vitmatte.py b/src/transformers/models/vitmatte/modeling_vitmatte.py index 50cec9c153d3..8863056c5190 100644 --- a/src/transformers/models/vitmatte/modeling_vitmatte.py +++ b/src/transformers/models/vitmatte/modeling_vitmatte.py @@ -54,6 +54,7 @@ class ImageMattingOutput(ModelOutput): class VitMattePreTrainedModel(PreTrainedModel): config: VitMatteConfig main_input_name = "pixel_values" + input_modalities = "image" supports_gradient_checkpointing = True _no_split_modules = [] diff --git a/src/transformers/models/vitpose/modeling_vitpose.py b/src/transformers/models/vitpose/modeling_vitpose.py index 250341797aab..247e7b47ccec 100644 --- a/src/transformers/models/vitpose/modeling_vitpose.py +++ b/src/transformers/models/vitpose/modeling_vitpose.py @@ -63,6 +63,7 @@ class VitPosePreTrainedModel(PreTrainedModel): config: VitPoseConfig base_model_prefix = "vit" main_input_name = "pixel_values" + input_modalities = "image" supports_gradient_checkpointing = True def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]): diff --git a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py index cf786a7263a5..fd684ec672ed 100644 --- a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +++ b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py @@ -367,6 +367,7 @@ class VitPoseBackbonePreTrainedModel(PreTrainedModel): config: VitPoseBackboneConfig base_model_prefix = "vit" main_input_name = "pixel_values" + input_modalities = "image" supports_gradient_checkpointing = True _no_split_modules = ["VitPoseBackboneEmbeddings", "VitPoseBackboneLayer"] _supports_sdpa = True diff --git a/src/transformers/models/vits/modeling_vits.py b/src/transformers/models/vits/modeling_vits.py index bae8d44e0d13..e02ace525dd6 100644 --- a/src/transformers/models/vits/modeling_vits.py +++ b/src/transformers/models/vits/modeling_vits.py @@ -1199,6 +1199,7 @@ class VitsPreTrainedModel(PreTrainedModel): config: VitsConfig base_model_prefix = "vits" main_input_name = "input_ids" + output_modalities = "audio" supports_gradient_checkpointing = True def _init_weights(self, module: nn.Module): diff --git a/src/transformers/models/vivit/modeling_vivit.py 
b/src/transformers/models/vivit/modeling_vivit.py index 75d58393324a..7b77d5e79467 100755 --- a/src/transformers/models/vivit/modeling_vivit.py +++ b/src/transformers/models/vivit/modeling_vivit.py @@ -383,6 +383,7 @@ class VivitPreTrainedModel(PreTrainedModel): config: VivitConfig base_model_prefix = "vivit" main_input_name = "pixel_values" + input_modalities = "video" supports_gradient_checkpointing = True _no_split_modules = [] _supports_sdpa = True diff --git a/src/transformers/models/vjepa2/modeling_vjepa2.py b/src/transformers/models/vjepa2/modeling_vjepa2.py index 0b309610aec7..cd2ce529d547 100644 --- a/src/transformers/models/vjepa2/modeling_vjepa2.py +++ b/src/transformers/models/vjepa2/modeling_vjepa2.py @@ -933,6 +933,7 @@ class VJEPA2PreTrainedModel(PreTrainedModel): config: VJEPA2Config base_model_prefix = "vjepa2" main_input_name = "pixel_values_videos" + input_modalities = "video" supports_gradient_checkpointing = True _no_split_modules = [ "VJEPA2Layer", diff --git a/src/transformers/models/voxtral/modeling_voxtral.py b/src/transformers/models/voxtral/modeling_voxtral.py index ce790148d826..3d2ecf31b18b 100644 --- a/src/transformers/models/voxtral/modeling_voxtral.py +++ b/src/transformers/models/voxtral/modeling_voxtral.py @@ -219,6 +219,7 @@ def forward( class VoxtralPreTrainedModel(PreTrainedModel): config: VoxtralConfig base_model_prefix = "model" + input_modalities = ["audio", "text"] supports_gradient_checkpointing = True _no_split_modules = None _skip_keys_device_placement = "past_key_values" @@ -268,6 +269,7 @@ class VoxtralEncoder(VoxtralPreTrainedModel): # Ignore copy config: VoxtralEncoderConfig main_input_name = "input_features" + input_modalities = "audio" _no_split_modules = ["VoxtralEncoderLayer"] _can_record_outputs = { "attentions": VoxtralAttention, diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py index c3fcfd3b8fb5..44b3837039ff 100755 --- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py @@ -1016,6 +1016,7 @@ class Wav2Vec2PreTrainedModel(PreTrainedModel): config: Wav2Vec2Config base_model_prefix = "wav2vec2" main_input_name = "input_values" + input_modalities = "audio" supports_gradient_checkpointing = True _supports_flash_attn = True _supports_sdpa = True diff --git a/src/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py b/src/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py index 3448089c632b..c8593d38d131 100644 --- a/src/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +++ b/src/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py @@ -708,6 +708,7 @@ class Wav2Vec2BertPreTrainedModel(PreTrainedModel): config: Wav2Vec2BertConfig base_model_prefix = "wav2vec2_bert" main_input_name = "input_features" + input_modalities = "audio" supports_gradient_checkpointing = True def _init_weights(self, module): diff --git a/src/transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py b/src/transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py index 79f70da7cb84..3bce99771f55 100644 --- a/src/transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +++ b/src/transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py @@ -580,6 +580,7 @@ class Wav2Vec2BertPreTrainedModel(PreTrainedModel): config: Wav2Vec2BertConfig base_model_prefix = "wav2vec2_bert" main_input_name = "input_features" + input_modalities = "audio" supports_gradient_checkpointing = True def _init_weights(self, 
module): diff --git a/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py b/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py index 62357c8e0dcb..7d95f06b70c6 100644 --- a/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +++ b/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py @@ -848,6 +848,7 @@ class Wav2Vec2ConformerPreTrainedModel(PreTrainedModel): config: Wav2Vec2ConformerConfig base_model_prefix = "wav2vec2_conformer" main_input_name = "input_values" + input_modalities = "audio" supports_gradient_checkpointing = True def _init_weights(self, module): diff --git a/src/transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py b/src/transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py index bfa6c20737d8..7a0e757a8496 100644 --- a/src/transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +++ b/src/transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py @@ -547,6 +547,7 @@ class Wav2Vec2ConformerPreTrainedModel(PreTrainedModel): config: Wav2Vec2ConformerConfig base_model_prefix = "wav2vec2_conformer" main_input_name = "input_values" + input_modalities = "audio" supports_gradient_checkpointing = True def _init_weights(self, module): diff --git a/src/transformers/models/wavlm/modeling_wavlm.py b/src/transformers/models/wavlm/modeling_wavlm.py index a34b5d61d71a..274d83fa8914 100755 --- a/src/transformers/models/wavlm/modeling_wavlm.py +++ b/src/transformers/models/wavlm/modeling_wavlm.py @@ -597,6 +597,7 @@ class WavLMPreTrainedModel(PreTrainedModel): config: WavLMConfig base_model_prefix = "wavlm" main_input_name = "input_values" + input_modalities = "audio" supports_gradient_checkpointing = True _supports_flash_attn = False _supports_sdpa = False diff --git a/src/transformers/models/wavlm/modular_wavlm.py b/src/transformers/models/wavlm/modular_wavlm.py index 75e360b6a1d3..4020f0b3335b 100644 --- a/src/transformers/models/wavlm/modular_wavlm.py +++ b/src/transformers/models/wavlm/modular_wavlm.py @@ -507,6 +507,7 @@ class WavLMPreTrainedModel(PreTrainedModel, Wav2Vec2PreTrainedModel): config: WavLMConfig base_model_prefix = "wavlm" main_input_name = "input_values" + input_modalities = "audio" supports_gradient_checkpointing = True _supports_flash_attn = False _supports_sdpa = False diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py index acbe3fa77b17..1b67f16cce0e 100644 --- a/src/transformers/models/whisper/modeling_whisper.py +++ b/src/transformers/models/whisper/modeling_whisper.py @@ -531,6 +531,7 @@ class WhisperPreTrainedModel(PreTrainedModel): config: WhisperConfig base_model_prefix = "model" main_input_name = "input_features" + input_modalities = ["audio", "text"] supports_gradient_checkpointing = True _no_split_modules = ["WhisperEncoderLayer", "WhisperDecoderLayer"] _supports_flash_attn = True diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py index 3c906a85392e..98691dc9dcb4 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -500,6 +500,7 @@ def forward( class XCLIPPreTrainedModel(PreTrainedModel): config: XCLIPConfig base_model_prefix = "x_clip" + input_modalities = ["image", "text"] supports_gradient_checkpointing = True def _init_weights(self, module): @@ -710,6 +711,7 @@ def forward( class XCLIPTextModel(XCLIPPreTrainedModel): 
config: XCLIPTextConfig + input_modalities = "text" def __init__(self, config: XCLIPTextConfig): super().__init__(config) @@ -904,6 +906,7 @@ def forward( class XCLIPVisionModel(XCLIPPreTrainedModel): config: XCLIPVisionConfig main_input_name = "pixel_values" + input_modalities = "image" def __init__(self, config: XCLIPVisionConfig): super().__init__(config) diff --git a/src/transformers/models/xcodec/modeling_xcodec.py b/src/transformers/models/xcodec/modeling_xcodec.py index 4e1d376a3d08..774f9c74b8de 100644 --- a/src/transformers/models/xcodec/modeling_xcodec.py +++ b/src/transformers/models/xcodec/modeling_xcodec.py @@ -325,6 +325,7 @@ class XcodecPreTrainedModel(PreTrainedAudioTokenizerBase): config_class = XcodecConfig base_model_prefix = "xcodec" main_input_name = "input_values" + input_modalities = "audio" def _init_weights(self, module): """Initialize the weights""" diff --git a/src/transformers/models/yolos/modeling_yolos.py b/src/transformers/models/yolos/modeling_yolos.py index 1a06517f76e1..ec0091e4692c 100755 --- a/src/transformers/models/yolos/modeling_yolos.py +++ b/src/transformers/models/yolos/modeling_yolos.py @@ -453,6 +453,7 @@ class YolosPreTrainedModel(PreTrainedModel): config: YolosConfig base_model_prefix = "vit" main_input_name = "pixel_values" + input_modalities = "image" supports_gradient_checkpointing = True _no_split_modules = [] _supports_sdpa = True diff --git a/src/transformers/models/zoedepth/modeling_zoedepth.py b/src/transformers/models/zoedepth/modeling_zoedepth.py index a88a444bf928..eb2cc630c021 100644 --- a/src/transformers/models/zoedepth/modeling_zoedepth.py +++ b/src/transformers/models/zoedepth/modeling_zoedepth.py @@ -1208,6 +1208,7 @@ class ZoeDepthPreTrainedModel(PreTrainedModel): config: ZoeDepthConfig base_model_prefix = "zoedepth" main_input_name = "pixel_values" + input_modalities = "image" supports_gradient_checkpointing = True def _init_weights(self, module):
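Usage note (not part of the diff, added only as an illustration): because the modality fields introduced in the hunks above are plain class attributes, they can be inspected directly on the model classes without loading any checkpoint; the values below simply mirror the Whisper, ViT, and SpeechT5 hunks shown earlier. A minimal sketch:

from transformers import SpeechT5ForTextToSpeech, ViTModel, WhisperForConditionalGeneration

# Class attributes, so no weights need to be downloaded to read them;
# the expected values follow the corresponding hunks in this diff.
print(WhisperForConditionalGeneration.input_modalities)   # ["audio", "text"]
print(ViTModel.input_modalities)                          # "image"
print(SpeechT5ForTextToSpeech.input_modalities)           # "text"
print(SpeechT5ForTextToSpeech.output_modalities)          # "audio"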