60
60
{ MODEL_NAME_TO_TYPE = }
61
61
"""
62
62
63
- __all__ = ["get_model_type" ]
63
+ __all__ = ["get_model_type" , "is_multimodal_model" ]
64
64
65
65
66
66
def get_model_type (model ):
@@ -69,3 +69,43 @@ def get_model_type(model):
69
69
if k .lower () in type (model ).__name__ .lower ():
70
70
return v
71
71
return None
72
+
73
+
74
def is_multimodal_model(model):
    """Return True if *model* looks like a Vision-Language / multimodal model.

    Several multimodal architectures are recognized by probing, in order:

    - a standard vision configuration (``config.vision_config``, e.g. Qwen2.5-VL)
    - a ``language_model`` attribute on the model itself (e.g. LLaVA-style wrappers)
    - the Phi-4 multimodal model type (``config.model_type == "phi4mm"``)
    - a vision LoRA configuration (``config.vision_lora``)
    - audio processing capabilities (``config.audio_processor``)
    - an image embedding layer (``config.embd_layer.image_embd_layer``)

    Args:
        model: The HuggingFace model instance to check.

    Returns:
        bool: True if any multimodal indicator is present, False otherwise.

    Examples:
        >>> model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
        >>> is_multimodal_model(model)
        True

        >>> model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-4-multimodal-instruct")
        >>> is_multimodal_model(model)
        True
    """
    cfg = model.config

    # Each probe is a zero-arg callable so that evaluation stays lazy and
    # stops at the first positive match, exactly like the original `or` chain.
    probes = (
        lambda: hasattr(cfg, "vision_config"),               # standard vision config (e.g., Qwen2.5-VL)
        lambda: hasattr(model, "language_model"),            # language-model attribute (e.g., LLaVA)
        lambda: getattr(cfg, "model_type", "") == "phi4mm",  # Phi-4 multimodal
        lambda: hasattr(cfg, "vision_lora"),                 # vision LoRA configurations
        lambda: hasattr(cfg, "audio_processor"),             # audio processing capabilities
        lambda: hasattr(cfg, "embd_layer")
        and hasattr(cfg.embd_layer, "image_embd_layer"),     # image embedding layers
    )
    return any(probe() for probe in probes)
0 commit comments