@@ -583,13 +583,14 @@ def get_model_tokenizer_qwen2_vl(*args, **kwargs):
     kwargs['automodel_class'] = kwargs['automodel_class'] or Qwen2VLForConditionalGeneration
     model, tokenizer = get_model_tokenizer_multimodal(*args, **kwargs)
     if model is not None:
-        if hasattr(model.model, 'embed_tokens'):
-            embed_tokens = model.model.embed_tokens
+        base_model = model.model if 'AWQ' in model.__class__.__name__ else model
+        if hasattr(base_model.model, 'embed_tokens'):
+            embed_tokens = base_model.model.embed_tokens
         else:
-            embed_tokens = model.model.language_model.embed_tokens
+            embed_tokens = base_model.model.language_model.embed_tokens
         patch_output_clone(embed_tokens)
         patch_output_to_input_device(embed_tokens)
-        patch_get_input_embeddings(model.visual, 'patch_embed')
+        patch_get_input_embeddings(base_model.visual, 'patch_embed')

     from qwen_vl_utils import vision_process
     patch_qwen_vl_utils(vision_process)
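Both hunks apply the same guard: when the checkpoint was loaded through an AWQ wrapper class (detected by 'AWQ' appearing in the class name), the actual transformers module tree sits one level down in the wrapper's .model attribute, so the embedding and vision patches must target that inner model. A minimal sketch of the pattern, with unwrap_awq as a hypothetical helper name:

    def unwrap_awq(model):
        # AWQ wrapper classes keep the underlying transformers model in
        # their `.model` attribute; plain checkpoints are already the
        # base model, so they pass through unchanged.
        return model.model if 'AWQ' in model.__class__.__name__ else model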
@@ -712,10 +713,11 @@ def get_model_tokenizer_qwen2_5_omni(model_dir, *args, **kwargs):
     kwargs['model_config'].enable_audio_output = get_env_args('ENABLE_AUDIO_OUTPUT', bool, True)
     model, _ = get_model_tokenizer_with_flash_attn(model_dir, *args, **kwargs)
     if model:
-        use_submodel_func(model, 'thinker')
-        model.config.keys_to_ignore_at_inference += ['hidden_states', 'attention_mask']
-        model.config.talker_config.pad_token_id = None
-        patch_get_input_embeddings(model.thinker.visual, 'patch_embed')
+        base_model = model.model if 'AWQ' in model.__class__.__name__ else model
+        use_submodel_func(base_model, 'thinker')
+        base_model.config.keys_to_ignore_at_inference += ['hidden_states', 'attention_mask']
+        base_model.config.talker_config.pad_token_id = None
+        patch_get_input_embeddings(base_model.thinker.visual, 'patch_embed')
     return model, processor
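The Omni hunk extends the guard to config access as well: keys_to_ignore_at_inference and talker_config are reached through the unwrapped model, while the function still returns the (possibly wrapped) model so callers keep the AWQ object's quantized forward path. A quick sanity check of the guard, using a stand-in wrapper class since the real autoawq class names depend on the checkpoint:

    class Qwen2VLAWQStub:
        """Stand-in for an AWQ wrapper; only the name and `.model` matter."""
        def __init__(self, inner):
            self.model = inner

    inner = object()
    assert unwrap_awq(Qwen2VLAWQStub(inner)) is inner  # 'AWQ' in name: unwrapped
    assert unwrap_awq(inner) is inner                  # plain model: passed through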