@@ -517,6 +517,16 @@ def bytes_to_unicode():
517517# output in the same directory as the model if output_dir is None
518518dir_model = args .model_dir
519519
# Load the checkpoint's config.json (best effort) so later steps can use the
# model's real hyper-parameters instead of hard-coded ones.
# `model_config` is always a dict afterwards: it stays empty when the file is
# missing, unreadable, malformed, or does not hold a JSON object, so callers
# can simply test its truthiness before trusting it.
config_path = os.path.join(dir_model, "config.json")
model_config = {}
try:
    with open(config_path, "r", encoding="utf-8") as f:
        loaded_config = json.load(f)
except FileNotFoundError:
    print(f"Warning: config.json not found at {config_path}")
except (OSError, ValueError) as err:
    # ValueError covers json.JSONDecodeError and UnicodeDecodeError; a broken
    # config must not abort the conversion, it only disables config overrides.
    print(f"Warning: could not read config from {config_path}: {err}")
else:
    if isinstance(loaded_config, dict):
        model_config = loaded_config
        print(f"Loaded config from {config_path}")
    else:
        # Every later lookup uses dict.get(), so reject other JSON root types.
        print(f"Warning: {config_path} does not contain a JSON object, ignoring it")
520530# If minicpmv_projector is not specified but the default path exists, use the default path
521531if args .minicpmv_projector is None :
522532 default_projector_path = os .path .join (dir_model , "minicpmv.projector" )
@@ -555,37 +565,62 @@ def bytes_to_unicode():
555565# processor = CLIPProcessor.from_pretrained(dir_model)
556566
minicpmv_version = args.minicpmv_version

# Hard-coded (emb_dim, block_count) per --minicpmv_version. These are the
# baseline values; config.json entries, when present, take precedence below.
version_defaults = {
    1: (2304, 26),  # MiniCPM-V 2.0
    2: (4096, 27),  # MiniCPM-V 2.5
    3: (3584, 27),  # MiniCPM-V 2.6
    4: (3584, 27),  # MiniCPM-o 2.6
    5: (2560, 27),  # MiniCPM-V 4.0
}
emb_dim, block_count = version_defaults.get(minicpmv_version, (4096, 26))

# Baseline vision-tower configuration used when config.json is unavailable.
default_vision_config = {
    "hidden_size": 1152,
    "image_size": 980,
    "intermediate_size": 4304,
    "model_type": "idefics2",
    "num_attention_heads": 16,
    "num_hidden_layers": 27,
    "patch_size": 14,
}

if model_config:
    # Projector/resampler width comes from the main model's hidden_size. If the
    # key is absent, keep the value selected by --minicpmv_version rather than
    # an unrelated constant.
    emb_dim = model_config.get("hidden_size", emb_dim)

    # `"vision_config": null` is treated the same as a missing entry.
    vision_config_dict = model_config.get("vision_config") or {}

    # With a config.json present, an unspecified model_type is reported as
    # "siglip". NOTE(review): confirm this default matches the vision class
    # that is actually instantiated for such checkpoints.
    default_vision_config["model_type"] = "siglip"
    default_vision_config = {
        key: vision_config_dict.get(key, fallback)
        for key, fallback in default_vision_config.items()
    }

    # The number of vision blocks follows the vision tower's depth.
    block_count = default_vision_config["num_hidden_layers"]

    print(f"Using config values: emb_dim={emb_dim}, block_count={block_count}")
    print(f"Vision config: {default_vision_config}")
620+
586621vision_config = Idefics2VisionConfig (** default_vision_config )
587622model = Idefics2VisionTransformer (vision_config )
588- if minicpmv_version == 3 :
623+ if minicpmv_version == 3 or ( model_config and model_config . get ( "vision_config" , {}). get ( "model_type" ) == "siglip" ) :
589624 vision_config = SiglipVisionConfig (** default_vision_config )
590625 model = SiglipVisionTransformer (vision_config )
591626elif minicpmv_version == 4 :
@@ -644,16 +679,27 @@ def bytes_to_unicode():
644679 fout .add_description ("two-tower CLIP model" )
645680
if has_vision_encoder:
    # vision_model hparams - use the resolved configuration instead of literals.
    # An empty/missing model_config simply yields the fallback values.
    top_level_config = model_config or {}

    # image_size is read from the top level of config.json (fallback 448), not
    # from default_vision_config, whose own image_size entry is a separate value.
    vision_image_size = top_level_config.get("image_size", 448)
    vision_patch_size = default_vision_config.get("patch_size", 14)
    vision_hidden_size = default_vision_config.get("hidden_size", 1152)
    vision_intermediate_size = default_vision_config.get("intermediate_size", 4304)
    vision_attention_heads = default_vision_config.get("num_attention_heads", 16)

    fout.add_uint32("clip.vision.image_size", vision_image_size)
    fout.add_uint32("clip.vision.patch_size", vision_patch_size)
    fout.add_uint32(add_key_str(KEY_EMBEDDING_LENGTH, VISION), vision_hidden_size)
    fout.add_uint32(add_key_str(KEY_FEED_FORWARD_LENGTH, VISION), vision_intermediate_size)
    fout.add_uint32("clip.vision.projection_dim", 0)
    fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), vision_attention_heads)
    fout.add_float32(add_key_str(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
    fout.add_uint32(add_key_str(KEY_BLOCK_COUNT, VISION), block_count)

    # MiniCPM-V specific parameter. 0 is written when config.json provides no
    # "query_num". NOTE(review): confirm that consumers treat 0 as "unset".
    query_num = top_level_config.get("query_num", 0)
    fout.add_uint32("clip.minicpmv_query_num", query_num)
702+
657703 if processor is not None :
658704 image_mean = processor .image_processor .image_mean if args .image_mean is None or args .image_mean == default_image_mean else args .image_mean
659705 image_std = processor .image_processor .image_std if args .image_std is None or args .image_std == default_image_std else args .image_std
0 commit comments