@@ -517,6 +517,16 @@ def bytes_to_unicode():
517
517
# output in the same directory as the model if output_dir is None
518
518
dir_model = args .model_dir
519
519
520
# Load the checkpoint's config.json, when one sits next to the model, so the
# later steps can read real hyperparameters from it. model_config stays an
# empty dict when the file is missing, which the later code treats as
# "no config available".
config_path = os.path.join(dir_model, "config.json")
model_config = {}
if not os.path.isfile(config_path):
    print(f"Warning: config.json not found at { config_path } ")
else:
    with open(config_path, "r", encoding="utf-8") as config_file:
        model_config = json.load(config_file)
    print(f"Loaded config from { config_path } ")
529
+
520
530
# If minicpmv_projector is not specified but the default path exists, use the default path
521
531
if args .minicpmv_projector is None :
522
532
default_projector_path = os .path .join (dir_model , "minicpmv.projector" )
@@ -555,37 +565,62 @@ def bytes_to_unicode():
555
565
# processor = CLIPProcessor.from_pretrained(dir_model)
556
566
557
567
minicpmv_version = args .minicpmv_version
558
# Choose emb_dim, block_count and the vision-tower settings.
# When config.json was loaded, its values win; otherwise the values are
# looked up from a hard-coded table keyed by minicpmv_version.
if not model_config:
    # No config.json content available: per-version (emb_dim, block_count).
    fallback_dims = {
        1: (2304, 26),  # MiniCPM-V 2.0
        2: (4096, 27),  # MiniCPM-V 2.5
        3: (3584, 27),  # MiniCPM-V 2.6
        4: (3584, 27),  # MiniCPM-o 2.6
        5: (2560, 27),  # MiniCPM-V 4.0
    }
    # Any other version value keeps the generic defaults.
    emb_dim, block_count = fallback_dims.get(minicpmv_version, (4096, 26))

    default_vision_config = {
        "hidden_size": 1152,
        "image_size": 980,
        "intermediate_size": 4304,
        "model_type": "idefics2",
        "num_attention_heads": 16,
        "num_hidden_layers": 27,
        "patch_size": 14,
    }
else:
    # emb_dim comes from the top-level hidden_size of config.json.
    emb_dim = model_config.get("hidden_size", 1536)

    # Vision-tower settings come from the nested "vision_config" section;
    # every key has a default for configs that omit it.
    vision_config_dict = model_config.get("vision_config", {})
    vision_defaults = (
        ("hidden_size", 1152),
        ("image_size", 980),
        ("intermediate_size", 4304),
        ("model_type", "siglip"),
        ("num_attention_heads", 16),
        ("num_hidden_layers", 27),
        ("patch_size", 14),
    )
    default_vision_config = {
        key: vision_config_dict.get(key, fallback)
        for key, fallback in vision_defaults
    }

    # block_count follows the vision tower's layer count.
    block_count = vision_config_dict.get("num_hidden_layers", 27)

    print(f"Using config values: emb_dim={ emb_dim } , block_count={ block_count } ")
    print(f"Vision config: { default_vision_config } ")
620
+
586
621
vision_config = Idefics2VisionConfig (** default_vision_config )
587
622
model = Idefics2VisionTransformer (vision_config )
588
- if minicpmv_version == 3 :
623
+ if minicpmv_version == 3 or ( model_config and model_config . get ( "vision_config" , {}). get ( "model_type" ) == "siglip" ) :
589
624
vision_config = SiglipVisionConfig (** default_vision_config )
590
625
model = SiglipVisionTransformer (vision_config )
591
626
elif minicpmv_version == 4 :
@@ -644,16 +679,27 @@ def bytes_to_unicode():
644
679
fout .add_description ("two-tower CLIP model" )
645
680
646
681
if has_vision_encoder:
    # Vision-tower hyperparameters written to the output file. Values are
    # taken from the loaded configuration where available, with the former
    # hard-coded numbers kept as defaults.
    # NOTE(review): the end of this branch is not visible in this excerpt.

    # image_size is read from the top level of config.json, not from
    # default_vision_config.
    vision_image_size = 448
    if model_config:
        vision_image_size = model_config.get("image_size", 448)
    fout.add_uint32("clip.vision.image_size", vision_image_size)

    vision_patch_size = default_vision_config.get("patch_size", 14)
    fout.add_uint32("clip.vision.patch_size", vision_patch_size)

    vision_hidden_size = default_vision_config.get("hidden_size", 1152)
    fout.add_uint32(add_key_str(KEY_EMBEDDING_LENGTH, VISION), vision_hidden_size)

    vision_intermediate_size = default_vision_config.get("intermediate_size", 4304)
    fout.add_uint32(add_key_str(KEY_FEED_FORWARD_LENGTH, VISION), vision_intermediate_size)

    fout.add_uint32("clip.vision.projection_dim", 0)

    vision_attention_heads = default_vision_config.get("num_attention_heads", 16)
    fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), vision_attention_heads)

    fout.add_float32(add_key_str(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
    fout.add_uint32(add_key_str(KEY_BLOCK_COUNT, VISION), block_count)

    # MiniCPM-V specific parameters; 0 is used when config.json gave nothing.
    if model_config:
        query_num = model_config.get("query_num", 0)
        resampler_emb_dim = model_config.get("hidden_size", 0)
    else:
        query_num = 0
        resampler_emb_dim = 0
    # NOTE(review): resampler_emb_dim is not read anywhere in the visible
    # code — confirm whether it should be written to the file or dropped.
    fout.add_uint32("clip.minicpmv_query_num", query_num)
+
657
703
if processor is not None :
658
704
image_mean = processor .image_processor .image_mean if args .image_mean is None or args .image_mean == default_image_mean else args .image_mean
659
705
image_std = processor .image_processor .image_std if args .image_std is None or args .image_std == default_image_std else args .image_std
0 commit comments