 import os
 from typing import Union

-from ...configuration_utils import PretrainedConfig
+from ...configuration_utils import PretrainedConfig, layer_type_validation
+from ...modeling_rope_utils import rope_config_validation
 from transformers.models.siglip.configuration_siglip import SiglipVisionConfig
 from transformers import Qwen2Config, WhisperConfig
 from ...utils import logging

 logger = logging.get_logger(__name__)
+
+
 class MiniCPMVSliceConfig(PretrainedConfig):
     model_type = "minicpmv"

@@ -39,9 +42,8 @@ def __init__(

     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
-        cls._set_token_in_kwargs(kwargs)
-
-        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
+        config_dict, kwargs = cls.get_config_dict(
+            pretrained_model_name_or_path, **kwargs)

         if config_dict.get("model_type") == "minicpmv":
             config_dict = config_dict["slice_config"]
@@ -84,10 +86,6 @@ def __init__(
         attn_implementation: str = "sdpa",
         use_mlp: bool = True,
         aug_loss_weight: bool = True,
-        do_sample: bool = True,
-        top_p: float = 0.7,
-        top_k: int = 20,
-        repetition_penalty: float = 1.0,
         **kwargs,
     ):
         super().__init__(**kwargs)
@@ -116,13 +114,9 @@ def __init__(
         self.attn_implementation = attn_implementation
         self.use_mlp = use_mlp
         self.aug_loss_weight = aug_loss_weight
-        self.do_sample = do_sample
-        self.top_p = top_p
-        self.top_k = top_k
-        self.repetition_penalty = repetition_penalty


-class MiniCPM_o_2_6Config(Qwen2Config):
+class MiniCPM_o_2_6Config(PretrainedConfig):
     model_type = "minicpmo"
     keys_to_ignore_at_inference = ["past_key_values"]

@@ -136,6 +130,21 @@ class MiniCPM_o_2_6Config(Qwen2Config):
         "patch_size": 14,
     }

+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+
     def __init__(
         self,
         use_cache=True,
@@ -155,6 +164,24 @@ def __init__(
         init_vision=True,
         init_audio=True,
         init_tts=True,
+        vocab_size=151936,
+        hidden_size=4096,
+        intermediate_size=22016,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=32,
+        hidden_act="silu",
+        max_position_embeddings=32768,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        use_sliding_window=False,
+        sliding_window=4096,
+        max_window_layers=28,
+        layer_types=None,
+        attention_dropout=0.0,
         **kwargs,
     ):
         self.use_cache = use_cache
@@ -179,7 +206,8 @@ def __init__(

         # same as HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit add tgt_sizes
         if vision_config is None:
-            self.vision_config = SiglipVisionConfig(**self.default_vision_config)
+            self.vision_config = SiglipVisionConfig(
+                **self.default_vision_config)
             logger.info("vision_config is None, using default vision config")
         elif isinstance(vision_config, dict):
             self.vision_config = SiglipVisionConfig(**vision_config)
@@ -203,7 +231,47 @@ def __init__(

         self.patch_size = self.vision_config.patch_size

-        super().__init__(**kwargs)
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.use_sliding_window = use_sliding_window
+        self.sliding_window = sliding_window if self.use_sliding_window else None
+        self.max_window_layers = max_window_layers
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_dropout = attention_dropout
+        # Validate the correctness of rotary position embeddings parameters
+        # BC: if there is a 'type' field, move it to 'rope_type'.
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        rope_config_validation(self)
+
+        self.layer_types = layer_types
+        if self.layer_types is None:
+            self.layer_types = [
+                "sliding_attention"
+                if self.sliding_window is not None and i >= self.max_window_layers
+                else "full_attention"
+                for i in range(self.num_hidden_layers)
+            ]
+        layer_type_validation(self.layer_types)
+
+        super().__init__(
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )


 __all__ = ["MiniCPM_o_2_6Config"]
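
For reference, the snippet below is a minimal standalone sketch (not part of the patch) of what the new MiniCPM_o_2_6Config.__init__ logic above computes: the default layer_types derived from use_sliding_window/sliding_window/max_window_layers, and the back-compat move of rope_scaling["type"] into "rope_type" before rope_config_validation runs. The rope_scaling dict shown is a hypothetical example value, and use_sliding_window is flipped on here for illustration (the config default is False); the other values mirror the defaults added in this diff.

# Standalone sketch mirroring the new __init__ logic; assumed values noted inline.
num_hidden_layers = 32
max_window_layers = 28
use_sliding_window = True                      # config default is False
sliding_window = 4096 if use_sliding_window else None

# Default layer_types: full attention below max_window_layers, sliding above it.
layer_types = [
    "sliding_attention"
    if sliding_window is not None and i >= max_window_layers
    else "full_attention"
    for i in range(num_hidden_layers)
]
assert layer_types[:28] == ["full_attention"] * 28
assert layer_types[28:] == ["sliding_attention"] * 4

# Back-compat shim: an old-style rope_scaling dict (hypothetical example value)
# gets its "type" key mirrored into "rope_type" before validation.
rope_scaling = {"type": "linear", "factor": 2.0}
if rope_scaling is not None and "type" in rope_scaling:
    rope_scaling["rope_type"] = rope_scaling["type"]
assert rope_scaling["rope_type"] == "linear"

In the real __init__, rope_config_validation(self) and layer_type_validation(self.layer_types) then check the populated config, as shown in the hunk above.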