@@ -330,8 +330,6 @@ struct clip_ctx {
     float image_std[3];
     bool use_gelu = false;
     bool use_silu = false;
-    bool use_glu_mlp = false;
-    bool use_rms_norm = false;
     int32_t ftype = 1;
 
     gguf_context_ptr ctx_gguf;
@@ -847,7 +845,6 @@ static ggml_cgraph * clip_image_build_graph_qwen2_5_vl(clip_ctx * ctx, const cli
         inp = ggml_add(ctx0, inp, model.patch_bias);
     }
     struct ggml_tensor * embeddings = inp;
-    struct ggml_tensor * pos_embed = nullptr;
     struct ggml_tensor * window_mask = nullptr;
     struct ggml_tensor * window_idx = nullptr;
     struct ggml_tensor * inv_window_idx = nullptr;
@@ -858,17 +855,10 @@ static ggml_cgraph * clip_image_build_graph_qwen2_5_vl(clip_ctx * ctx, const cli
 
     // pre-layernorm
     if (model.pre_ln_w) {
-        if (ctx->use_rms_norm) {
-            embeddings = ggml_rms_norm(ctx0, embeddings, eps);
-            ggml_set_name(embeddings, "pre_ln");
-
-            embeddings = ggml_mul(ctx0, embeddings, model.pre_ln_w);
-        } else {
-            embeddings = ggml_norm(ctx0, embeddings, eps);
-            ggml_set_name(embeddings, "pre_ln");
+        embeddings = ggml_rms_norm(ctx0, embeddings, eps);
+        ggml_set_name(embeddings, "pre_ln");
 
-            embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b);
-        }
+        embeddings = ggml_mul(ctx0, embeddings, model.pre_ln_w);
     }
 
     std::vector<struct ggml_tensor *> embedding_stack;
@@ -991,17 +981,10 @@ static ggml_cgraph * clip_image_build_graph_qwen2_5_vl(clip_ctx * ctx, const cli
 
     // post-layernorm
     if (model.post_ln_w) {
-        if (ctx->use_rms_norm) {
-            embeddings = ggml_rms_norm(ctx0, embeddings, eps);
-            ggml_set_name(embeddings, "post_ln");
-
-            embeddings = ggml_mul(ctx0, embeddings, model.post_ln_w);
-        } else {
-            embeddings = ggml_norm(ctx0, embeddings, eps);
-            ggml_set_name(embeddings, "post_ln");
+        embeddings = ggml_rms_norm(ctx0, embeddings, eps);
+        ggml_set_name(embeddings, "post_ln");
 
-            embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
-        }
+        embeddings = ggml_mul(ctx0, embeddings, model.post_ln_w);
     }
 
     // final layer is a vision feature layer
@@ -1086,7 +1069,6 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
     const int n_head = hparams.n_head;
     const int d_head = hidden_size / n_head;
     const float eps = hparams.eps;
-    const bool use_window_attn = hparams.full_attn_layers.size() > 0;
     int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
 
     const int batch_size = imgs.entries.size();
@@ -1118,7 +1100,6 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
 
         auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
         inp = ggml_add(ctx0, inp, inp_1);
-
         inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 2, 0, 3)); // [w, h, c, b] -> [c, w, h, b]
         inp = ggml_reshape_4d(
             ctx0, inp,
@@ -1140,11 +1121,8 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
         // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
         inp = ggml_add(ctx0, inp, model.patch_bias);
     }
-    struct ggml_tensor * embeddings = inp;
-    struct ggml_tensor * pos_embed = nullptr;
-    struct ggml_tensor * window_mask = nullptr;
-    struct ggml_tensor * window_idx = nullptr;
-    struct ggml_tensor * inv_window_idx = nullptr;
+    struct ggml_tensor * embeddings = inp;
+    struct ggml_tensor * pos_embed = nullptr;
 
     if (ctx->has_llava_projector) {
         // concat class_embeddings and patch_embeddings
@@ -1186,40 +1164,16 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
 
     // pre-layernorm
     if (model.pre_ln_w) {
-        if (ctx->use_rms_norm) {
-            embeddings = ggml_rms_norm(ctx0, embeddings, eps);
-            ggml_set_name(embeddings, "pre_ln");
-
-            embeddings = ggml_mul(ctx0, embeddings, model.pre_ln_w);
-        } else {
-            embeddings = ggml_norm(ctx0, embeddings, eps);
-            ggml_set_name(embeddings, "pre_ln");
+        embeddings = ggml_norm(ctx0, embeddings, eps);
+        ggml_set_name(embeddings, "pre_ln");
 
-            embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b);
-        }
+        embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b);
     }
 
     std::vector<struct ggml_tensor *> embedding_stack;
     const auto & vision_feature_layer = hparams.vision_feature_layer;
 
     // loop over layers
-
-    if (use_window_attn) {
-        inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions / 4);
-        ggml_set_name(inv_window_idx, "inv_window_idx");
-        ggml_set_input(inv_window_idx);
-        // mask for window attention
-        window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, num_positions, num_positions);
-        ggml_set_name(window_mask, "window_mask");
-        ggml_set_input(window_mask);
-
-        // embeddings shape: [hidden_size, patches_w * patches_h, batch_size]
-        GGML_ASSERT(batch_size == 1);
-        embeddings = ggml_reshape_2d(ctx0, embeddings, hidden_size * 4, patches_w * patches_h * batch_size / 4);
-        embeddings = ggml_get_rows(ctx0, embeddings, inv_window_idx);
-        embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, patches_w * patches_h, batch_size);
-    }
-
     for (int il = 0; il < ctx->max_feature_layer; il++) {
         struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states
 
@@ -1232,12 +1186,9 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
         // const size_t nb_q_w = model.layers[il].q_w->nb[0];
 
         // layernorm1
-        if (ctx->use_rms_norm) {
-            cur = ggml_rms_norm(ctx0, cur, eps);
-            cur = ggml_mul(ctx0, cur, model.layers[il].ln_1_w);
-        }
-        else {
+        {
             cur = ggml_norm(ctx0, cur, eps);
+
            cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_1_w),
                           model.layers[il].ln_1_b);
         }
@@ -1277,14 +1228,7 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
         V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size);
 
         struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-        const bool inlist = std::find(hparams.full_attn_layers.begin(), hparams.full_attn_layers.end(), il) != hparams.full_attn_layers.end();
-        const bool full_attn = use_window_attn ? inlist : true;
-        if (full_attn) {
-            KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f);
-        } else {
-            KQ = ggml_soft_max_ext(ctx0, KQ, window_mask, 1.0f / sqrtf((float)d_head), 0.0f);
-        }
-
+        KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f);
         struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
         KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_positions, n_head, batch_size);
         KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
@@ -1301,50 +1245,25 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
         embeddings = cur; // embeddings = residual, cur = hidden_states
 
         // layernorm2
-        if (ctx->use_rms_norm) {
-            cur = ggml_rms_norm(ctx0, cur, eps);
-            cur = ggml_mul(ctx0, cur, model.layers[il].ln_2_w);
-        } else {
+        {
             cur = ggml_norm(ctx0, cur, eps);
+
             cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_2_w), model.layers[il].ln_2_b);
         }
 
-        // mlp
-        if (ctx->use_glu_mlp) {
-            // ffn_up
-            auto cur_up = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur);
-            cur_up = ggml_add(ctx0, cur_up, model.layers[il].ff_o_b);
-
-            auto cur_gate = ggml_mul_mat(ctx0, model.layers[il].ff_g_w, cur);
-            cur_gate = ggml_add(ctx0, cur_gate, model.layers[il].ff_g_b);
-            if (ctx->use_gelu) {
-                cur_gate = ggml_gelu_inplace(ctx0, cur_gate);
-            } else if (ctx->use_silu) {
-                cur_gate = ggml_silu_inplace(ctx0, cur_gate);
-            } else {
-                cur_gate = ggml_gelu_quick_inplace(ctx0, cur_gate);
-            }
-            cur = ggml_mul(ctx0, cur_gate, cur_up);
+        cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur);
+        cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b);
 
-            // ffn_down
-            cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur);
-            cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b);
+        if (ctx->use_gelu) {
+            cur = ggml_gelu_inplace(ctx0, cur);
+        } else if (ctx->use_silu) {
+            cur = ggml_silu_inplace(ctx0, cur);
+        } else {
+            cur = ggml_gelu_quick_inplace(ctx0, cur);
         }
-        else {
-            cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur);
-            cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b);
-
-            if (ctx->use_gelu) {
-                cur = ggml_gelu_inplace(ctx0, cur);
-            } else if (ctx->use_silu) {
-                cur = ggml_silu_inplace(ctx0, cur);
-            } else {
-                cur = ggml_gelu_quick_inplace(ctx0, cur);
-            }
 
-            cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur);
-            cur = ggml_add(ctx0, cur, model.layers[il].ff_o_b);
-        }
+        cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur);
+        cur = ggml_add(ctx0, cur, model.layers[il].ff_o_b);
 
         // residual 2
         cur = ggml_add(ctx0, embeddings, cur);
@@ -1354,17 +1273,10 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
 
     // post-layernorm
     if (model.post_ln_w) {
-        if (ctx->use_rms_norm) {
-            embeddings = ggml_rms_norm(ctx0, embeddings, eps);
-            ggml_set_name(embeddings, "post_ln");
-
-            embeddings = ggml_mul(ctx0, embeddings, model.post_ln_w);
-        } else {
-            embeddings = ggml_norm(ctx0, embeddings, eps);
-            ggml_set_name(embeddings, "post_ln");
+        embeddings = ggml_norm(ctx0, embeddings, eps);
+        ggml_set_name(embeddings, "post_ln");
 
-            embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
-        }
+        embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
     }
 
     // final layer is a vision feature layer
@@ -1678,18 +1590,6 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
         embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
     }
 
-    if (use_window_attn) {
-        window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions / 4);
-        ggml_set_name(window_idx, "window_idx");
-        ggml_set_input(window_idx);
-
-        // embeddings shape: [hidden_size, patches_w * patches_h, batch_size]
-        GGML_ASSERT(batch_size == 1);
-        embeddings = ggml_reshape_2d(ctx0, embeddings, hparams.projection_dim, patches_w * patches_h / 4);
-        embeddings = ggml_get_rows(ctx0, embeddings, window_idx);
-        embeddings = ggml_reshape_3d(ctx0, embeddings, hparams.projection_dim, patches_w * patches_h / 4, batch_size);
-    }
-
     // build the graph
     ggml_build_forward_expand(gf, embeddings);
 
@@ -1810,8 +1710,6 @@ struct clip_model_loader {
 
         get_bool(KEY_USE_GELU, ctx_clip.use_gelu, false);
         get_bool(KEY_USE_SILU, ctx_clip.use_silu, false);
-        get_bool(KEY_USE_GLU_MLP, ctx_clip.use_glu_mlp, false);
-        get_bool(KEY_USE_RMS_NORM, ctx_clip.use_rms_norm, false);
 
         get_u32(string_format(KEY_N_EMBD, "vision"), hparams.hidden_size);
         get_u32(string_format(KEY_N_HEAD, "vision"), hparams.n_head);