@@ -1113,7 +1113,7 @@ struct clip_graph {
11131113 }
11141114 }
11151115
1116- // 2-layer MLP projector: mm.0 -> GELU -> mm.2
1116+ // 2-layer MLP projector: LayerNorm (mlp1.0) -> mm.0 -> GELU -> mm.2
11171117 ggml_tensor * embeddings = cur;
11181118
11191119 GGML_ASSERT (model.mm_0_w != nullptr );
@@ -1123,6 +1123,17 @@ struct clip_graph {
11231123 embeddings = ggml_reshape_2d (ctx0, embeddings, embeddings->ne [0 ], embeddings->ne [1 ]);
11241124 embeddings = ggml_cont_2d (ctx0, embeddings, embeddings->ne [0 ], embeddings->ne [1 ]);
11251125
1126+ // Apply projector input LayerNorm (mlp1.0) only when its weight or bias tensor is present; eps is hard-coded to 1e-5
1127+ if (model.mm_input_norm_w || model.mm_input_norm_b ) {
1128+ embeddings = build_norm (
1129+ embeddings,
1130+ model.mm_input_norm_w ,
1131+ model.mm_input_norm_b ,
1132+ NORM_TYPE_NORMAL,
1133+ 1e-5f ,
1134+ /* il=*/ -1 );
1135+ }
1136+
11261137 // Use shared FFN helper: Linear(mm.0) -> GELU -> Linear(mm.2)
11271138 embeddings = build_ffn (
11281139 embeddings,
@@ -3121,7 +3132,10 @@ struct clip_model_loader {
31213132 } break ;
31223133 case PROJECTOR_TYPE_EAGLE2VL:
31233134 {
3124- // 2-layer MLP projector using mm.0 and mm.2 (normalized at conversion time)
3135+ // projector input LayerNorm (mlp1.0.{weight,bias})
3136+ model.mm_input_norm_w = get_tensor (" mm_input_norm_w" , false );
3137+ model.mm_input_norm_b = get_tensor (" mm_input_norm_b" , false );
3138+ // 2-layer MLP projector using mm.0 and mm.2
31253139 model.mm_0_w = get_tensor (string_format (TN_LLAVA_PROJ, 0 , " weight" ));
31263140 model.mm_0_b = get_tensor (string_format (TN_LLAVA_PROJ, 0 , " bias" ), false );
31273141 model.mm_2_w = get_tensor (string_format (TN_LLAVA_PROJ, 2 , " weight" ));
0 commit comments