Skip to content

Commit b4f660f

Browse files
apply projector LayerNorm at runtime
1 parent f4af853 commit b4f660f

File tree

1 file changed

+16
-2
lines changed

1 file changed

+16
-2
lines changed

tools/mtmd/clip.cpp

Lines changed: 16 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1113,7 +1113,7 @@ struct clip_graph {
1113 1113
}
1114 1114
}
1115 1115

1116-
// 2-layer MLP projector: mm.0 -> GELU -> mm.2
1116+
// 2-layer MLP projector: LayerNorm (mlp1.0) -> mm.0 -> GELU -> mm.2
1117 1117
ggml_tensor * embeddings = cur;
1118 1118

1119 1119
GGML_ASSERT(model.mm_0_w != nullptr);
@@ -1123,6 +1123,17 @@ struct clip_graph {
1123 1123
embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
1124 1124
embeddings = ggml_cont_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
1125 1125

1126+
// Apply projector input LayerNorm (mlp1.0) with default eps = 1e-5
1127+
if (model.mm_input_norm_w || model.mm_input_norm_b) {
1128+
embeddings = build_norm(
1129+
embeddings,
1130+
model.mm_input_norm_w,
1131+
model.mm_input_norm_b,
1132+
NORM_TYPE_NORMAL,
1133+
1e-5f,
1134+
/*il=*/-1);
1135+
}
1136+
1126 1137
// Use shared FFN helper: Linear(mm.0) -> GELU -> Linear(mm.2)
1127 1138
embeddings = build_ffn(
1128 1139
embeddings,
@@ -3121,7 +3132,10 @@ struct clip_model_loader {
3121 3132
} break;
3122 3133
case PROJECTOR_TYPE_EAGLE2VL:
3123 3134
{
3124-
// 2-layer MLP projector using mm.0 and mm.2 (normalized at conversion time)
3135+
// projector input LayerNorm (mlp1.0.{weight,bias})
3136+
model.mm_input_norm_w = get_tensor("mm_input_norm_w", false);
3137+
model.mm_input_norm_b = get_tensor("mm_input_norm_b", false);
3138+
// 2-layer MLP projector using mm.0 and mm.2
3125 3139
model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
3126 3140
model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"), false);
3127 3141
model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));

0 commit comments

Comments (0)