Skip to content

Commit 919318e

Browse files
committed
logits matched, but it still perceives the image incorrectly
1 parent 7d9d4e3 commit 919318e

File tree

1 file changed

+11
-5
lines changed

1 file changed

+11
-5
lines changed

tools/mtmd/clip.cpp

Lines changed: 11 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -445,7 +445,7 @@ struct clip_graph {
445445
};
446446
ctx0_ptr.reset(ggml_init(params));
447447
ctx0 = ctx0_ptr.get();
448-
gf = ggml_new_graph(ctx0);
448+
gf = ggml_new_graph_custom(ctx0, ctx->max_nodes, false);
449449
}
450450

451451
ggml_cgraph * build_siglip() {
@@ -965,14 +965,12 @@ struct clip_graph {
965965
inp = ggml_im2col(ctx0, kernel, inp, patch_size, patch_size, 0, 0, 1, 1, true, inp->type);
966966
inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp);
967967
inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches);
968+
cb(inp, "patch_conv", -1);
968969
}
969970

970971
// add CLS token
971972
inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
972973

973-
// add position embeddings
974-
inp = ggml_add(ctx0, inp, model.position_embeddings);
975-
976974
// build ViT with 2D position embeddings
977975
auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
978976
return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false);
@@ -1013,6 +1011,7 @@ struct clip_graph {
10131011
cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, cur),
10141012
n_embd * scale_factor * scale_factor,
10151013
n_patches / scale_factor / scale_factor);
1014+
cb(cur, "pixel_shuffle", -1);
10161015
}
10171016

10181017
// based on Llama4VisionMLP2 (always uses GELU activation, no bias)
@@ -1021,8 +1020,13 @@ struct clip_graph {
10211020
cur = ggml_gelu(ctx0, cur);
10221021
cur = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, cur);
10231022
cur = ggml_gelu(ctx0, cur);
1023+
cb(cur, "adapter_mlp", -1);
10241024
}
10251025

1026+
// Llama4MultiModalProjector
1027+
cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);
1028+
cb(cur, "projected", -1);
1029+
10261030
// build the graph
10271031
ggml_build_forward_expand(gf, cur);
10281032

@@ -1408,11 +1412,13 @@ struct clip_graph {
14081412
// utility functions
14091413
//
14101414

1411-
void cb(ggml_tensor * cur, const char * name, int il) const {
1415+
void cb(ggml_tensor * cur0, const char * name, int il) const {
14121416
if (ctx->debug_graph) {
1417+
ggml_tensor * cur = ggml_cpy(ctx0, cur0, ggml_dup_tensor(ctx0, cur0));
14131418
std::string cur_name = il >= 0 ? std::string(name) + "_" + std::to_string(il) : name;
14141419
ggml_set_name(cur, cur_name.c_str());
14151420
ggml_set_output(cur);
1421+
ggml_build_forward_expand(gf, cur);
14161422
ctx->debug_print_tensors.push_back(cur);
14171423
}
14181424
}

0 commit comments

Comments
 (0)