
Commit 186d7a8

add comment
1 parent e52481b commit 186d7a8

File tree: 1 file changed, +5 -2 lines
tools/mtmd/clip.cpp

Lines changed: 5 additions & 2 deletions
@@ -219,7 +219,7 @@ struct clip_layer {
     ggml_tensor * ff_down_w = nullptr;
     ggml_tensor * ff_down_b = nullptr;

-    // layernorm 2 (post-attn norm / pre-ffn norm)
+    // layernorm 2
     ggml_tensor * ln_2_w = nullptr;
     ggml_tensor * ln_2_b = nullptr;

@@ -971,6 +971,9 @@ struct clip_graph {

         // build ViT with 2D position embeddings
         auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
+            // first half is X axis and second half is Y axis
+            // ref: https://github.com/huggingface/transformers/blob/40a493c7ed4f19f08eadb0639cf26d49bfa5e180/src/transformers/models/llama4/modeling_llama4.py#L1312
+            // ref: https://github.com/Blaizzy/mlx-vlm/blob/a57156aa87b33cca6e5ee6cfc14dd4ef8f611be6/mlx_vlm/models/llama4/vision.py#L441
             return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false);
         };
         ggml_tensor * cur = build_vit(
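The added comment describes what build_rope_2d does with the rotary channels: the first half of each head is rotated by the patch's X coordinate, the second half by its Y coordinate. Below is a minimal standalone C++ sketch of that split, assuming a toy pairwise rotation; the helper names (rope_half, rope_2d) and the exact frequency layout are illustrative assumptions, not the ggml implementation in clip.cpp.

// Standalone sketch of 2D RoPE: first half of the channels is rotated by the
// X position, second half by the Y position. Frequency layout is illustrative.
#include <cmath>
#include <cstdio>
#include <vector>

// Rotate consecutive channel pairs of v[offset .. offset+half_dim) by
// angle = pos * theta_base^(-i/half_dim), standard RoPE-style.
static void rope_half(std::vector<float> & v, size_t offset, size_t half_dim,
                      float pos, float theta_base) {
    for (size_t i = 0; i + 1 < half_dim; i += 2) {
        const float freq  = std::pow(theta_base, -(float) i / (float) half_dim);
        const float angle = pos * freq;
        const float c = std::cos(angle), s = std::sin(angle);
        const float a = v[offset + i];
        const float b = v[offset + i + 1];
        v[offset + i]     = a * c - b * s;
        v[offset + i + 1] = a * s + b * c;
    }
}

// Apply 2D RoPE to one head vector: first half gets the X position,
// second half gets the Y position of the patch.
static void rope_2d(std::vector<float> & v, int pos_x, int pos_y, float theta_base) {
    const size_t half = v.size() / 2;
    rope_half(v, 0,    half, (float) pos_x, theta_base);
    rope_half(v, half, half, (float) pos_y, theta_base);
}

int main() {
    std::vector<float> head(8, 1.0f);                          // toy head_dim = 8
    rope_2d(head, /*pos_x=*/3, /*pos_y=*/5, /*theta=*/10000.0f);
    for (float f : head) std::printf("%.4f ", f);
    std::printf("\n");
}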
@@ -990,7 +993,7 @@ struct clip_graph {
         // https://github.com/huggingface/transformers/blob/2932f318a20d9e54cc7aea052e040164d85de7d6/src/transformers/models/llama4/modeling_llama4.py#L1151
         {
             const int scale_factor = model.hparams.proj_scale_factor;
-            const int bsz = 1; // batch size, always 1 for now since we don't support batching
+            const int bsz = 1; // batch size, always 1 for now since we don't support batching
             GGML_ASSERT(scale_factor > 0);
             GGML_ASSERT(n_patches_x == n_patches_y); // llama4 only supports square images
             cur = ggml_reshape_4d(ctx0, cur,
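Per the linked transformers reference, this block folds scale_factor x scale_factor neighboring patches into the channel dimension (pixel shuffle / space-to-depth) before the projector. The standalone C++ sketch below shows that idea on a square n x n patch grid, assuming row-major indexing; the function name pixel_shuffle and the layout are illustrative and do not reproduce the exact ggml_reshape_4d/permute memory layout used in clip.cpp.

// Standalone sketch of pixel shuffle: an n x n grid of c-dim patch embeddings
// becomes an (n/s) x (n/s) grid of (c*s*s)-dim embeddings, reducing the token
// count by s*s while growing the embedding dimension.
#include <cassert>
#include <cstdio>
#include <vector>

std::vector<float> pixel_shuffle(const std::vector<float> & in, int n, int c, int s) {
    assert(n % s == 0);
    const int n_out = n / s;
    std::vector<float> out((size_t) n_out * n_out * c * s * s);
    for (int y = 0; y < n; ++y) {
        for (int x = 0; x < n; ++x) {
            for (int k = 0; k < c; ++k) {
                // source: patch (y, x), channel k
                const size_t src = ((size_t) y * n + x) * c + k;
                // destination: cell (y/s, x/s); the s*s sub-patches are
                // concatenated along the channel dimension
                const int    sub = (y % s) * s + (x % s);
                const size_t dst = (((size_t) (y / s) * n_out + (x / s)) * s * s + sub) * c + k;
                out[dst] = in[src];
            }
        }
    }
    return out;
}

int main() {
    // 4x4 grid of 2-dim embeddings, scale_factor 2 -> 2x2 grid of 8-dim embeddings
    std::vector<float> grid(4 * 4 * 2);
    for (size_t i = 0; i < grid.size(); ++i) grid[i] = (float) i;
    const std::vector<float> shuffled = pixel_shuffle(grid, /*n=*/4, /*c=*/2, /*s=*/2);
    std::printf("in: %zu values, out: %zu values\n", grid.size(), shuffled.size());
}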
