@@ -219,7 +219,7 @@ struct clip_layer {
     ggml_tensor * ff_down_w = nullptr;
     ggml_tensor * ff_down_b = nullptr;

-    // layernorm 2 (post-attn norm / pre-ffn norm)
+    // layernorm 2
     ggml_tensor * ln_2_w = nullptr;
     ggml_tensor * ln_2_b = nullptr;

@@ -971,6 +971,9 @@ struct clip_graph {

         // build ViT with 2D position embeddings
         auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
+            // first half is X axis and second half is Y axis
+            // ref: https://github.com/huggingface/transformers/blob/40a493c7ed4f19f08eadb0639cf26d49bfa5e180/src/transformers/models/llama4/modeling_llama4.py#L1312
+            // ref: https://github.com/Blaizzy/mlx-vlm/blob/a57156aa87b33cca6e5ee6cfc14dd4ef8f611be6/mlx_vlm/models/llama4/vision.py#L441
             return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false);
         };
         ggml_tensor * cur = build_vit(
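The comments added in this hunk describe the split-axis RoPE layout: within each head, the first half of the dimension pairs is rotated by the patch's X (column) position and the second half by its Y (row) position. The standalone sketch below illustrates that layout on a plain float array; it is not the ggml implementation, and the row-major `pos_x = p % n_w` / `pos_y = p / n_w` mapping, the per-axis frequency indexing, and the toy `d_head = 8` are assumptions made for illustration.

```cpp
// Minimal sketch (assumed semantics, not build_rope_2d itself): 2D RoPE
// where the first half of a head's dims encodes X and the second half Y.
#include <cmath>
#include <cstdio>
#include <vector>

// Rotate dimension pairs in [dim_begin, dim_end) by the given position,
// with a frequency ramp local to this half of the head.
static void rope_half(std::vector<float> & vec, int dim_begin, int dim_end,
                      int pos, float theta) {
    const int half = dim_end - dim_begin; // dims assigned to this axis
    for (int i = dim_begin; i < dim_end; i += 2) {
        const float freq  = std::pow(theta, -(float)(i - dim_begin) / half);
        const float angle = pos * freq;
        const float c = std::cos(angle), s = std::sin(angle);
        const float x0 = vec[i], x1 = vec[i + 1];
        vec[i]     = x0 * c - x1 * s;
        vec[i + 1] = x0 * s + x1 * c;
    }
}

int main() {
    const int n_w = 2, n_h = 2, d_head = 8; // toy sizes (assumption)
    const float theta = 10000.0f;
    for (int p = 0; p < n_w * n_h; p++) {
        const int pos_x = p % n_w; // column, assuming row-major patch order
        const int pos_y = p / n_w; // row
        std::vector<float> v(d_head, 1.0f);
        rope_half(v, 0,          d_head / 2, pos_x, theta); // first half: X axis
        rope_half(v, d_head / 2, d_head,     pos_y, theta); // second half: Y axis
        printf("patch %d (x=%d, y=%d): v[0]=%.3f v[%d]=%.3f\n",
               p, pos_x, pos_y, v[0], d_head / 2, v[d_head / 2]);
    }
    return 0;
}
```

Because each axis only sees half the dimensions, each half gets its own frequency ramp over `d_head / 2` dims, which is why the sketch computes `freq` relative to `dim_begin`.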
@@ -990,7 +993,7 @@ struct clip_graph {
         // https://github.com/huggingface/transformers/blob/2932f318a20d9e54cc7aea052e040164d85de7d6/src/transformers/models/llama4/modeling_llama4.py#L1151
         {
             const int scale_factor = model.hparams.proj_scale_factor;
-            const int bsz = 1; // batch size, always 1 for now since we don't support batching
+            const int bsz = 1; // batch size, always 1 for now since we don't support batching
             GGML_ASSERT(scale_factor > 0);
             GGML_ASSERT(n_patches_x == n_patches_y); // llama4 only supports square images
             cur = ggml_reshape_4d(ctx0, cur,
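For context on this hunk: the linked `Llama4VisionPixelShuffleMLP` reduces the number of patch tokens by `scale_factor^2` before the projector MLP by folding each `s x s` spatial block of patches into the channel dimension, which is what the `ggml_reshape_4d` sequence starting here sets up. The sketch below shows that token/channel trade on a plain array; the `pixel_shuffle` helper and its exact channel ordering are assumptions for illustration, not the reshape/permute order used by ggml or the HF reference.

```cpp
// Minimal sketch (assumed semantics): pixel shuffle turns an n x n patch
// grid with c channels into an (n/s) x (n/s) grid with c*s*s channels,
// so the projector sees s*s fewer tokens with the same total elements.
#include <cassert>
#include <cstdio>
#include <vector>

static std::vector<float> pixel_shuffle(const std::vector<float> & in,
                                        int n, int c, int s) {
    assert(n % s == 0); // mirrors the square-grid assumption above
    const int m = n / s;               // output grid side
    std::vector<float> out(in.size()); // element count is unchanged
    for (int y = 0; y < m; y++)
        for (int x = 0; x < m; x++)
            for (int dy = 0; dy < s; dy++)
                for (int dx = 0; dx < s; dx++)
                    for (int k = 0; k < c; k++) {
                        // gather the s x s block into one token's channels
                        const int src = ((y * s + dy) * n + (x * s + dx)) * c + k;
                        const int dst = (y * m + x) * (c * s * s)
                                      + ((dy * s + dx) * c + k); // ordering is an assumption
                        out[dst] = in[src];
                    }
    return out;
}

int main() {
    const int n = 4, c = 2, s = 2; // toy sizes (assumption)
    std::vector<float> in(n * n * c);
    for (size_t i = 0; i < in.size(); i++) in[i] = (float)i;
    std::vector<float> out = pixel_shuffle(in, n, c, s);
    printf("tokens: %d -> %d, channels: %d -> %d\n",
           n * n, (n / s) * (n / s), c, c * s * s);
    return 0;
}
```

With `n = 4, c = 2, s = 2`, the 16 input patches become 4 tokens of 8 channels each; the square grid this relies on is exactly what the `GGML_ASSERT(n_patches_x == n_patches_y)` above enforces.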