@@ -965,23 +965,30 @@ static ggml_cgraph * clip_image_build_graph_llama4(clip_ctx * ctx, const clip_im
965965 ggml_row_size (cur->type , hidden_size),
966966 ggml_row_size (cur->type , hidden_size * num_patches), 0 );
967967
968- cur = ggml_reshape_3d (ctx0, cur,
968+ cur = ggml_reshape_4d (ctx0, cur,
969969 hidden_size * scale_factor,
970- num_patches / scale_factor,
970+ px / scale_factor,
971+ py,
971972 batch_size);
972973 cur = ggml_permute (ctx0, cur, 0 , 2 , 1 , 3 );
973974
974975 cur = ggml_reshape_4d (ctx0, ggml_cont (ctx0, cur),
975976 hidden_size * scale_factor * scale_factor,
976- py / scale_factor,
977977 px / scale_factor,
978+ py / scale_factor,
978979 batch_size);
979980 cur = ggml_permute (ctx0, cur, 0 , 2 , 1 , 3 );
980981
982+ cur = ggml_reshape_3d (ctx0, ggml_cont (ctx0, cur),
983+ hidden_size * scale_factor * scale_factor,
984+ num_patches / scale_factor / scale_factor,
985+ batch_size);
986+
981987 // based on Llama4VisionMLP2 (always uses GELU activation, no bias)
982988 cur = ggml_mul_mat (ctx0, model.mm_model_mlp_1_w , cur);
983989 cur = ggml_gelu (ctx0, cur);
984990 cur = ggml_mul_mat (ctx0, model.mm_model_mlp_2_w , cur);
991+ cur = ggml_gelu (ctx0, cur);
985992 embeddings = cur;
986993 }
987994
0 commit comments