@@ -846,9 +846,12 @@ struct clip_graph {
846846 GGML_ASSERT (model.patch_bias != nullptr );
847847 GGML_ASSERT (model.position_embeddings != nullptr );
848848 GGML_ASSERT (model.class_embedding == nullptr );
849+ GGML_ASSERT (hparams.spatial_merge_size == 2 );
849850
850851 const int batch_size = 1 ;
852+ const int merge_factor = 4 ;
851853 const int n_pos = n_patches;
854+ const int n_pos_merged = n_pos / merge_factor;
852855 const int num_position_ids = n_pos * 4 ; // m-rope requires 4 dim per position
853856
854857 norm_type norm_t = NORM_TYPE_NORMAL;
@@ -911,9 +914,23 @@ struct clip_graph {
911914 inpL = build_norm (inpL, model.pre_ln_w , model.pre_ln_b , norm_t , eps, -1 );
912915 }
913916
914- // deepstack features (stack along the feature dimension), [n_embd * len(deepstack_layers), n_patches_x * n_patches_y, batch_size]
915- ggml_tensor * deepstack_features = nullptr ;
916- const int merge_factor = hparams.spatial_merge_size > 0 ? hparams.spatial_merge_size * hparams.spatial_merge_size : 4 ; // default 2x2=4 for qwen3vl
917+ int deepstack_layer_idx = 1 ; // start at 1: slice 0 is reserved for the main feature
918+ const int llm_n_embd = model.mm_1_w ->ne [1 ]; // llm token dim
919+ const int n_deepstack_layers = std::count (hparams.is_deepstack_layers .begin (), hparams.is_deepstack_layers .end (), true );
920+
921+ const size_t element_size = ggml_type_size (inpL->type );
922+ const size_t slice_offsets = llm_n_embd * n_pos_merged * batch_size * element_size;
923+
924+ ggml_tensor * final_embedding = ggml_new_tensor_3d (ctx0, inpL->type ,
925+ llm_n_embd * (n_deepstack_layers + 1 ), n_pos_merged, batch_size);
926+
927+ auto make_deepstack_slice = [&](int idx) {
928+ return ggml_view_3d (ctx0, final_embedding,
929+ llm_n_embd, n_pos_merged, batch_size,
930+ llm_n_embd * element_size,
931+ slice_offsets,
932+ idx * slice_offsets);
933+ };
917934
918935 // loop over layers
919936 for (int il = 0 ; il < n_layer; il++) {
@@ -990,13 +1007,7 @@ struct clip_graph {
9901007 nullptr , nullptr ,
9911008 layer.deepstack_fc2_w , layer.deepstack_fc2_b ,
9921009 ffn_op_type::FFN_GELU, il);
993-
994- if (!deepstack_features) {
995- deepstack_features = feat;
996- } else {
997- // concat along the feature dimension
998- deepstack_features = ggml_concat (ctx0, deepstack_features, feat, 0 );
999- }
1010+ ggml_cpy (ctx0, feat, make_deepstack_slice (deepstack_layer_idx++));
10001011 }
10011012
10021013 inpL = cur;
@@ -1017,7 +1028,7 @@ struct clip_graph {
10171028 model.mm_1_w , model.mm_1_b ,
10181029 ffn_op_type::FFN_GELU, -1 );
10191030
1020- embeddings = ggml_concat (ctx0, embeddings, deepstack_features, 0 ); // concat along the feature dimension
1031+ ggml_cpy (ctx0, embeddings, make_deepstack_slice ( 0 ));
10211032
10221033 // build the graph
10231034 ggml_build_forward_expand (gf, embeddings);
0 commit comments