Skip to content

Commit b6e29c6

Browse files
authored
Merge pull request #31 from JJJYmmm/add_qwen3vl
Revert "optimize deepstack feature saving" as a temporary fix
2 parents 85fd83a + 794481e commit b6e29c6

File tree

1 file changed

+11
-22
lines changed

1 file changed

+11
-22
lines changed

tools/mtmd/clip.cpp

Lines changed: 11 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -846,12 +846,9 @@ struct clip_graph {
846846
GGML_ASSERT(model.patch_bias != nullptr);
847847
GGML_ASSERT(model.position_embeddings != nullptr);
848848
GGML_ASSERT(model.class_embedding == nullptr);
849-
GGML_ASSERT(hparams.spatial_merge_size == 2);
850849

851850
const int batch_size = 1;
852-
const int merge_factor = 4;
853851
const int n_pos = n_patches;
854-
const int n_pos_merged = n_pos / merge_factor;
855852
const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position
856853

857854
norm_type norm_t = NORM_TYPE_NORMAL;
@@ -914,23 +911,9 @@ struct clip_graph {
914911
inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
915912
}
916913

917-
int deepstack_layer_idx = 1; // begin with 1 to jump main feature
918-
const int llm_n_embd = model.mm_1_w->ne[1]; // llm token dim
919-
const int n_deepstack_layers = std::count(hparams.is_deepstack_layers.begin(), hparams.is_deepstack_layers.end(), true);
920-
921-
const size_t element_size = ggml_type_size(inpL->type);
922-
const size_t slice_offsets = llm_n_embd * n_pos_merged * batch_size * element_size;
923-
924-
ggml_tensor * final_embedding = ggml_new_tensor_3d(ctx0, inpL->type,
925-
llm_n_embd * (n_deepstack_layers + 1), n_pos_merged, batch_size);
926-
927-
auto make_deepstack_slice = [&](int idx) {
928-
return ggml_view_3d(ctx0, final_embedding,
929-
llm_n_embd, n_pos_merged, batch_size,
930-
llm_n_embd * element_size,
931-
slice_offsets,
932-
idx * slice_offsets);
933-
};
914+
// deepstack features (stack along the feature dimension), [n_embd * len(deepstack_layers), n_patches_x * n_patches_y, batch_size]
915+
ggml_tensor * deepstack_features = nullptr;
916+
const int merge_factor = hparams.spatial_merge_size > 0 ? hparams.spatial_merge_size * hparams.spatial_merge_size : 4; // default 2x2=4 for qwen3vl
934917

935918
// loop over layers
936919
for (int il = 0; il < n_layer; il++) {
@@ -1007,7 +990,13 @@ struct clip_graph {
1007990
nullptr, nullptr,
1008991
layer.deepstack_fc2_w, layer.deepstack_fc2_b,
1009992
ffn_op_type::FFN_GELU, il);
1010-
ggml_cpy(ctx0, feat, make_deepstack_slice(deepstack_layer_idx++));
993+
994+
if(!deepstack_features) {
995+
deepstack_features = feat;
996+
} else {
997+
// concat along the feature dimension
998+
deepstack_features = ggml_concat(ctx0, deepstack_features, feat, 0);
999+
}
10111000
}
10121001

10131002
inpL = cur;
@@ -1028,7 +1017,7 @@ struct clip_graph {
10281017
model.mm_1_w, model.mm_1_b,
10291018
ffn_op_type::FFN_GELU, -1);
10301019

1031-
ggml_cpy(ctx0, embeddings, make_deepstack_slice(0));
1020+
embeddings = ggml_concat(ctx0, embeddings, deepstack_features, 0); // concat along the feature dimension
10321021

10331022
// build the graph
10341023
ggml_build_forward_expand(gf, embeddings);

0 commit comments

Comments
 (0)