@@ -196,7 +196,7 @@ struct clip_hparams {
     int32_t n_wa_pattern = 0;
     int32_t spatial_merge_size = 0;

-    std::vector<int32_t> deepstack_layers; // qwen3vl deepstack layers
+    std::vector<bool> is_deepstack_layers; // qwen3vl: whether the layer is a deepstack layer

     // audio
     int32_t n_mel_bins = 0; // whisper preprocessor
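Note: the hparams change above swaps an explicit index list for a per-layer boolean mask, which turns the membership test in the graph builder (previously a std::find scan, see the graph hunk further down) into a constant-time subscript. A standalone sketch of the two lookups, with hypothetical layer counts and indices:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
        // old representation: explicit indices of the deepstack layers (hypothetical values)
        std::vector<int32_t> deepstack_layers = {7, 15, 23};
        // new representation: one flag per layer (n_layer == 24 assumed here)
        std::vector<bool> is_deepstack_layers(24, false);
        for (int32_t il : deepstack_layers) is_deepstack_layers[il] = true;

        const int32_t il = 15;
        // old lookup: O(#deepstack) scan on every layer
        bool hit_old = std::find(deepstack_layers.begin(), deepstack_layers.end(), il)
                    != deepstack_layers.end();
        // new lookup: O(1) subscript
        bool hit_new = is_deepstack_layers[il];
        printf("old=%d new=%d\n", (int) hit_old, (int) hit_new); // both print 1
    }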
@@ -241,6 +241,14 @@ struct clip_layer {
     // layer scale (no bias)
     ggml_tensor * ls_1_w = nullptr;
     ggml_tensor * ls_2_w = nullptr;
+
+    // qwen3vl deepstack merger
+    ggml_tensor * deepstack_norm_w = nullptr;
+    ggml_tensor * deepstack_norm_b = nullptr;
+    ggml_tensor * deepstack_fc1_w = nullptr;
+    ggml_tensor * deepstack_fc1_b = nullptr;
+    ggml_tensor * deepstack_fc2_w = nullptr;
+    ggml_tensor * deepstack_fc2_b = nullptr;
 };

 struct clip_model {
@@ -361,17 +369,6 @@ struct clip_model {
     ggml_tensor * mm_norm_pre_w = nullptr;
     ggml_tensor * mm_norm_mid_w = nullptr;

-    // qwen3vl deepstack
-    struct deepstack_merger {
-        ggml_tensor * norm_w = nullptr;
-        ggml_tensor * norm_b = nullptr;
-        ggml_tensor * fc1_w = nullptr;
-        ggml_tensor * fc1_b = nullptr;
-        ggml_tensor * fc2_w = nullptr;
-        ggml_tensor * fc2_b = nullptr;
-    };
-    std::vector<deepstack_merger> deepstack_mergers;
-
     bool audio_has_avgpool() const {
         return proj_type == PROJECTOR_TYPE_QWEN2A
             || proj_type == PROJECTOR_TYPE_VOXTRAL;
@@ -849,7 +846,6 @@ struct clip_graph {
         GGML_ASSERT(model.patch_bias != nullptr);
         GGML_ASSERT(model.position_embeddings != nullptr);
         GGML_ASSERT(model.class_embedding == nullptr);
-        GGML_ASSERT(!hparams.deepstack_layers.empty());

         const int batch_size = 1;
         const int n_pos = n_patches;
@@ -986,17 +982,13 @@ struct clip_graph {
             cur = ggml_add(ctx0, inpL, cur);
             cb(cur, "layer_out", il);

-            if (std::find(hparams.deepstack_layers.begin(), hparams.deepstack_layers.end(), il) != hparams.deepstack_layers.end()) {
-                const int deepstack_idx = std::find(hparams.deepstack_layers.begin(), hparams.deepstack_layers.end(), il) - hparams.deepstack_layers.begin();
-                auto & merger = model.deepstack_mergers[deepstack_idx];
-                ggml_tensor * feat = ggml_dup(ctx0, cur);
-                feat = ggml_reshape_3d(ctx0, feat, n_embd * merge_factor, n_pos / merge_factor, batch_size);
-
-                feat = build_norm(feat, merger.norm_w, merger.norm_b, norm_t, eps, il);
+            if (hparams.is_deepstack_layers[il]) {
+                ggml_tensor * feat = ggml_reshape_3d(ctx0, cur, n_embd * merge_factor, n_pos / merge_factor, batch_size);
+                feat = build_norm(feat, layer.deepstack_norm_w, layer.deepstack_norm_b, norm_t, eps, il);
                 feat = build_ffn(feat,
-                    merger.fc1_w, merger.fc1_b,
+                    layer.deepstack_fc1_w, layer.deepstack_fc1_b,
                     nullptr, nullptr,
-                    merger.fc2_w, merger.fc2_b,
+                    layer.deepstack_fc2_w, layer.deepstack_fc2_b,
                     ffn_op_type::FFN_GELU, il);

                 if (!deepstack_features) {
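Note: for intuition about the rewritten deepstack branch, the flagged layer's output is regrouped by the spatial merge factor, normalized, then run through a two-layer GELU MLP before being collected into deepstack_features. A minimal shape walk-through under assumed sizes (n_embd = 1152, n_pos = 1024, and merge_factor = 4, i.e. a spatial merge size of 2 squared; all numbers hypothetical):

    #include <cstdio>

    int main() {
        const int n_embd = 1152, merge_factor = 2 * 2, n_pos = 1024, batch_size = 1;
        // mirrors ggml_reshape_3d(ctx0, cur, n_embd * merge_factor, n_pos / merge_factor, batch_size)
        printf("feat shape: [%d, %d, %d]\n",
               n_embd * merge_factor, n_pos / merge_factor, batch_size);
        // -> feat shape: [4608, 256, 1]
        // build_norm then build_ffn (fc1 -> GELU -> fc2) map each of the 256 merged
        // tokens to the projector width before they join deepstack_features
    }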
@@ -2571,6 +2563,9 @@ struct clip_model_loader {
                 hparams.vision_feature_layer.insert(layer);
             }

+            // set default deepstack layers to false
+            hparams.is_deepstack_layers.resize(hparams.n_layer, false);
+
             // model-specific params
             switch (model.proj_type) {
                 case PROJECTOR_TYPE_MINICPMV:
@@ -2632,7 +2627,7 @@ struct clip_model_loader {
                     hparams.image_size = 1024; // still need this?
                     hparams.warmup_image_size = hparams.patch_size * 8;
                     get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.spatial_merge_size, false);
-                    get_arr_int(KEY_DEEPSTACK_LAYERS, hparams.deepstack_layers, false);
+                    get_arr_bool(KEY_IS_DEEPSTACK_LAYERS, hparams.is_deepstack_layers, false);
                 } break;
             case PROJECTOR_TYPE_LLAMA4:
                 {
@@ -2675,10 +2670,19 @@ struct clip_model_loader {
             if (hparams.spatial_merge_size > 0) {
                 LOG_INF("%s: spatial_merge_size: %d\n", __func__, hparams.spatial_merge_size);
             }
-            if (!hparams.deepstack_layers.empty()) {
-                LOG_INF("%s: deepstack_layers: ", __func__);
-                for (size_t i = 0; i < hparams.deepstack_layers.size(); i++) {
-                    LOG_CNT("%d%s", hparams.deepstack_layers[i], i < hparams.deepstack_layers.size() - 1 ? ", " : "\n");
+            if (!hparams.is_deepstack_layers.empty()) {
+                LOG_INF("%s: deepstack enabled layers: ", __func__);
+                bool first = true;
+                for (size_t i = 0; i < hparams.is_deepstack_layers.size(); ++i) {
+                    if (hparams.is_deepstack_layers[i]) {
+                        LOG_CNT("%s%zu", first ? "" : ", ", i);
+                        first = false;
+                    }
+                }
+                if (first) {
+                    LOG_CNT("none\n");
+                } else {
+                    LOG_CNT("\n");
                 }
             }
         } else if (is_audio) {
@@ -2778,6 +2782,17 @@ struct clip_model_loader {
             layer.ff_down_w = get_tensor(string_format(TN_FFN_DOWN, prefix, il, "weight"));
             layer.ff_down_b = get_tensor(string_format(TN_FFN_DOWN, prefix, il, "bias"), false);

+
+            // qwen3vl deepstack layer
+            if (hparams.is_deepstack_layers[il]) {
+                layer.deepstack_norm_w = get_tensor(string_format(TN_DEEPSTACK_NORM, il, "weight"), false);
+                layer.deepstack_norm_b = get_tensor(string_format(TN_DEEPSTACK_NORM, il, "bias"), false);
+                layer.deepstack_fc1_w = get_tensor(string_format(TN_DEEPSTACK_FC1, il, "weight"), false);
+                layer.deepstack_fc1_b = get_tensor(string_format(TN_DEEPSTACK_FC1, il, "bias"), false);
+                layer.deepstack_fc2_w = get_tensor(string_format(TN_DEEPSTACK_FC2, il, "weight"), false);
+                layer.deepstack_fc2_b = get_tensor(string_format(TN_DEEPSTACK_FC2, il, "bias"), false);
+            }
+
             // some models already exported with legacy (incorrect) naming which is quite messy, let's fix it here
             // note: Qwen model converted from the old surgery script has n_ff = 0, so we cannot use n_ff to check!
             bool is_ffn_swapped = (
@@ -2919,19 +2934,6 @@ struct clip_model_loader {
                     model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
                     model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
                     model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
-
-                    if (!hparams.deepstack_layers.empty()) {
-                        model.deepstack_mergers.resize(hparams.deepstack_layers.size());
-                        for (size_t i = 0; i < hparams.deepstack_layers.size(); i++) {
-                            auto & merger = model.deepstack_mergers[i];
-                            merger.norm_w = get_tensor(string_format("v.deepstack.%d.norm.weight", (int)i), false);
-                            merger.norm_b = get_tensor(string_format("v.deepstack.%d.norm.bias", (int)i), false);
-                            merger.fc1_w = get_tensor(string_format("v.deepstack.%d.fc1.weight", (int)i), false);
-                            merger.fc1_b = get_tensor(string_format("v.deepstack.%d.fc1.bias", (int)i), false);
-                            merger.fc2_w = get_tensor(string_format("v.deepstack.%d.fc2.weight", (int)i), false);
-                            merger.fc2_b = get_tensor(string_format("v.deepstack.%d.fc2.bias", (int)i), false);
-                        }
-                    }
                 } break;
             case PROJECTOR_TYPE_GEMMA3:
                 {
@@ -3139,6 +3141,21 @@ struct clip_model_loader {
         }
     }

+    void get_arr_bool(const std::string & key, std::vector<bool> & output, bool required = true) {
+        const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
+        if (i < 0) {
+            if (required) throw std::runtime_error("Key not found: " + key);
+            return;
+        }
+
+        const int n = gguf_get_arr_n(ctx_gguf.get(), i);
+        output.resize(n);
+        const bool * values = (const bool *)gguf_get_arr_data(ctx_gguf.get(), i);
+        for (int i = 0; i < n; ++i) {
+            output[i] = values[i];
+        }
+    }
+
     void set_llava_uhd_res_candidates(clip_model & model, const int max_patches_per_side) {
         auto & hparams = model.hparams;
         for (int x = 1; x <= max_patches_per_side; x++) {
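Note on the element-wise copy in get_arr_bool: std::vector<bool> is bit-packed and exposes no contiguous bool buffer, so the raw array returned by gguf_get_arr_data cannot be memcpy'd into it; copying one element at a time is the safe option. Standalone illustration (not from the patch):

    #include <vector>

    int main() {
        const bool raw[4] = {true, false, true, true}; // stands in for gguf_get_arr_data()
        std::vector<bool> flags;
        flags.resize(4);
        // std::vector<bool> has no data() member, so there is no memcpy target;
        // assign element by element instead
        for (int j = 0; j < 4; ++j) flags[j] = raw[j];
        return flags[0] ? 0 : 1;
    }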
@@ -4632,7 +4649,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
         case PROJECTOR_TYPE_QWEN25VL:
             return ctx->model.mm_1_b->ne[0];
         case PROJECTOR_TYPE_QWEN3VL:
-            return ctx->model.mm_1_b->ne[0] * ((int)ctx->model.hparams.deepstack_layers.size() + 1); // main path + deepstack paths
+            return ctx->model.mm_1_b->ne[0] * (1 + std::count(ctx->model.hparams.is_deepstack_layers.begin(), ctx->model.hparams.is_deepstack_layers.end(), true)); // main path + deepstack paths
         case PROJECTOR_TYPE_GEMMA3:
             return ctx->model.mm_input_proj_w->ne[0];
         case PROJECTOR_TYPE_IDEFICS3:
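Worked example for the QWEN3VL case above: the returned embedding width is the projector width times one main path plus one path per flagged layer. Assuming a projector width of 2048 and three deepstack layers (both numbers hypothetical), the call reports 2048 * (1 + 3) = 8192:

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    int main() {
        const int proj_width = 2048; // stands in for ctx->model.mm_1_b->ne[0]
        std::vector<bool> is_deepstack_layers = {false, true, false, true, false, true};
        const int n_deepstack = (int) std::count(is_deepstack_layers.begin(),
                                                 is_deepstack_layers.end(), true);
        printf("%d\n", proj_width * (1 + n_deepstack)); // prints 8192
    }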