@@ -196,8 +196,6 @@ struct clip_hparams {
     int32_t n_wa_pattern = 0;
     int32_t spatial_merge_size = 0;
 
-    std::vector<bool> is_deepstack_layers; // qwen3vl: whether the layer is a deepstack layer
-
     // audio
     int32_t n_mel_bins = 0;        // whisper preprocessor
     int32_t proj_stack_factor = 0; // ultravox
@@ -251,6 +249,10 @@ struct clip_layer {
     ggml_tensor * deepstack_fc1_b = nullptr;
     ggml_tensor * deepstack_fc2_w = nullptr;
     ggml_tensor * deepstack_fc2_b = nullptr;
+
+    bool has_deepstack() const {
+        return deepstack_fc1_w != nullptr;
+    }
 };
 
 struct clip_model {
@@ -270,6 +272,8 @@ struct clip_model {
 
     std::vector<clip_layer> layers;
 
+    int32_t n_deepstack_layers = 0; // used by Qwen3-VL, calculated from clip_layer
+
     ggml_tensor * post_ln_w;
     ggml_tensor * post_ln_b;
 
@@ -983,7 +987,7 @@ struct clip_graph {
             cur = ggml_add(ctx0, inpL, cur);
             cb(cur, "layer_out", il);
 
-            if (hparams.is_deepstack_layers[il]) {
+            if (layer.has_deepstack()) {
                 ggml_tensor * feat = ggml_reshape_3d(ctx0, cur, n_embd * merge_factor, n_pos / merge_factor, batch_size);
                 feat = build_norm(feat, layer.deepstack_norm_w, layer.deepstack_norm_b, norm_t, eps, il);
                 feat = build_ffn(feat,
@@ -2565,9 +2569,6 @@ struct clip_model_loader {
             hparams.vision_feature_layer.insert(layer);
         }
 
-        // set default deepstack layers to false
-        hparams.is_deepstack_layers.resize(hparams.n_layer, false);
-
         // model-specific params
         switch (model.proj_type) {
             case PROJECTOR_TYPE_MINICPMV:
@@ -2630,7 +2631,6 @@ struct clip_model_loader {
                     hparams.image_size = 1024; // still need this?
                     hparams.warmup_image_size = hparams.patch_size * 8;
                     get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.spatial_merge_size, false);
-                    get_arr_bool(KEY_IS_DEEPSTACK_LAYERS, hparams.is_deepstack_layers, false);
                 } break;
             case PROJECTOR_TYPE_LLAMA4:
                 {
@@ -2673,21 +2673,6 @@ struct clip_model_loader {
             if (hparams.spatial_merge_size > 0) {
                 LOG_INF("%s: spatial_merge_size: %d\n", __func__, hparams.spatial_merge_size);
             }
-            if (!hparams.is_deepstack_layers.empty()) {
-                LOG_INF("%s: deepstack enabled layers: ", __func__);
-                bool first = true;
-                for (size_t i = 0; i < hparams.is_deepstack_layers.size(); ++i) {
-                    if (hparams.is_deepstack_layers[i]) {
-                        LOG_CNT("%s%zu", first ? "" : ", ", i);
-                        first = false;
-                    }
-                }
-                if (first) {
-                    LOG_CNT("none\n");
-                } else {
-                    LOG_CNT("\n");
-                }
-            }
         } else if (is_audio) {
             LOG_INF("\n--- audio hparams ---\n");
             LOG_INF("%s: n_mel_bins: %d\n", __func__, hparams.n_mel_bins);
@@ -2789,13 +2774,14 @@ struct clip_model_loader {
 
 
             // qwen3vl deepstack layer
-            if (hparams.is_deepstack_layers[il]) {
-                layer.deepstack_norm_w = get_tensor(string_format(TN_DEEPSTACK_NORM, il, "weight"), false);
-                layer.deepstack_norm_b = get_tensor(string_format(TN_DEEPSTACK_NORM, il, "bias"), false);
-                layer.deepstack_fc1_w  = get_tensor(string_format(TN_DEEPSTACK_FC1, il, "weight"), false);
-                layer.deepstack_fc1_b  = get_tensor(string_format(TN_DEEPSTACK_FC1, il, "bias"), false);
-                layer.deepstack_fc2_w  = get_tensor(string_format(TN_DEEPSTACK_FC2, il, "weight"), false);
-                layer.deepstack_fc2_b  = get_tensor(string_format(TN_DEEPSTACK_FC2, il, "bias"), false);
+            layer.deepstack_norm_w = get_tensor(string_format(TN_DEEPSTACK_NORM, il, "weight"), false);
+            layer.deepstack_norm_b = get_tensor(string_format(TN_DEEPSTACK_NORM, il, "bias"), false);
+            layer.deepstack_fc1_w  = get_tensor(string_format(TN_DEEPSTACK_FC1, il, "weight"), false);
+            layer.deepstack_fc1_b  = get_tensor(string_format(TN_DEEPSTACK_FC1, il, "bias"), false);
+            layer.deepstack_fc2_w  = get_tensor(string_format(TN_DEEPSTACK_FC2, il, "weight"), false);
+            layer.deepstack_fc2_b  = get_tensor(string_format(TN_DEEPSTACK_FC2, il, "bias"), false);
+            if (layer.has_deepstack()) {
+                model.n_deepstack_layers++;
             }
 
             // some models already exported with legacy (incorrect) naming which is quite messy, let's fix it here
@@ -3155,21 +3141,6 @@ struct clip_model_loader {
         }
     }
 
-    void get_arr_bool(const std::string & key, std::vector<bool> & output, bool required = true) {
-        const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
-        if (i < 0) {
-            if (required) throw std::runtime_error("Key not found: " + key);
-            return;
-        }
-
-        const int n = gguf_get_arr_n(ctx_gguf.get(), i);
-        output.resize(n);
-        const bool * values = (const bool *)gguf_get_arr_data(ctx_gguf.get(), i);
-        for (int i = 0; i < n; ++i) {
-            output[i] = values[i];
-        }
-    }
-
     void set_llava_uhd_res_candidates(clip_model & model, const int max_patches_per_side) {
         auto & hparams = model.hparams;
         for (int x = 1; x <= max_patches_per_side; x++) {
@@ -4676,7 +4647,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
         case PROJECTOR_TYPE_QWEN25VL:
             return ctx->model.mm_1_b->ne[0];
         case PROJECTOR_TYPE_QWEN3VL:
-            return ctx->model.mm_1_b->ne[0] * (1 + std::count(ctx->model.hparams.is_deepstack_layers.begin(), ctx->model.hparams.is_deepstack_layers.end(), true)); // main path + deepstack paths
+            // main path + deepstack paths
+            return ctx->model.mm_1_b->ne[0] * (1 + ctx->model.n_deepstack_layers);
         case PROJECTOR_TYPE_GEMMA3:
             return ctx->model.mm_input_proj_w->ne[0];
         case PROJECTOR_TYPE_IDEFICS3:
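
For context, a minimal standalone sketch of the pattern this diff introduces, using stand-in types (fake_tensor, layer, model and the example numbers are illustrative, not the real ggml/clip structs or Qwen3-VL values): a layer counts as a deepstack layer iff its deepstack FC1 weight was actually loaded, the model-level count is derived from the layers rather than read from a GGUF bool array, and the projector embedding width is the per-path width times (1 + that count).

// Illustrative sketch only -- stand-in types, not the real ggml/clip structs.
#include <cstdint>
#include <cstdio>
#include <vector>

struct fake_tensor { int64_t ne0; }; // stand-in for ggml_tensor

struct layer {
    fake_tensor * deepstack_fc1_w = nullptr;
    // a layer is a deepstack layer iff its deepstack tensors were loaded
    bool has_deepstack() const { return deepstack_fc1_w != nullptr; }
};

struct model {
    std::vector<layer> layers;
    fake_tensor      * mm_1_b = nullptr;
    int32_t            n_deepstack_layers = 0; // derived from the layers, not from a GGUF array
};

// main path + one extra path per deepstack layer
static int64_t n_mmproj_embd(const model & m) {
    return m.mm_1_b->ne0 * (1 + m.n_deepstack_layers);
}

int main() {
    fake_tensor proj_bias{2048}; // made-up per-path width
    fake_tensor fc1{0};

    model m;
    m.mm_1_b = &proj_bias;
    m.layers.resize(24);
    // pretend deepstack tensors were found for 3 of the 24 layers
    m.layers[5].deepstack_fc1_w  = &fc1;
    m.layers[11].deepstack_fc1_w = &fc1;
    m.layers[17].deepstack_fc1_w = &fc1;

    for (const auto & l : m.layers) {
        if (l.has_deepstack()) {
            m.n_deepstack_layers++;
        }
    }

    // 2048 * (1 + 3) = 8192
    std::printf("n_deepstack_layers = %d, mmproj embd = %lld\n",
                m.n_deepstack_layers, (long long) n_mmproj_embd(m));
    return 0;
}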