@@ -329,7 +329,6 @@ struct clip_ctx {
329329 struct clip_vision_model vision_model;
330330 projector_type proj_type = PROJECTOR_TYPE_MLP;
331331
332- int32_t max_feature_layer; // unused in newer models like gemma3
333332 float image_mean[3 ];
334333 float image_std[3 ];
335334
@@ -867,6 +866,27 @@ struct clip_graph {
867866
868867 GGML_ASSERT (n_patches_x == n_patches_y && " only square images supported" );
869868
869+ // Calculate the deepest feature layer based on hparams and projector type
870+ int max_feature_layer = n_layer;
871+ {
872+ // Exclusive loop bound (n_layer - 1) so the loop stops after the second to last layer; this is the default for models that have a llava projector
873+ int il_last = hparams.n_layer - 1 ;
874+ int deepest_feature_layer = -1 ;
875+
876+ if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
877+ il_last += 1 ;
878+ }
879+
880+ // If we set explicit vision feature layers, only go up to the deepest one
881+ // NOTE: only used by granite-vision models for now
882+ for (const auto & feature_layer : hparams.vision_feature_layer ) {
883+ if (feature_layer > deepest_feature_layer) {
884+ deepest_feature_layer = feature_layer;
885+ }
886+ }
887+ max_feature_layer = deepest_feature_layer < 0 ? il_last : deepest_feature_layer;
888+ }
889+
870890 ggml_tensor * inp = build_inp ();
871891
872892 if (model.patch_bias ) {
@@ -896,7 +916,7 @@ struct clip_graph {
896916 const auto & vision_feature_layer = hparams.vision_feature_layer ;
897917
898918 // loop over layers
899- for (int il = 0 ; il < ctx-> max_feature_layer ; il++) {
919+ for (int il = 0 ; il < max_feature_layer; il++) {
900920 auto & layer = model.layers [il];
901921 ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
902922
@@ -977,7 +997,7 @@ struct clip_graph {
977997 // process vision feature layers (used by granite)
978998 {
979999 // final layer is a vision feature layer
980- if (vision_feature_layer.find (ctx-> max_feature_layer ) != vision_feature_layer.end ()) {
1000+ if (vision_feature_layer.find (max_feature_layer) != vision_feature_layer.end ()) {
9811001 embedding_stack.push_back (inpL);
9821002 }
9831003
@@ -1760,30 +1780,6 @@ struct clip_model_loader {
17601780 hparams.vision_feature_layer .insert (layer);
17611781 }
17621782
1763- // Calculate the deepest feature layer based on hparams and projector type
1764- // NOTE: This is only used by build_graph_legacy()
1765- {
1766- // Get the index of the second to last layer; this is the default for models that have a llava projector
1767- int n_layer = hparams.n_layer - 1 ;
1768- int deepest_feature_layer = -1 ;
1769-
1770- if (ctx_clip.proj_type == PROJECTOR_TYPE_MINICPMV
1771- || ctx_clip.proj_type == PROJECTOR_TYPE_GLM_EDGE
1772- || ctx_clip.proj_type == PROJECTOR_TYPE_QWEN2VL
1773- || ctx_clip.proj_type == PROJECTOR_TYPE_QWEN25VL) {
1774- n_layer += 1 ;
1775- }
1776-
1777- // If we set explicit vision feature layers, only go up to the deepest one
1778- // NOTE: only used by granite-vision models for now
1779- for (const auto & feature_layer : hparams.vision_feature_layer ) {
1780- if (feature_layer > deepest_feature_layer) {
1781- deepest_feature_layer = feature_layer;
1782- }
1783- }
1784- ctx_clip.max_feature_layer = deepest_feature_layer < 0 ? n_layer : deepest_feature_layer;
1785- }
1786-
17871783 // model-specific params
17881784 switch (ctx_clip.proj_type ) {
17891785 case PROJECTOR_TYPE_MINICPMV:
0 commit comments