@@ -587,6 +587,7 @@ struct clip_ctx {
587587 struct clip_vision_model vision_model;
588588 projector_type proj_type = PROJECTOR_TYPE_MLP;
589589
590+ int32_t max_feature_layer;
590591 float image_mean[3 ];
591592 float image_std[3 ];
592593 bool use_gelu = false ;
@@ -755,12 +756,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
755756 }
756757
757758 std::vector<struct ggml_tensor *> embedding_stack;
758- // Check to see if we have 1+ set vision feature layers set; otherwise it's determined
759- // by the type of projector that this model has (usually last or second to last layer).
760- int max_feature_layer = get_deepest_feature_layer (ctx);
761759
762760 // loop over layers
763- for (int il = 0 ; il < max_feature_layer; il++) {
761+ for (int il = 0 ; il < ctx-> max_feature_layer ; il++) {
764762 struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states
765763
766764 // If this is an embedding feature layer, save the output.
@@ -862,7 +860,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
862860 }
863861
864862 // post-layernorm
865- if (ctx->has_post_norm && max_feature_layer == n_layer) {
863+ if (ctx->has_post_norm && ctx-> max_feature_layer == n_layer) {
866864 embeddings = ggml_norm (ctx0, embeddings, eps);
867865 ggml_set_name (embeddings, " post_ln" );
868866
@@ -1516,6 +1514,9 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
15161514 new_clip->image_std [i] = std_data[i];
15171515 }
15181516
1517+ // Calculate the deepest feature layer based on hparams and projector type
1518+ new_clip->max_feature_layer = get_deepest_feature_layer (new_clip);
1519+
15191520 if (verbosity >= 2 ) {
15201521 LOG_INF (" \n %s: vision model hparams\n " , __func__);
15211522 LOG_INF (" image_size %d\n " , hparams.image_size );
0 commit comments