@@ -770,8 +770,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
770770
771771 // If this is an embedding feature layer, save the output.
772772 // NOTE: 0 index here refers to the input to the encoder.
773- for (int vf_layer_idx = 0 ; vf_layer_idx < MAX_IMAGE_FEATURE_LAYERS; vf_layer_idx ++) {
774- if (il == ctx->vision_model .hparams .vision_feature_layer [vf_layer_idx ]) {
773+ for (int vl_idx = 0 ; vl_idx < MAX_IMAGE_FEATURE_LAYERS && (hparams. vision_feature_layer [vl_idx] > 0 ); vl_idx ++) {
774+ if (il == ctx->vision_model .hparams .vision_feature_layer [vl_idx ]) {
775775 embedding_stack.push_back (embeddings);
776776 break ;
777777 }
@@ -875,8 +875,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
875875 }
876876
877877 // final layer is a vision feature layer
878- for (int vf_layer_idx = 0 ; vf_layer_idx < MAX_IMAGE_FEATURE_LAYERS; vf_layer_idx ++) {
879- if (n_layer == ctx->vision_model .hparams .vision_feature_layer [vf_layer_idx ]) {
878+ for (int vl_idx = 0 ; vl_idx < MAX_IMAGE_FEATURE_LAYERS && (hparams. vision_feature_layer [vl_idx] > 0 ); vl_idx ++) {
879+ if (n_layer == ctx->vision_model .hparams .vision_feature_layer [vl_idx ]) {
880880 embedding_stack.push_back (embeddings);
881881 break ;
882882 }
@@ -2991,7 +2991,8 @@ size_t get_max_image_grid_pinpoints() {
29912991int get_deepest_feature_layer (const struct clip_ctx * ctx) {
29922992 // Get the index of the second to last layer; this is the
29932993 // default for models that have a llava projector
2994- int n_layer = ctx->vision_model .hparams .n_layer - 1 ;
2994+ const auto & hparams = ctx->vision_model .hparams ;
2995+ int n_layer = hparams.n_layer - 1 ;
29952996 int deepest_feature_layer = -1 ;
29962997
29972998 // Handle other projectors; incrementing here indicates that we
@@ -3001,9 +3002,9 @@ int get_deepest_feature_layer(const struct clip_ctx * ctx) {
30013002 }
30023003
30033004 // If we set explicit vision feature layers, only go up to the deepest one
3004- for (int i = 0 ; i < MAX_IMAGE_FEATURE_LAYERS; i++) {
3005- if (ctx-> vision_model . hparams .vision_feature_layer [i] > deepest_feature_layer) {
3006- deepest_feature_layer = ctx-> vision_model . hparams .vision_feature_layer [i];
3005+ for (int i = 0 ; i < MAX_IMAGE_FEATURE_LAYERS && (hparams. vision_feature_layer [i] > 0 ) ; i++) {
3006+ if (hparams.vision_feature_layer [i] > deepest_feature_layer) {
3007+ deepest_feature_layer = hparams.vision_feature_layer [i];
30073008 }
30083009 }
30093010 return deepest_feature_layer < 0 ? n_layer: deepest_feature_layer;
0 commit comments