@@ -770,8 +770,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
770770
771771        //  If this is an embedding feature layer, save the output.
772772        //  NOTE: 0 index here refers to the input to the encoder.
773-         for  (int  vf_layer_idx  = 0 ; vf_layer_idx  < MAX_IMAGE_FEATURE_LAYERS; vf_layer_idx ++) {
774-             if  (il == ctx->vision_model .hparams .vision_feature_layer [vf_layer_idx ]) {
773+         for  (int  vl_idx  = 0 ; vl_idx  < MAX_IMAGE_FEATURE_LAYERS && (hparams. vision_feature_layer [vl_idx] >  0 ); vl_idx ++) {
774+             if  (il == ctx->vision_model .hparams .vision_feature_layer [vl_idx ]) {
775775                embedding_stack.push_back (embeddings);
776776                break ;
777777            }
@@ -875,8 +875,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
875875    }
876876
877877    //  final layer is a vision feature layer
878-     for  (int  vf_layer_idx  = 0 ; vf_layer_idx  < MAX_IMAGE_FEATURE_LAYERS; vf_layer_idx ++) {
879-         if  (n_layer == ctx->vision_model .hparams .vision_feature_layer [vf_layer_idx ]) {
878+     for  (int  vl_idx  = 0 ; vl_idx  < MAX_IMAGE_FEATURE_LAYERS && (hparams. vision_feature_layer [vl_idx] >  0 ); vl_idx ++) {
879+         if  (n_layer == ctx->vision_model .hparams .vision_feature_layer [vl_idx ]) {
880880            embedding_stack.push_back (embeddings);
881881            break ;
882882        }
@@ -2991,7 +2991,8 @@ size_t get_max_image_grid_pinpoints() {
29912991int  get_deepest_feature_layer (const  struct  clip_ctx  * ctx) {
29922992    //  Get the index of the second to last layer; this is the
29932993    //  default for models that have a llava projector
2994-     int  n_layer = ctx->vision_model .hparams .n_layer  - 1 ;
2994+     const  auto  & hparams = ctx->vision_model .hparams ;
2995+     int  n_layer = hparams.n_layer  - 1 ;
29952996    int  deepest_feature_layer = -1 ;
29962997
29972998    //  Handle other projectors; incrementing here indicates that we
@@ -3001,9 +3002,9 @@ int get_deepest_feature_layer(const struct clip_ctx * ctx) {
30013002    }
30023003
30033004    //  If we set explicit vision feature layers, only go up to the deepest one
3004-     for  (int  i = 0 ; i < MAX_IMAGE_FEATURE_LAYERS; i++) {
3005-         if  (ctx-> vision_model . hparams .vision_feature_layer [i] > deepest_feature_layer) {
3006-             deepest_feature_layer = ctx-> vision_model . hparams .vision_feature_layer [i];
3005+     for  (int  i = 0 ; i < MAX_IMAGE_FEATURE_LAYERS && (hparams. vision_feature_layer [i] >  0 ) ; i++) {
3006+         if  (hparams.vision_feature_layer [i] > deepest_feature_layer) {
3007+             deepest_feature_layer = hparams.vision_feature_layer [i];
30073008        }
30083009    }
30093010    return  deepest_feature_layer < 0  ? n_layer: deepest_feature_layer;
0 commit comments