@@ -754,22 +754,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b);
     }
 
-    // Check to see we have 1+ set vision feature layers set; otherwise it's the last layer
     std::vector<struct ggml_tensor *> embedding_stack;
-    bool has_feature_layers = ctx->vision_model.hparams.vision_feature_layer[0] > 0;
-    // Determine how many encoder layers we need to process; if we have explicit vision feature
-    // layers, only process what we need, otherwise process all of the visual encoder layers.
-    int max_feature_layer = -1;
-    if (has_feature_layers) {
-        for (int vf_layer_idx = 0; vf_layer_idx < 4; vf_layer_idx++) {
-            if (ctx->vision_model.hparams.vision_feature_layer[vf_layer_idx] > max_feature_layer) {
-                max_feature_layer = ctx->vision_model.hparams.vision_feature_layer[vf_layer_idx];
-            }
-        }
-    }
-    if (max_feature_layer < 0) {
-        max_feature_layer = n_layer;
-    }
+    // Check to see if we have 1+ explicitly set vision feature layers; otherwise it's
+    // determined by the type of projector the model has (usually the last or second to last layer).
+    int max_feature_layer = get_deepest_feature_layer(ctx);
 
     // loop over layers
     for (int il = 0; il < max_feature_layer; il++) {
@@ -890,7 +878,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     }
 
     // If feature layers are explicitly set, stack them (if we have multiple)
-    if (has_feature_layers && embedding_stack.size() > 0) {
+    if (embedding_stack.size() > 0) {
         embeddings = embedding_stack.at(0);
         for (unsigned long i=1; i < embedding_stack.size(); i++) {
             embeddings = ggml_concat(ctx0, embeddings, embedding_stack.at(i), 0);
@@ -2990,6 +2978,27 @@ bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
     return ctx->has_qwen2vl_merger;
 }
 
+// Determine the number of encoder layers to iterate over
+CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx) {
+    // Get the index of the second to last layer; this is the
+    // default for models that have a llava projector
+    int n_layer = ctx->vision_model.hparams.n_layer - 1;
+    int deepest_feature_layer = -1;
+
+    // Handle other projectors; incrementing here indicates that we
+    // should use the last encoder layer for the vision features.
+    if (ctx->has_minicpmv_projector || ctx->has_glm_projector || ctx->has_qwen2vl_merger) {
+        n_layer += 1;
+    }
+
+    // If we set explicit vision feature layers, only go up to the deepest one
+    for (int i = 0; i < 4; i++) {
+        if (ctx->vision_model.hparams.vision_feature_layer[i] > deepest_feature_layer) {
+            deepest_feature_layer = ctx->vision_model.hparams.vision_feature_layer[i];
+        }
+    }
+    return deepest_feature_layer < 0 ? n_layer : deepest_feature_layer;
+}
 
 bool clip_encode_float_image(struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
     clip_image_f32 clip_img;
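
To see the control flow at a glance: the hunks above replace the inline max-scan in `clip_image_build_graph` with the new `get_deepest_feature_layer` helper, whose result both bounds the encoder loop and falls back to a projector-dependent default when no feature layers are configured. Below is a minimal standalone sketch of that selection rule; `fake_ctx` and its fields are hypothetical stand-ins for the real `clip_ctx`, and it assumes unset `vision_feature_layer` entries are negative.

```cpp
#include <cstdio>

// Hypothetical stand-in for clip_ctx / its hparams (not the real structs).
struct fake_ctx {
    int  n_layer;                  // total visual encoder layers
    int  vision_feature_layer[4];  // assumption: negative means "unset"
    bool use_last_layer;           // stands in for the minicpmv/glm/qwen2vl projector checks
};

// Mirrors the rule in get_deepest_feature_layer(): max over explicit feature
// layers, else a projector-dependent default (last or second-to-last layer).
static int deepest_feature_layer(const fake_ctx & ctx) {
    int fallback = (ctx.n_layer - 1) + (ctx.use_last_layer ? 1 : 0);
    int deepest  = -1;
    for (int i = 0; i < 4; i++) {
        if (ctx.vision_feature_layer[i] > deepest) {
            deepest = ctx.vision_feature_layer[i];
        }
    }
    return deepest < 0 ? fallback : deepest;
}

int main() {
    fake_ctx llava_unset = { 24, { -1, -1, -1, -1 }, false };
    fake_ctx qwen_unset  = { 24, { -1, -1, -1, -1 }, true  };
    fake_ctx explicit_4  = { 24, {  3,  7, 15, 22 }, false };
    printf("%d\n", deepest_feature_layer(llava_unset)); // 23 (second-to-last default)
    printf("%d\n", deepest_feature_layer(qwen_unset));  // 24 (last-layer default)
    printf("%d\n", deepest_feature_layer(explicit_4));  // 22 (deepest explicit layer)
    return 0;
}
```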