 #include <map>
 #include <regex>
 #include <stdexcept>
+#include <unordered_set>
 #include <vector>
 #include <sstream>
 #include <cinttypes>
@@ -120,6 +121,7 @@ static std::string format(const char * fmt, ...) {
 #define KEY_IMAGE_MEAN "clip.vision.image_mean"
 #define KEY_IMAGE_STD "clip.vision.image_std"
 #define KEY_PROJ_TYPE "clip.projector_type"
+#define KEY_FEATURE_LAYER "clip.vision.feature_layer"
 
 #define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
 #define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
@@ -444,8 +446,9 @@ struct clip_hparams {
 
     char mm_patch_merge_type[32] = "flat"; // spatial_unpad or flat (default)
 
-    int32_t image_grid_pinpoints[32];
+    std::vector<int32_t> image_grid_pinpoints;
     int32_t image_crop_resolution;
+    std::unordered_set<int32_t> vision_feature_layer;
 };
 
 struct clip_layer {
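Note on the two new hparams fields: the old fixed `int32_t image_grid_pinpoints[32]` used a zero sentinel to mark the end of the list, which capped the pinpoint count and pushed sentinel bookkeeping onto every reader; a `std::vector` carries its own length. The `unordered_set` gives O(1) membership checks in the per-layer loop further down. A minimal standalone sketch of how the new fields are meant to be queried (the values are illustrative only, not from any real model):

```cpp
#include <cstdint>
#include <unordered_set>
#include <vector>

int main() {
    // Grid pinpoints are stored flat as (width, height) pairs.
    std::vector<int32_t> image_grid_pinpoints = {336, 672, 672, 336};

    // Encoder layer indices whose outputs should be used as vision features.
    std::unordered_set<int32_t> vision_feature_layer = {3, 7, 15, 26};

    // Emptiness replaces the old `pinpoints[0] != 0` sentinel test,
    // and set membership replaces fixed-size array bookkeeping.
    bool has_pinpoints    = !image_grid_pinpoints.empty();
    bool is_feature_layer = vision_feature_layer.count(7) > 0;
    return (has_pinpoints && is_feature_layer) ? 0 : 1;
}
```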
@@ -585,6 +588,7 @@ struct clip_ctx {
     struct clip_vision_model vision_model;
     projector_type proj_type = PROJECTOR_TYPE_MLP;
 
+    int32_t max_feature_layer;
     float image_mean[3];
     float image_std[3];
     bool use_gelu = false;
@@ -651,7 +655,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     const int hidden_size = hparams.hidden_size;
     const int n_head = hparams.n_head;
     const int d_head = hidden_size / n_head;
-    int n_layer = hparams.n_layer;
     const float eps = hparams.eps;
     int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
 
@@ -752,13 +755,19 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b);
     }
 
+    std::vector<struct ggml_tensor *> embedding_stack;
+    const auto & vision_feature_layer = hparams.vision_feature_layer;
+
     // loop over layers
-    if (ctx->has_minicpmv_projector || ctx->has_glm_projector || ctx->has_qwen2vl_merger) {
-        n_layer += 1;
-    }
-    for (int il = 0; il < n_layer - 1; il++) {
+    for (int il = 0; il < ctx->max_feature_layer; il++) {
         struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states
 
+        // If this is an embedding feature layer, save the output.
+        // NOTE: 0 index here refers to the input to the encoder.
+        if (vision_feature_layer.find(il) != vision_feature_layer.end()) {
+            embedding_stack.push_back(embeddings);
+        }
+
         //const size_t nb_q_w = model.layers[il].q_w->nb[0];
 
         // layernorm1
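To pin down the indexing convention used here: the membership check runs at the top of iteration `il`, before layer `il` executes, so index 0 captures the raw encoder input and index `k` captures the hidden states after the first `k` layers; the final index (`max_feature_layer` itself) is handled after the loop, past the optional post-layernorm. A standalone sketch of the capture schedule, with made-up layer indices:

```cpp
#include <cstdio>
#include <unordered_set>

int main() {
    const int max_feature_layer = 26; // stands in for ctx->max_feature_layer
    const std::unordered_set<int> vision_feature_layer = {3, 7, 15, 26};

    for (int il = 0; il < max_feature_layer; il++) {
        if (vision_feature_layer.count(il)) {
            // il == 0 would capture the encoder input; il == k captures
            // the output of layers 0..k-1.
            printf("capture hidden states entering layer %d\n", il);
        }
        // ... encoder layer il would run here ...
    }
    // Index 26 == max_feature_layer is picked up after the loop,
    // mirroring the post-layernorm capture added below.
    return 0;
}
```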
@@ -846,7 +855,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         cur = ggml_add(ctx0, embeddings, cur);
 
         embeddings = cur;
-
     }
 
     // post-layernorm
@@ -857,6 +865,19 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
     }
 
+    // final layer is a vision feature layer
+    if (vision_feature_layer.find(ctx->max_feature_layer) != vision_feature_layer.end()) {
+        embedding_stack.push_back(embeddings);
+    }
+
+    // If feature layers are explicitly set, stack them (if we have multiple)
+    if (!embedding_stack.empty()) {
+        embeddings = embedding_stack[0];
+        for (size_t i = 1; i < embedding_stack.size(); i++) {
+            embeddings = ggml_concat(ctx0, embeddings, embedding_stack[i], 0);
+        }
+    }
+
     // llava projector
     if (ctx->has_llava_projector) {
         embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
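Since `ggml_concat(..., 0)` joins tensors along the first (feature) dimension, selecting `k` feature layers widens every token embedding to `k * hidden_size`, and the multimodal projector consuming these features must have been trained for that width. A one-line worked example, assuming a hypothetical `hidden_size` of 1152:

```cpp
#include <cassert>

int main() {
    const int hidden_size = 1152; // illustrative only
    const int n_selected  = 4;    // e.g. vision_feature_layer = {3, 7, 15, 26}

    // Each ggml_concat(ctx0, a, b, 0) appends along the feature dimension,
    // so k selected layers yield tokens of width k * hidden_size.
    const int token_width = n_selected * hidden_size;
    assert(token_width == 4608); // the mm projector input must match this
    return 0;
}
```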
@@ -1443,14 +1464,26 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         int idx = get_key_idx(ctx, KEY_IMAGE_GRID_PINPOINTS);
         int n = gguf_get_arr_n(ctx, idx);
         const int32_t * pinpoints = (const int32_t *)gguf_get_arr_data(ctx, idx);
-        for (int i = 0; i < 32 && i < n && pinpoints[i] != 0; ++i) {
-            hparams.image_grid_pinpoints[i] = pinpoints[i];
+        for (int i = 0; i < n; ++i) {
+            hparams.image_grid_pinpoints.push_back(pinpoints[i]);
         }
-        if (n < 32)
-            hparams.image_grid_pinpoints[n] = 0;
-    } catch (std::runtime_error & /*e*/) {
-        hparams.image_grid_pinpoints[0]=0;
-    }
+    } catch (std::runtime_error & /*e*/) { }
+
+    // Load the vision feature layer indices if they are explicitly provided;
+    // if multiple vision feature layers are present, the values will be concatenated
+    // to form the final visual features.
+    // NOTE: gguf conversions should standardize the values of the vision feature layer to
+    // be non-negative, since we use -1 to mark values as unset here.
+    try {
+        int idx = get_key_idx(ctx, KEY_FEATURE_LAYER);
+        int n = gguf_get_arr_n(ctx, idx);
+
+        const int32_t * vision_feature_layer = (const int32_t *)gguf_get_arr_data(ctx, idx);
+
+        for (int i = 0; i < n; ++i) {
+            hparams.vision_feature_layer.insert(vision_feature_layer[i]);
+        }
+    } catch (std::runtime_error & /*e*/) { }
 
     try {
         int idx = get_key_idx(ctx, KEY_MM_PATCH_MERGE_TYPE);
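The loading idiom matches the pinpoints key above: `get_key_idx` throws `std::runtime_error` when a key is absent, so the empty catch simply leaves `vision_feature_layer` unset and the graph falls back to the default single-layer behavior. A self-contained sketch of that optional-key shape, with a `std::map` standing in for the real GGUF key/value store (this is not the gguf API):

```cpp
#include <cstdint>
#include <map>
#include <stdexcept>
#include <string>
#include <unordered_set>
#include <vector>

using kv_store = std::map<std::string, std::vector<int32_t>>;

// Throws like get_key_idx does when the key is missing.
static const std::vector<int32_t> & get_arr(const kv_store & kv, const std::string & key) {
    auto it = kv.find(key);
    if (it == kv.end()) {
        throw std::runtime_error("key not found: " + key);
    }
    return it->second;
}

int main() {
    kv_store metadata = {
        {"clip.vision.feature_layer", {3, 7, 15, 26}},
    };

    std::unordered_set<int32_t> vision_feature_layer;
    try {
        for (int32_t v : get_arr(metadata, "clip.vision.feature_layer")) {
            vision_feature_layer.insert(v);
        }
    } catch (std::runtime_error & /*e*/) {
        // absent key: leave the set empty; the graph uses the default final layer
    }
    return vision_feature_layer.size() == 4 ? 0 : 1;
}
```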
@@ -1476,6 +1509,9 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         new_clip->image_std[i] = std_data[i];
     }
 
+    // Calculate the deepest feature layer based on hparams and projector type
+    new_clip->max_feature_layer = get_deepest_feature_layer(new_clip);
+
     if (verbosity >= 2) {
         LOG_INF("\n%s: vision model hparams\n", __func__);
         LOG_INF("image_size %d\n", hparams.image_size);
@@ -1489,8 +1525,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         LOG_INF("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
         LOG_INF("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
         LOG_INF("v_image_grid_pinpoints: ");
-        for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) {
-            LOG_INF("%d ", hparams.image_grid_pinpoints[i]);
+        for (const auto & pp : hparams.image_grid_pinpoints) {
+            LOG_INF("%d ", pp);
+        }
+        LOG_INF("\n");
+        LOG_INF("v_vision_feature_layer: ");
+        for (const auto & feature_layer : hparams.vision_feature_layer) {
+            LOG_INF("%d ", feature_layer);
         }
         LOG_INF("\n");
         LOG_INF("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
@@ -2235,10 +2276,10 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
             }
         }
     } else {
-        if (params.image_grid_pinpoints[0] != 0) {
+        if (!params.image_grid_pinpoints.empty()) {
             // "spatial_unpad" with "anyres" processing for llava-1.6
             std::vector<std::pair<int, int>> possible_resolutions;
-            for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) {
+            for (size_t i = 0; i < params.image_grid_pinpoints.size(); i+=2) {
                 possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
             }
             std::pair<int, int> best_resolution = select_best_resolution({img->nx, img->ny}, possible_resolutions);
@@ -2404,7 +2445,14 @@ const char * clip_patch_merge_type(const struct clip_ctx * ctx) {
 }
 
 const int32_t * clip_image_grid(const struct clip_ctx * ctx) {
-    return ctx->vision_model.hparams.image_grid_pinpoints;
+    if (ctx->vision_model.hparams.image_grid_pinpoints.size()) {
+        return &ctx->vision_model.hparams.image_grid_pinpoints.front();
+    }
+    return nullptr;
+}
+
+size_t get_clip_image_grid_size(const struct clip_ctx * ctx) {
+    return ctx->vision_model.hparams.image_grid_pinpoints.size();
 }
 
 int clip_n_patches(const struct clip_ctx * ctx) {
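Since the raw pointer no longer ends in a zero sentinel, callers need the new `get_clip_image_grid_size` to know where the array stops; presumably they walk it two entries at a time as (width, height) pairs, as the preprocessing code above does. A hedged usage sketch with stand-in data in place of a real `clip_ctx`:

```cpp
#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

int main() {
    // Stand-ins for clip_image_grid(ctx) / get_clip_image_grid_size(ctx).
    std::vector<int32_t> pinpoints = {336, 672, 672, 336, 672, 672};
    const int32_t * grid = pinpoints.empty() ? nullptr : pinpoints.data();
    const size_t grid_size = pinpoints.size();

    // Consume the flat array as (width, height) resolution pairs.
    std::vector<std::pair<int, int>> resolutions;
    for (size_t i = 0; grid != nullptr && i + 1 < grid_size; i += 2) {
        resolutions.push_back({grid[i], grid[i + 1]});
    }
    for (const auto & r : resolutions) {
        printf("%d x %d\n", r.first, r.second);
    }
    return 0;
}
```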
@@ -2929,6 +2977,28 @@ bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
     return ctx->has_qwen2vl_merger;
 }
 
+// Determine the number of encoder layers to iterate over
+int get_deepest_feature_layer(const struct clip_ctx * ctx) {
+    // Get the index of the second to last layer; this is the
+    // default for models that have a llava projector
+    const auto & hparams = ctx->vision_model.hparams;
+    int n_layer = hparams.n_layer - 1;
+    int deepest_feature_layer = -1;
+
+    // Handle other projectors; incrementing here indicates that we
+    // should use the last encoder layer for the vision features.
+    if (ctx->has_minicpmv_projector || ctx->has_glm_projector || ctx->has_qwen2vl_merger) {
+        n_layer += 1;
+    }
+
+    // If we set explicit vision feature layers, only go up to the deepest one
+    for (const auto & feature_layer : hparams.vision_feature_layer) {
+        if (feature_layer > deepest_feature_layer) {
+            deepest_feature_layer = feature_layer;
+        }
+    }
+    return deepest_feature_layer < 0 ? n_layer : deepest_feature_layer;
+}
 
 bool clip_encode_float_image(struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
     clip_image_f32 clip_img;
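A worked check of the fallback behavior in `get_deepest_feature_layer`, re-implemented standalone with illustrative numbers: with `hparams.n_layer == 27`, a llava-style projector defaults to iterating 26 layers (the `n_layer - 1` convention skips the final encoder layer), the minicpmv/glm/qwen2vl projectors get all 27, and any explicit non-negative feature layers override both defaults:

```cpp
#include <cassert>
#include <unordered_set>

// Standalone re-implementation of the selection logic, for illustration only.
static int deepest(int n_layer_hparam, bool use_last_layer,
                   const std::unordered_set<int> & feature_layers) {
    int n_layer = n_layer_hparam - 1;  // llava default: skip the final layer
    if (use_last_layer) {
        n_layer += 1;                  // minicpmv / glm / qwen2vl merger
    }
    int deepest_feature_layer = -1;
    for (int fl : feature_layers) {
        if (fl > deepest_feature_layer) {
            deepest_feature_layer = fl;
        }
    }
    return deepest_feature_layer < 0 ? n_layer : deepest_feature_layer;
}

int main() {
    assert(deepest(27, false, {}) == 26);             // llava-style default
    assert(deepest(27, true,  {}) == 27);             // last-layer projectors
    assert(deepest(27, false, {3, 7, 15, 26}) == 26); // explicit layers win
    return 0;
}
```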