@@ -120,7 +120,7 @@ static std::string format(const char * fmt, ...) {
 #define KEY_IMAGE_MEAN "clip.vision.image_mean"
 #define KEY_IMAGE_STD "clip.vision.image_std"
 #define KEY_PROJ_TYPE "clip.projector_type"
-#define KEY_VISION_FEATURE_LAYER "clip.vision.feature_layer"
+#define KEY_FEATURE_LAYER "clip.vision.feature_layer"
 
 #define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
 #define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
@@ -171,6 +171,11 @@ static std::string format(const char * fmt, ...) {
 #define TN_GLM_BOI_W "adapter.boi"
 #define TN_GLM_EOI_W "adapter.eoi"
 
+// Maximum number of flattened image grid pinpoints (i.e., double
+// the max number of ordered pairs) to be used for anyres
+#define MAX_IMAGE_GRID_PINPOINTS 64
+// Maximum number of encoder layers to be concatenated to form the features
+#define MAX_IMAGE_FEATURE_LAYERS 4
 
 enum projector_type {
     PROJECTOR_TYPE_MLP,
@@ -445,9 +450,9 @@ struct clip_hparams {
 
     char mm_patch_merge_type[32] = "flat"; // spatial_unpad or flat (default)
 
-    int32_t image_grid_pinpoints[64];
+    int32_t image_grid_pinpoints[MAX_IMAGE_GRID_PINPOINTS];
     int32_t image_crop_resolution;
-    int32_t vision_feature_layer[4];
+    int32_t vision_feature_layer[MAX_IMAGE_FEATURE_LAYERS];
 };
 
 struct clip_layer {
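Both arrays above rely on in-band terminators rather than a stored count: image_grid_pinpoints holds flattened (width, height) pairs ended by a 0, while vision_feature_layer is ended by a -1, since 0 is a meaningful value there (the encoder input). A minimal sketch of walking the arrays under those conventions; the helper names are hypothetical and not part of this patch:

    #include <cstddef>
    #include <cstdint>

    // Mirror the constants introduced above.
    #define MAX_IMAGE_GRID_PINPOINTS 64
    #define MAX_IMAGE_FEATURE_LAYERS 4

    // Hypothetical helpers; clip.cpp itself loops inline at each use site.
    static size_t count_grid_pinpoints(const int32_t (&pp)[MAX_IMAGE_GRID_PINPOINTS]) {
        size_t n = 0;
        while (n < MAX_IMAGE_GRID_PINPOINTS && pp[n] != 0) {
            n++; // flattened (width, height) values; 0 terminates the list
        }
        return n; // number of valid int32 values, i.e. 2x the number of pairs
    }

    static size_t count_feature_layers(const int32_t (&fl)[MAX_IMAGE_FEATURE_LAYERS]) {
        size_t n = 0;
        while (n < MAX_IMAGE_FEATURE_LAYERS && fl[n] != -1) {
            n++; // -1 marks unset entries; layer index 0 (encoder input) is valid
        }
        return n;
    }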
@@ -755,7 +760,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     }
 
     std::vector<struct ggml_tensor *> embedding_stack;
-    // Check to see we have 1+ set vision feature layers set; otherwise it's determined
+    // Check to see if we have 1+ set vision feature layers set; otherwise it's determined
     // by the type of projector that this model has (usually last or second to last layer).
     int max_feature_layer = get_deepest_feature_layer(ctx);
 
@@ -765,7 +770,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
 
         // If this is an embedding feature layer, save the output.
         // NOTE: 0 index here refers to the input to the encoder.
-        for (int vf_layer_idx = 0; vf_layer_idx < 4; vf_layer_idx++) {
+        for (int vf_layer_idx = 0; vf_layer_idx < MAX_IMAGE_FEATURE_LAYERS; vf_layer_idx++) {
            if (il == ctx->vision_model.hparams.vision_feature_layer[vf_layer_idx]) {
                embedding_stack.push_back(embeddings);
                break;
@@ -870,17 +875,17 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     }
 
     // final layer is a vision feature layer
-    for (int vf_layer_idx = 0; vf_layer_idx < 4; vf_layer_idx++) {
+    for (int vf_layer_idx = 0; vf_layer_idx < MAX_IMAGE_FEATURE_LAYERS; vf_layer_idx++) {
         if (n_layer == ctx->vision_model.hparams.vision_feature_layer[vf_layer_idx]) {
             embedding_stack.push_back(embeddings);
             break;
         }
     }
 
     // If feature layers are explicitly set, stack them (if we have multiple)
-    if (embedding_stack.size() > 0) {
+    if (embedding_stack.size() > 0) {
         embeddings = embedding_stack.at(0);
-        for (unsigned long i = 1; i < embedding_stack.size(); i++) {
+        for (unsigned long i = 1; i < embedding_stack.size(); i++) {
             embeddings = ggml_concat(ctx0, embeddings, embedding_stack.at(i), 0);
         }
     }
@@ -1471,10 +1476,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            int idx = get_key_idx(ctx, KEY_IMAGE_GRID_PINPOINTS);
            int n = gguf_get_arr_n(ctx, idx);
            const int32_t * pinpoints = (const int32_t *)gguf_get_arr_data(ctx, idx);
-            for (int i = 0; i < 64 && i < n && pinpoints[i] != 0; ++i) {
+            for (int i = 0; i < MAX_IMAGE_GRID_PINPOINTS && i < n && pinpoints[i] != 0; ++i) {
                hparams.image_grid_pinpoints[i] = pinpoints[i];
            }
-            if (n < 64)
+            if (n < MAX_IMAGE_GRID_PINPOINTS)
                hparams.image_grid_pinpoints[n] = 0;
        } catch (std::runtime_error & /*e*/) {
            hparams.image_grid_pinpoints[0]=0;
@@ -1486,15 +1491,15 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        // NOTE: gguf conversions should standardize the values of the vision feature layer to uints,
        // since we use -1 as an unset value here.
        try {
-            int idx = get_key_idx(ctx, KEY_VISION_FEATURE_LAYER);
+            int idx = get_key_idx(ctx, KEY_FEATURE_LAYER);
            int n = gguf_get_arr_n(ctx, idx);
 
            const int32_t * vision_feature_layer = (const int32_t *)gguf_get_arr_data(ctx, idx);
 
-            for (int i = 0; i < 4 && i < n && vision_feature_layer[i] != 0; ++i) {
+            for (int i = 0; i < MAX_IMAGE_FEATURE_LAYERS && i < n && vision_feature_layer[i] != 0; ++i) {
                hparams.vision_feature_layer[i] = vision_feature_layer[i];
            }
-            if (n < 4)
+            if (n < MAX_IMAGE_FEATURE_LAYERS)
                hparams.vision_feature_layer[n] = -1;
        } catch (std::runtime_error & /*e*/) {
            hparams.vision_feature_layer[0] = -1;
@@ -1537,12 +1542,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        LOG_INF("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
        LOG_INF("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
        LOG_INF("v_image_grid_pinpoints: ");
-        for (int i = 0; i < 64 && (hparams.image_grid_pinpoints[i] != 0); ++i) {
+        for (int i = 0; i < MAX_IMAGE_GRID_PINPOINTS && (hparams.image_grid_pinpoints[i] != 0); ++i) {
            LOG_INF("%d ", hparams.image_grid_pinpoints[i]);
        }
        LOG_INF("\n");
        LOG_INF("v_vision_feature_layer: ");
-        for (int i = 0; i < 4 && (hparams.vision_feature_layer[i] > 0); i++) {
+        for (int i = 0; i < MAX_IMAGE_FEATURE_LAYERS && (hparams.vision_feature_layer[i] > 0); i++) {
            LOG_INF("%d ", hparams.vision_feature_layer[i]);
        }
        LOG_INF("\n");
@@ -2291,7 +2296,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
    if (params.image_grid_pinpoints[0] != 0) {
        // "spatial_unpad" with "anyres" processing for llava-1.6
        std::vector<std::pair<int, int>> possible_resolutions;
-        for (int i = 0; i < 64 && params.image_grid_pinpoints[i] != 0; i+=2) {
+        for (int i = 0; i < MAX_IMAGE_GRID_PINPOINTS && params.image_grid_pinpoints[i] != 0; i+=2) {
            possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
        }
        std::pair<int, int> best_resolution = select_best_resolution({img->nx, img->ny}, possible_resolutions);
@@ -2978,6 +2983,10 @@ bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
     return ctx->has_qwen2vl_merger;
 }
 
+size_t get_max_image_grid_pinpoints() {
+    return MAX_IMAGE_GRID_PINPOINTS;
+}
+
 // Determine the number of encoder layers to iterate over
 CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx) {
     // Get the index of the second to last layer; this is the
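Because MAX_IMAGE_GRID_PINPOINTS stays private to clip.cpp, the new accessor gives callers a loop bound for the flattened pinpoint list. A short sketch of a hypothetical caller, assuming the accessor is also declared in clip.h (not shown in this diff):

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    size_t get_max_image_grid_pinpoints(); // added above; assumed exported via clip.h

    // Hypothetical: print the candidate resolutions encoded in a
    // 0-terminated, flattened (width, height) pinpoint array.
    static void print_grid_pinpoints(const int32_t * pinpoints) {
        const size_t max_pp = get_max_image_grid_pinpoints();
        for (size_t i = 0; i + 1 < max_pp && pinpoints[i] != 0; i += 2) {
            printf("%d x %d\n", (int) pinpoints[i], (int) pinpoints[i + 1]);
        }
    }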
@@ -2992,8 +3001,8 @@ CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx) {
     }
 
     // If we set explicit vision feature layers, only go up to the deepest one
-    for (int i = 0; i < 4; i++) {
-        if (ctx->vision_model.hparams.vision_feature_layer[i] > deepest_feature_layer) {
+    for (int i = 0; i < 4; i++) {
+        if (ctx->vision_model.hparams.vision_feature_layer[i] > deepest_feature_layer) {
            deepest_feature_layer = ctx->vision_model.hparams.vision_feature_layer[i];
        }
    }
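The -1 sentinel is what keeps the comparison above safe: unset entries can never beat a real layer index. A condensed, standalone sketch of the same selection (the projector-specific default handling from the full function is omitted, and the example values are hypothetical):

    #include <algorithm>
    #include <cstdint>

    #define MAX_IMAGE_FEATURE_LAYERS 4

    // Raise the model's default output layer to the deepest explicitly
    // requested vision feature layer, if any entry is set.
    static int deepest_layer(int default_layer, const int32_t (&fl)[MAX_IMAGE_FEATURE_LAYERS]) {
        int deepest = default_layer;
        for (int i = 0; i < MAX_IMAGE_FEATURE_LAYERS; i++) {
            deepest = std::max(deepest, (int) fl[i]); // unset entries are -1 and never win
        }
        return deepest;
    }

    // e.g. deepest_layer(25, {3, 7, 15, 26}) == 26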