@@ -171,11 +171,6 @@ static std::string format(const char * fmt, ...) {
171171#define TN_GLM_BOI_W " adapter.boi"
172172#define TN_GLM_EOI_W " adapter.eoi"
173173
174- // Maximum number of flattened image grid pinpoints (i.e., double
175- // the max number of ordered pairs) to be used for anyres
176- #define MAX_IMAGE_GRID_PINPOINTS 64
177- // Maximum number of encoder layers to be concatenated to form the features
178- #define MAX_IMAGE_FEATURE_LAYERS 4
179174
180175enum projector_type {
181176 PROJECTOR_TYPE_MLP,
@@ -450,9 +445,9 @@ struct clip_hparams {
450445
451446 char mm_patch_merge_type[32 ] = " flat" ; // spatial_unpad or flat (default)
452447
453- int32_t image_grid_pinpoints[MAX_IMAGE_GRID_PINPOINTS] ;
448+ std::vector< int32_t > image_grid_pinpoints;
454449 int32_t image_crop_resolution;
455- int32_t vision_feature_layer[MAX_IMAGE_FEATURE_LAYERS] ;
450+ std::vector< int32_t > vision_feature_layer;
456451};
457452
458453struct clip_layer {
@@ -770,7 +765,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
770765
771766 // If this is an embedding feature layer, save the output.
772767 // NOTE: 0 index here refers to the input to the encoder.
773- for (int vl_idx = 0 ; vl_idx < MAX_IMAGE_FEATURE_LAYERS && ( hparams.vision_feature_layer [vl_idx] > 0 ); vl_idx++) {
768+ for (size_t vl_idx = 0 ; vl_idx < hparams.vision_feature_layer . size ( ); vl_idx++) {
774769 if (il == ctx->vision_model .hparams .vision_feature_layer [vl_idx]) {
775770 embedding_stack.push_back (embeddings);
776771 break ;
@@ -875,18 +870,18 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
875870 }
876871
877872 // final layer is a vision feature layer
878- for (int vl_idx = 0 ; vl_idx < MAX_IMAGE_FEATURE_LAYERS && ( hparams.vision_feature_layer [vl_idx] > 0 ); vl_idx++) {
873+ for (size_t vl_idx = 0 ; vl_idx < hparams.vision_feature_layer . size ( ); vl_idx++) {
879874 if (n_layer == ctx->vision_model .hparams .vision_feature_layer [vl_idx]) {
880875 embedding_stack.push_back (embeddings);
881876 break ;
882877 }
883878 }
884879
885880 // If feature layers are explicitly set, stack them (if we have multiple)
886- if (embedding_stack.size () > 0 ) {
887- embeddings = embedding_stack. at ( 0 ) ;
881+ if (! embedding_stack.empty () ) {
882+ embeddings = embedding_stack[ 0 ] ;
888883 for (size_t i = 1 ; i < embedding_stack.size (); i++) {
889- embeddings = ggml_concat (ctx0, embeddings, embedding_stack. at (i) , 0 );
884+ embeddings = ggml_concat (ctx0, embeddings, embedding_stack[i] , 0 );
890885 }
891886 }
892887
@@ -1476,14 +1471,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
14761471 int idx = get_key_idx (ctx, KEY_IMAGE_GRID_PINPOINTS);
14771472 int n = gguf_get_arr_n (ctx, idx);
14781473 const int32_t * pinpoints = (const int32_t *)gguf_get_arr_data (ctx, idx);
1479- for (int i = 0 ; i < MAX_IMAGE_GRID_PINPOINTS && i < n && pinpoints[i] != 0 ; ++i) {
1480- hparams.image_grid_pinpoints [i] = pinpoints[i];
1474+ for (int i = 0 ; i < n ; ++i) {
1475+ hparams.image_grid_pinpoints . push_back ( pinpoints[i]) ;
14811476 }
1482- if (n < MAX_IMAGE_GRID_PINPOINTS)
1483- hparams.image_grid_pinpoints [n] = 0 ;
1484- } catch (std::runtime_error & /* e*/ ) {
1485- hparams.image_grid_pinpoints [0 ]=0 ;
1486- }
1477+ } catch (std::runtime_error & /* e*/ ) { }
14871478
14881479 // Load the vision feature layer indices if they are explicitly provided;
14891480 // if multiple vision feature layers are present, the values will be concatenated
@@ -1496,14 +1487,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
14961487
14971488 const int32_t * vision_feature_layer = (const int32_t *)gguf_get_arr_data (ctx, idx);
14981489
1499- for (int i = 0 ; i < MAX_IMAGE_FEATURE_LAYERS && i < n && vision_feature_layer[i] >= 0 ; ++i) {
1500- hparams.vision_feature_layer [i] = vision_feature_layer[i];
1490+ for (int i = 0 ; i < n ; ++i) {
1491+ hparams.vision_feature_layer . push_back ( vision_feature_layer[i]) ;
15011492 }
1502- if (n < MAX_IMAGE_FEATURE_LAYERS)
1503- hparams.vision_feature_layer [n] = -1 ;
1504- } catch (std::runtime_error & /* e*/ ) {
1505- hparams.vision_feature_layer [0 ] = -1 ;
1506- }
1493+ } catch (std::runtime_error & /* e*/ ) { }
15071494
15081495 try {
15091496 int idx = get_key_idx (ctx, KEY_MM_PATCH_MERGE_TYPE);
@@ -1542,12 +1529,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
15421529 LOG_INF (" v_image_mean %f %f %f\n " , new_clip->image_mean [0 ], new_clip->image_mean [1 ], new_clip->image_mean [2 ]);
15431530 LOG_INF (" v_image_std %f %f %f\n " , new_clip->image_std [0 ], new_clip->image_std [1 ], new_clip->image_std [2 ]);
15441531 LOG_INF (" v_image_grid_pinpoints: " );
1545- for (int i = 0 ; i < MAX_IMAGE_GRID_PINPOINTS && ( hparams.image_grid_pinpoints [i] != 0 ); ++i) {
1532+ for (size_t i = 0 ; i < hparams.image_grid_pinpoints . size ( ); ++i) {
15461533 LOG_INF (" %d " , hparams.image_grid_pinpoints [i]);
15471534 }
15481535 LOG_INF (" \n " );
15491536 LOG_INF (" v_vision_feature_layer: " );
1550- for (int i = 0 ; i < MAX_IMAGE_FEATURE_LAYERS && ( hparams.vision_feature_layer [i] > 0 ); i++) {
1537+ for (size_t i = 0 ; i < hparams.vision_feature_layer . size ( ); i++) {
15511538 LOG_INF (" %d " , hparams.vision_feature_layer [i]);
15521539 }
15531540 LOG_INF (" \n " );
@@ -2293,10 +2280,10 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
22932280 }
22942281 }
22952282 } else {
2296- if (params.image_grid_pinpoints [ 0 ] != 0 ) {
2283+ if (! params.image_grid_pinpoints . empty () ) {
22972284 // "spatial_unpad" with "anyres" processing for llava-1.6
22982285 std::vector<std::pair<int , int >> possible_resolutions;
// Builds the list of candidate resolutions by walking the pinpoint list two
// entries at a time: element [i] and element [i+1] are pushed as one pair.
// NOTE(review): the added loop bounds only `i` against size(), yet the body
// reads [i+1]. If the list ever holds an odd number of entries, the final
// iteration indexes one past the end of the vector. The removed loop iterated
// a fixed-size array bounded by MAX_IMAGE_GRID_PINPOINTS instead. Confirm the
// loader guarantees an even count, or bound the loop on `i + 1`.
2299- for (int i = 0 ; i < MAX_IMAGE_GRID_PINPOINTS && params.image_grid_pinpoints [i] != 0 ; i+=2 ) {
2286+ for (size_t i = 0 ; i < params.image_grid_pinpoints . size () ; i+=2 ) {
23002287 possible_resolutions.push_back ({params.image_grid_pinpoints [i], params.image_grid_pinpoints [i+1 ]});
23012288 }
23022289 std::pair<int , int > best_resolution = select_best_resolution ({img->nx , img->ny }, possible_resolutions);
@@ -2462,7 +2449,14 @@ const char * clip_patch_merge_type(const struct clip_ctx * ctx) {
24622449}
24632450
// Returns a pointer to the first entry of the context's image_grid_pinpoints
// list, or nullptr when that list has size zero. The number of entries behind
// the pointer is reported by get_clip_image_grid_size().
// NOTE(review): the removed line returned the pinpoints member directly and so
// had no nullptr path; callers of the added version presumably need a null
// check before dereferencing - verify against call sites.
24642451const int32_t * clip_image_grid (const struct clip_ctx * ctx) {
2465- return ctx->vision_model .hparams .image_grid_pinpoints ;
2452+ if (ctx->vision_model .hparams .image_grid_pinpoints .size ()) {
2453+ return &ctx->vision_model .hparams .image_grid_pinpoints .front ();
2454+ }
2455+ return nullptr ;
2456+ }
2457+
// Returns the number of entries currently held in the context's
// image_grid_pinpoints list (0 when the list is empty). This is the count of
// individual values, not of pairs: the preprocessing code consumes the list
// two entries at a time.
2458+ size_t get_clip_image_grid_size (const struct clip_ctx * ctx) {
2459+ return ctx->vision_model .hparams .image_grid_pinpoints .size ();
24662460}
24672461
24682462int clip_n_patches (const struct clip_ctx * ctx) {
@@ -2983,10 +2977,6 @@ bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
29832977 return ctx->has_qwen2vl_merger ;
29842978}
29852979
2986- size_t get_max_image_grid_pinpoints () {
2987- return MAX_IMAGE_GRID_PINPOINTS;
2988- }
2989-
29902980// Determine the number of encoder layers to iterate over
29912981int get_deepest_feature_layer (const struct clip_ctx * ctx) {
29922982 // Get the index of the second to last layer; this is the
@@ -3002,7 +2992,7 @@ int get_deepest_feature_layer(const struct clip_ctx * ctx) {
30022992 }
30032993
30042994 // If we set explicit vision feature layers, only go up to the deepest one
3005- for (int i = 0 ; i < MAX_IMAGE_FEATURE_LAYERS && ( hparams.vision_feature_layer [i] > 0 ); i++) {
2995+ for (size_t i = 0 ; i < hparams.vision_feature_layer . size ( ); i++) {
30062996 if (hparams.vision_feature_layer [i] > deepest_feature_layer) {
30072997 deepest_feature_layer = hparams.vision_feature_layer [i];
30082998 }
0 commit comments