@@ -120,7 +120,7 @@ static std::string format(const char * fmt, ...) {
 #define KEY_IMAGE_MEAN "clip.vision.image_mean"
 #define KEY_IMAGE_STD "clip.vision.image_std"
 #define KEY_PROJ_TYPE "clip.projector_type"
-#define KEY_VISION_FEATURE_LAYER "clip.vision.feature_layer"
+#define KEY_FEATURE_LAYER "clip.vision.feature_layer"
 
 #define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
 #define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
@@ -171,6 +171,11 @@ static std::string format(const char * fmt, ...) {
 #define TN_GLM_BOI_W "adapter.boi"
 #define TN_GLM_EOI_W "adapter.eoi"
 
+// Maximum number of flattened image grid pinpoints (i.e., double
+// the max number of ordered pairs) to be used for anyres
+#define MAX_IMAGE_GRID_PINPOINTS 64
+// Maximum number of encoder layers to be concatenated to form the features
+#define MAX_IMAGE_FEATURE_LAYERS 4
 
 enum projector_type {
     PROJECTOR_TYPE_MLP,
@@ -445,9 +450,9 @@ struct clip_hparams {
 
     char mm_patch_merge_type[32] = "flat"; // spatial_unpad or flat (default)
 
-    int32_t image_grid_pinpoints[64];
+    int32_t image_grid_pinpoints[MAX_IMAGE_GRID_PINPOINTS];
     int32_t image_crop_resolution;
-    int32_t vision_feature_layer[4];
+    int32_t vision_feature_layer[MAX_IMAGE_FEATURE_LAYERS];
 };
 
 struct clip_layer {
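Both arrays above rely on in-band terminators rather than a stored count: image_grid_pinpoints holds flattened (width, height) pairs ended by a 0, while vision_feature_layer is ended by a -1, since 0 is a meaningful value there (the encoder input). A minimal sketch of walking the arrays under those conventions; the helper names are hypothetical and not part of this patch:

    #include <cstddef>
    #include <cstdint>

    // Mirror the constants introduced above.
    #define MAX_IMAGE_GRID_PINPOINTS 64
    #define MAX_IMAGE_FEATURE_LAYERS 4

    // Hypothetical helpers; clip.cpp itself loops inline at each use site.
    static size_t count_grid_pinpoints(const int32_t (&pp)[MAX_IMAGE_GRID_PINPOINTS]) {
        size_t n = 0;
        while (n < MAX_IMAGE_GRID_PINPOINTS && pp[n] != 0) {
            n++; // flattened (width, height) values; 0 terminates the list
        }
        return n; // number of valid int32 values, i.e. 2x the number of pairs
    }

    static size_t count_feature_layers(const int32_t (&fl)[MAX_IMAGE_FEATURE_LAYERS]) {
        size_t n = 0;
        while (n < MAX_IMAGE_FEATURE_LAYERS && fl[n] != -1) {
            n++; // -1 marks unset entries; layer index 0 (encoder input) is valid
        }
        return n;
    }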
@@ -755,7 +760,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     }
 
     std::vector<struct ggml_tensor *> embedding_stack;
-    // Check to see we have 1+ set vision feature layers set; otherwise it's determined
+    // Check to see if we have 1+ set vision feature layers set; otherwise it's determined
     // by the type of projector that this model has (usually last or second to last layer).
     int max_feature_layer = get_deepest_feature_layer(ctx);
 
@@ -765,7 +770,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
 
         // If this is an embedding feature layer, save the output.
         // NOTE: 0 index here refers to the input to the encoder.
-        for (int vf_layer_idx = 0; vf_layer_idx < 4; vf_layer_idx++) {
+        for (int vf_layer_idx = 0; vf_layer_idx < MAX_IMAGE_FEATURE_LAYERS; vf_layer_idx++) {
            if (il == ctx->vision_model.hparams.vision_feature_layer[vf_layer_idx]) {
                embedding_stack.push_back(embeddings);
                break;
@@ -870,17 +875,17 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     }
 
     // final layer is a vision feature layer
-    for (int vf_layer_idx = 0; vf_layer_idx < 4; vf_layer_idx++) {
+    for (int vf_layer_idx = 0; vf_layer_idx < MAX_IMAGE_FEATURE_LAYERS; vf_layer_idx++) {
         if (n_layer == ctx->vision_model.hparams.vision_feature_layer[vf_layer_idx]) {
             embedding_stack.push_back(embeddings);
             break;
         }
     }
 
     // If feature layers are explicitly set, stack them (if we have multiple)
-    if (embedding_stack.size() > 0) {
+    if (embedding_stack.size() > 0) {
         embeddings = embedding_stack.at(0);
-        for (unsigned long i = 1; i < embedding_stack.size(); i++) {
+        for (unsigned long i = 1; i < embedding_stack.size(); i++) {
             embeddings = ggml_concat(ctx0, embeddings, embedding_stack.at(i), 0);
         }
     }
@@ -1471,10 +1476,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            int idx = get_key_idx(ctx, KEY_IMAGE_GRID_PINPOINTS);
            int n = gguf_get_arr_n(ctx, idx);
            const int32_t * pinpoints = (const int32_t *)gguf_get_arr_data(ctx, idx);
-            for (int i = 0; i < 64 && i < n && pinpoints[i] != 0; ++i) {
+            for (int i = 0; i < MAX_IMAGE_GRID_PINPOINTS && i < n && pinpoints[i] != 0; ++i) {
                hparams.image_grid_pinpoints[i] = pinpoints[i];
            }
-            if (n < 64)
+            if (n < MAX_IMAGE_GRID_PINPOINTS)
                hparams.image_grid_pinpoints[n] = 0;
        } catch (std::runtime_error & /*e*/) {
            hparams.image_grid_pinpoints[0]=0;
@@ -1486,15 +1491,15 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        // NOTE: gguf conversions should standardize the values of the vision feature layer to uints,
        // since we use -1 as an unset value here.
        try {
-            int idx = get_key_idx(ctx, KEY_VISION_FEATURE_LAYER);
+            int idx = get_key_idx(ctx, KEY_FEATURE_LAYER);
            int n = gguf_get_arr_n(ctx, idx);
 
            const int32_t * vision_feature_layer = (const int32_t *)gguf_get_arr_data(ctx, idx);
 
-            for (int i = 0; i < 4 && i < n && vision_feature_layer[i] != 0; ++i) {
+            for (int i = 0; i < MAX_IMAGE_FEATURE_LAYERS && i < n && vision_feature_layer[i] != 0; ++i) {
                hparams.vision_feature_layer[i] = vision_feature_layer[i];
            }
-            if (n < 4)
+            if (n < MAX_IMAGE_FEATURE_LAYERS)
                hparams.vision_feature_layer[n] = -1;
        } catch (std::runtime_error & /*e*/) {
            hparams.vision_feature_layer[0] = -1;
@@ -1537,12 +1542,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        LOG_INF("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
        LOG_INF("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
        LOG_INF("v_image_grid_pinpoints: ");
-        for (int i = 0; i < 64 && (hparams.image_grid_pinpoints[i] != 0); ++i) {
+        for (int i = 0; i < MAX_IMAGE_GRID_PINPOINTS && (hparams.image_grid_pinpoints[i] != 0); ++i) {
            LOG_INF("%d ", hparams.image_grid_pinpoints[i]);
        }
        LOG_INF("\n");
        LOG_INF("v_vision_feature_layer: ");
-        for (int i = 0; i < 4 && (hparams.vision_feature_layer[i] > 0); i++) {
+        for (int i = 0; i < MAX_IMAGE_FEATURE_LAYERS && (hparams.vision_feature_layer[i] > 0); i++) {
            LOG_INF("%d ", hparams.vision_feature_layer[i]);
        }
        LOG_INF("\n");
@@ -2291,7 +2296,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
    if (params.image_grid_pinpoints[0] != 0) {
        // "spatial_unpad" with "anyres" processing for llava-1.6
        std::vector<std::pair<int, int>> possible_resolutions;
-        for (int i = 0; i < 64 && params.image_grid_pinpoints[i] != 0; i+=2) {
+        for (int i = 0; i < MAX_IMAGE_GRID_PINPOINTS && params.image_grid_pinpoints[i] != 0; i+=2) {
            possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
        }
        std::pair<int, int> best_resolution = select_best_resolution({img->nx, img->ny}, possible_resolutions);
@@ -2978,6 +2983,10 @@ bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
     return ctx->has_qwen2vl_merger;
 }
 
+size_t get_max_image_grid_pinpoints() {
+    return MAX_IMAGE_GRID_PINPOINTS;
+}
+
 // Determine the number of encoder layers to iterate over
 CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx) {
     // Get the index of the second to last layer; this is the
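Because MAX_IMAGE_GRID_PINPOINTS stays private to clip.cpp, the new accessor gives callers a loop bound for the flattened pinpoint list. A short sketch of a hypothetical caller, assuming the accessor is also declared in clip.h (not shown in this diff):

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    size_t get_max_image_grid_pinpoints(); // added above; assumed exported via clip.h

    // Hypothetical: print the candidate resolutions encoded in a
    // 0-terminated, flattened (width, height) pinpoint array.
    static void print_grid_pinpoints(const int32_t * pinpoints) {
        const size_t max_pp = get_max_image_grid_pinpoints();
        for (size_t i = 0; i + 1 < max_pp && pinpoints[i] != 0; i += 2) {
            printf("%d x %d\n", (int) pinpoints[i], (int) pinpoints[i + 1]);
        }
    }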
@@ -2992,8 +3001,8 @@ CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx) {
     }
 
     // If we set explicit vision feature layers, only go up to the deepest one
-    for (int i = 0; i < 4; i++) {
-        if (ctx->vision_model.hparams.vision_feature_layer[i] > deepest_feature_layer) {
+    for (int i = 0; i < 4; i++) {
+        if (ctx->vision_model.hparams.vision_feature_layer[i] > deepest_feature_layer) {
            deepest_feature_layer = ctx->vision_model.hparams.vision_feature_layer[i];
        }
    }
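The -1 sentinel is what keeps the comparison above safe: unset entries can never beat a real layer index. A condensed, standalone sketch of the same selection (the projector-specific default handling from the full function is omitted, and the example values are hypothetical):

    #include <algorithm>
    #include <cstdint>

    #define MAX_IMAGE_FEATURE_LAYERS 4

    // Raise the model's default output layer to the deepest explicitly
    // requested vision feature layer, if any entry is set.
    static int deepest_layer(int default_layer, const int32_t (&fl)[MAX_IMAGE_FEATURE_LAYERS]) {
        int deepest = default_layer;
        for (int i = 0; i < MAX_IMAGE_FEATURE_LAYERS; i++) {
            deepest = std::max(deepest, (int) fl[i]); // unset entries are -1 and never win
        }
        return deepest;
    }

    // e.g. deepest_layer(25, {3, 7, 15, 26}) == 26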