Skip to content

Commit b1be52a

Browse files
Use constant for max gridpoints / feat layers, style fixes
Signed-off-by: Alex-Brooks <[email protected]>
1 parent 48a941d commit b1be52a

File tree

3 files changed

+29
-19
lines changed

3 files changed

+29
-19
lines changed

examples/llava/clip.cpp

Lines changed: 27 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ static std::string format(const char * fmt, ...) {
120120
#define KEY_IMAGE_MEAN "clip.vision.image_mean"
121121
#define KEY_IMAGE_STD "clip.vision.image_std"
122122
#define KEY_PROJ_TYPE "clip.projector_type"
123-
#define KEY_VISION_FEATURE_LAYER "clip.vision.feature_layer"
123+
#define KEY_FEATURE_LAYER "clip.vision.feature_layer"
124124

125125
#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
126126
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
@@ -171,6 +171,11 @@ static std::string format(const char * fmt, ...) {
171171
#define TN_GLM_BOI_W "adapter.boi"
172172
#define TN_GLM_EOI_W "adapter.eoi"
173173

174+
// Maximum number of flattened image grid pinpoint values (i.e., twice
175+
// the maximum number of (width, height) pairs) to be used for anyres
176+
#define MAX_IMAGE_GRID_PINPOINTS 64
177+
// Maximum number of encoder layers to be concatenated to form the features
178+
#define MAX_IMAGE_FEATURE_LAYERS 4
174179

175180
enum projector_type {
176181
PROJECTOR_TYPE_MLP,
@@ -445,9 +450,9 @@ struct clip_hparams {
445450

446451
char mm_patch_merge_type[32] = "flat"; // spatial_unpad or flat (default)
447452

448-
int32_t image_grid_pinpoints[64];
453+
int32_t image_grid_pinpoints[MAX_IMAGE_GRID_PINPOINTS];
449454
int32_t image_crop_resolution;
450-
int32_t vision_feature_layer[4];
455+
int32_t vision_feature_layer[MAX_IMAGE_FEATURE_LAYERS];
451456
};
452457

453458
struct clip_layer {
@@ -755,7 +760,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
755760
}
756761

757762
std::vector<struct ggml_tensor *> embedding_stack;
758-
// Check to see we have 1+ set vision feature layers set; otherwise it's determined
763+
// Check to see if we have 1+ vision feature layers set; otherwise it's determined
759764
// by the type of projector that this model has (usually last or second to last layer).
760765
int max_feature_layer = get_deepest_feature_layer(ctx);
761766

@@ -765,7 +770,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
765770

766771
// If this is an embedding feature layer, save the output.
767772
// NOTE: 0 index here refers to the input to the encoder.
768-
for(int vf_layer_idx = 0; vf_layer_idx < 4; vf_layer_idx++) {
773+
for (int vf_layer_idx = 0; vf_layer_idx < MAX_IMAGE_FEATURE_LAYERS; vf_layer_idx++) {
769774
if (il == ctx->vision_model.hparams.vision_feature_layer[vf_layer_idx]) {
770775
embedding_stack.push_back(embeddings);
771776
break;
@@ -870,17 +875,17 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
870875
}
871876

872877
// final layer is a vision feature layer
873-
for(int vf_layer_idx = 0; vf_layer_idx < 4; vf_layer_idx++) {
878+
for (int vf_layer_idx = 0; vf_layer_idx < MAX_IMAGE_FEATURE_LAYERS; vf_layer_idx++) {
874879
if (n_layer == ctx->vision_model.hparams.vision_feature_layer[vf_layer_idx]) {
875880
embedding_stack.push_back(embeddings);
876881
break;
877882
}
878883
}
879884

880885
// If feature layers are explicitly set, stack them (if we have multiple)
881-
if(embedding_stack.size() > 0) {
886+
if (embedding_stack.size() > 0) {
882887
embeddings = embedding_stack.at(0);
883-
for(unsigned long i=1; i < embedding_stack.size(); i++) {
888+
for (unsigned long i=1; i < embedding_stack.size(); i++) {
884889
embeddings = ggml_concat(ctx0, embeddings, embedding_stack.at(i), 0);
885890
}
886891
}
@@ -1471,10 +1476,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
14711476
int idx = get_key_idx(ctx, KEY_IMAGE_GRID_PINPOINTS);
14721477
int n = gguf_get_arr_n(ctx, idx);
14731478
const int32_t * pinpoints = (const int32_t *)gguf_get_arr_data(ctx, idx);
1474-
for (int i = 0; i < 64 && i < n && pinpoints[i] != 0; ++i) {
1479+
for (int i = 0; i < MAX_IMAGE_GRID_PINPOINTS && i < n && pinpoints[i] != 0; ++i) {
14751480
hparams.image_grid_pinpoints[i] = pinpoints[i];
14761481
}
1477-
if (n < 64)
1482+
if (n < MAX_IMAGE_GRID_PINPOINTS)
14781483
hparams.image_grid_pinpoints[n] = 0;
14791484
} catch (std::runtime_error & /*e*/) {
14801485
hparams.image_grid_pinpoints[0]=0;
@@ -1486,15 +1491,15 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
14861491
// NOTE: gguf conversions should standardize the values of the vision feature layer to uints,
14871492
// since we use -1 as an unset value here.
14881493
try {
1489-
int idx = get_key_idx(ctx, KEY_VISION_FEATURE_LAYER);
1494+
int idx = get_key_idx(ctx, KEY_FEATURE_LAYER);
14901495
int n = gguf_get_arr_n(ctx, idx);
14911496

14921497
const int32_t * vision_feature_layer = (const int32_t *)gguf_get_arr_data(ctx, idx);
14931498

1494-
for (int i = 0; i < 4 && i < n && vision_feature_layer[i] != 0; ++i) {
1499+
for (int i = 0; i < MAX_IMAGE_FEATURE_LAYERS && i < n && vision_feature_layer[i] != 0; ++i) {
14951500
hparams.vision_feature_layer[i] = vision_feature_layer[i];
14961501
}
1497-
if (n < 4)
1502+
if (n < MAX_IMAGE_FEATURE_LAYERS)
14981503
hparams.vision_feature_layer[n] = -1;
14991504
} catch (std::runtime_error & /*e*/) {
15001505
hparams.vision_feature_layer[0] = -1;
@@ -1537,12 +1542,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
15371542
LOG_INF("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
15381543
LOG_INF("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
15391544
LOG_INF("v_image_grid_pinpoints: ");
1540-
for (int i = 0; i < 64 && (hparams.image_grid_pinpoints[i] != 0); ++i) {
1545+
for (int i = 0; i < MAX_IMAGE_GRID_PINPOINTS && (hparams.image_grid_pinpoints[i] != 0); ++i) {
15411546
LOG_INF("%d ", hparams.image_grid_pinpoints[i]);
15421547
}
15431548
LOG_INF("\n");
15441549
LOG_INF("v_vision_feature_layer: ");
1545-
for(int i = 0; i < 4 && (hparams.vision_feature_layer[i] > 0); i++) {
1550+
for (int i = 0; i < MAX_IMAGE_FEATURE_LAYERS && (hparams.vision_feature_layer[i] > 0); i++) {
15461551
LOG_INF("%d ", hparams.vision_feature_layer[i]);
15471552
}
15481553
LOG_INF("\n");
@@ -2291,7 +2296,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
22912296
if (params.image_grid_pinpoints[0] != 0) {
22922297
// "spatial_unpad" with "anyres" processing for llava-1.6
22932298
std::vector<std::pair<int, int>> possible_resolutions;
2294-
for (int i = 0; i < 64 && params.image_grid_pinpoints[i] != 0; i+=2) {
2299+
for (int i = 0; i < MAX_IMAGE_GRID_PINPOINTS && params.image_grid_pinpoints[i] != 0; i+=2) {
22952300
possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
22962301
}
22972302
std::pair<int, int> best_resolution = select_best_resolution({img->nx, img->ny}, possible_resolutions);
@@ -2978,6 +2983,10 @@ bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
29782983
return ctx->has_qwen2vl_merger;
29792984
}
29802985

2986+
size_t get_max_image_grid_pinpoints() {
2987+
return MAX_IMAGE_GRID_PINPOINTS;
2988+
}
2989+
29812990
// Determine the number of encoder layers to iterate over
29822991
CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx) {
29832992
// Get the index of the second to last layer; this is the
@@ -2992,8 +3001,8 @@ CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx) {
29923001
}
29933002

29943003
// If we set explicit vision feature layers, only go up to the deepest one
2995-
for(int i = 0; i < 4; i++) {
2996-
if(ctx->vision_model.hparams.vision_feature_layer[i] > deepest_feature_layer) {
3004+
for (int i = 0; i < 4; i++) {
3005+
if (ctx->vision_model.hparams.vision_feature_layer[i] > deepest_feature_layer) {
29973006
deepest_feature_layer = ctx->vision_model.hparams.vision_feature_layer[i];
29983007
}
29993008
}

examples/llava/clip.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
9292
CLIP_API bool clip_is_glm(const struct clip_ctx * ctx);
9393
CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);
9494

95+
CLIP_API size_t get_max_image_grid_pinpoints();
9596
CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx);
9697

9798
CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);

examples/llava/llava.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -355,7 +355,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
355355
const int32_t * image_grid = clip_image_grid(ctx_clip);
356356

357357
std::vector<std::pair<int, int>> grid_pinpoints;
358-
for (int i = 0; i < 64 && image_grid[i] != 0; i += 2) {
358+
for (size_t i = 0; i < get_max_image_grid_pinpoints() && image_grid[i] != 0; i += 2) {
359359
grid_pinpoints.push_back({image_grid[i], image_grid[i+1]});
360360
}
361361

0 commit comments

Comments
 (0)