Skip to content

Commit 11149b6

Browse files
Use std::vector for grid points and feature layers
Signed-off-by: Alex-Brooks <[email protected]>
1 parent 16a95d6 commit 11149b6

File tree

3 files changed

+29
-38
lines changed

3 files changed

+29
-38
lines changed

examples/llava/clip.cpp

Lines changed: 26 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -171,11 +171,6 @@ static std::string format(const char * fmt, ...) {
171171
#define TN_GLM_BOI_W "adapter.boi"
172172
#define TN_GLM_EOI_W "adapter.eoi"
173173

174-
// Maximum number of flattened image grid pinpoints (i.e., double
175-
// the max number of ordered pairs) to be used for anyres
176-
#define MAX_IMAGE_GRID_PINPOINTS 64
177-
// Maximum number of encoder layers to be concatenated to form the features
178-
#define MAX_IMAGE_FEATURE_LAYERS 4
179174

180175
enum projector_type {
181176
PROJECTOR_TYPE_MLP,
@@ -450,9 +445,9 @@ struct clip_hparams {
450445

451446
char mm_patch_merge_type[32] = "flat"; // spatial_unpad or flat (default)
452447

453-
int32_t image_grid_pinpoints[MAX_IMAGE_GRID_PINPOINTS];
448+
std::vector<int32_t> image_grid_pinpoints;
454449
int32_t image_crop_resolution;
455-
int32_t vision_feature_layer[MAX_IMAGE_FEATURE_LAYERS];
450+
std::vector<int32_t> vision_feature_layer;
456451
};
457452

458453
struct clip_layer {
@@ -770,7 +765,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
770765

771766
// If this is an embedding feature layer, save the output.
772767
// NOTE: 0 index here refers to the input to the encoder.
773-
for (int vl_idx = 0; vl_idx < MAX_IMAGE_FEATURE_LAYERS && (hparams.vision_feature_layer[vl_idx] > 0); vl_idx++) {
768+
for (size_t vl_idx = 0; vl_idx < hparams.vision_feature_layer.size(); vl_idx++) {
774769
if (il == ctx->vision_model.hparams.vision_feature_layer[vl_idx]) {
775770
embedding_stack.push_back(embeddings);
776771
break;
@@ -875,18 +870,18 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
875870
}
876871

877872
// final layer is a vision feature layer
878-
for (int vl_idx = 0; vl_idx < MAX_IMAGE_FEATURE_LAYERS && (hparams.vision_feature_layer[vl_idx] > 0); vl_idx++) {
873+
for (size_t vl_idx = 0; vl_idx < hparams.vision_feature_layer.size(); vl_idx++) {
879874
if (n_layer == ctx->vision_model.hparams.vision_feature_layer[vl_idx]) {
880875
embedding_stack.push_back(embeddings);
881876
break;
882877
}
883878
}
884879

885880
// If feature layers are explicitly set, stack them (if we have multiple)
886-
if (embedding_stack.size() > 0) {
887-
embeddings = embedding_stack.at(0);
881+
if (!embedding_stack.empty()) {
882+
embeddings = embedding_stack[0];
888883
for (size_t i = 1; i < embedding_stack.size(); i++) {
889-
embeddings = ggml_concat(ctx0, embeddings, embedding_stack.at(i), 0);
884+
embeddings = ggml_concat(ctx0, embeddings, embedding_stack[i], 0);
890885
}
891886
}
892887

@@ -1476,14 +1471,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
14761471
int idx = get_key_idx(ctx, KEY_IMAGE_GRID_PINPOINTS);
14771472
int n = gguf_get_arr_n(ctx, idx);
14781473
const int32_t * pinpoints = (const int32_t *)gguf_get_arr_data(ctx, idx);
1479-
for (int i = 0; i < MAX_IMAGE_GRID_PINPOINTS && i < n && pinpoints[i] != 0; ++i) {
1480-
hparams.image_grid_pinpoints[i] = pinpoints[i];
1474+
for (int i = 0; i < n; ++i) {
1475+
hparams.image_grid_pinpoints.push_back(pinpoints[i]);
14811476
}
1482-
if (n < MAX_IMAGE_GRID_PINPOINTS)
1483-
hparams.image_grid_pinpoints[n] = 0;
1484-
} catch (std::runtime_error & /*e*/) {
1485-
hparams.image_grid_pinpoints[0]=0;
1486-
}
1477+
} catch (std::runtime_error & /*e*/) { }
14871478

14881479
// Load the vision feature layer indices if they are explicitly provided;
14891480
// if multiple vision feature layers are present, the values will be concatenated
@@ -1496,14 +1487,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
14961487

14971488
const int32_t * vision_feature_layer = (const int32_t *)gguf_get_arr_data(ctx, idx);
14981489

1499-
for (int i = 0; i < MAX_IMAGE_FEATURE_LAYERS && i < n && vision_feature_layer[i] >= 0; ++i) {
1500-
hparams.vision_feature_layer[i] = vision_feature_layer[i];
1490+
for (int i = 0; i < n; ++i) {
1491+
hparams.vision_feature_layer.push_back(vision_feature_layer[i]);
15011492
}
1502-
if (n < MAX_IMAGE_FEATURE_LAYERS)
1503-
hparams.vision_feature_layer[n] = -1;
1504-
} catch (std::runtime_error & /*e*/) {
1505-
hparams.vision_feature_layer[0] = -1;
1506-
}
1493+
} catch (std::runtime_error & /*e*/) { }
15071494

15081495
try {
15091496
int idx = get_key_idx(ctx, KEY_MM_PATCH_MERGE_TYPE);
@@ -1542,12 +1529,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
15421529
LOG_INF("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
15431530
LOG_INF("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
15441531
LOG_INF("v_image_grid_pinpoints: ");
1545-
for (int i = 0; i < MAX_IMAGE_GRID_PINPOINTS && (hparams.image_grid_pinpoints[i] != 0); ++i) {
1532+
for (size_t i = 0; i < hparams.image_grid_pinpoints.size(); ++i) {
15461533
LOG_INF("%d ", hparams.image_grid_pinpoints[i]);
15471534
}
15481535
LOG_INF("\n");
15491536
LOG_INF("v_vision_feature_layer: ");
1550-
for (int i = 0; i < MAX_IMAGE_FEATURE_LAYERS && (hparams.vision_feature_layer[i] > 0); i++) {
1537+
for (size_t i = 0; i < hparams.vision_feature_layer.size(); i++) {
15511538
LOG_INF("%d ", hparams.vision_feature_layer[i]);
15521539
}
15531540
LOG_INF("\n");
@@ -2293,10 +2280,10 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
22932280
}
22942281
}
22952282
} else {
2296-
if (params.image_grid_pinpoints[0] != 0) {
2283+
if (!params.image_grid_pinpoints.empty()) {
22972284
// "spatial_unpad" with "anyres" processing for llava-1.6
22982285
std::vector<std::pair<int, int>> possible_resolutions;
2299-
for (int i = 0; i < MAX_IMAGE_GRID_PINPOINTS && params.image_grid_pinpoints[i] != 0; i+=2) {
2286+
for (size_t i = 0; i < params.image_grid_pinpoints.size(); i+=2) {
23002287
possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
23012288
}
23022289
std::pair<int, int> best_resolution = select_best_resolution({img->nx, img->ny}, possible_resolutions);
@@ -2462,7 +2449,14 @@ const char * clip_patch_merge_type(const struct clip_ctx * ctx) {
24622449
}
24632450

24642451
const int32_t * clip_image_grid(const struct clip_ctx * ctx) {
2465-
return ctx->vision_model.hparams.image_grid_pinpoints;
2452+
if(ctx->vision_model.hparams.image_grid_pinpoints.size()) {
2453+
return &ctx->vision_model.hparams.image_grid_pinpoints.front();
2454+
}
2455+
return nullptr;
2456+
}
2457+
2458+
size_t get_clip_image_grid_size(const struct clip_ctx * ctx) {
2459+
return ctx->vision_model.hparams.image_grid_pinpoints.size();
24662460
}
24672461

24682462
int clip_n_patches(const struct clip_ctx * ctx) {
@@ -2983,10 +2977,6 @@ bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
29832977
return ctx->has_qwen2vl_merger;
29842978
}
29852979

2986-
size_t get_max_image_grid_pinpoints() {
2987-
return MAX_IMAGE_GRID_PINPOINTS;
2988-
}
2989-
29902980
// Determine the number of encoder layers to iterate over
29912981
int get_deepest_feature_layer(const struct clip_ctx * ctx) {
29922982
// Get the index of the second to last layer; this is the
@@ -3002,7 +2992,7 @@ int get_deepest_feature_layer(const struct clip_ctx * ctx) {
30022992
}
30032993

30042994
// If we set explicit vision feature layers, only go up to the deepest one
3005-
for (int i = 0; i < MAX_IMAGE_FEATURE_LAYERS && (hparams.vision_feature_layer[i] > 0); i++) {
2995+
for (size_t i = 0; i < hparams.vision_feature_layer.size(); i++) {
30062996
if (hparams.vision_feature_layer[i] > deepest_feature_layer) {
30072997
deepest_feature_layer = hparams.vision_feature_layer[i];
30082998
}

examples/llava/clip.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ CLIP_API int32_t clip_hidden_size(const struct clip_ctx * ctx);
5555
CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
5656

5757
CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
58+
CLIP_API size_t get_clip_image_grid_size(const struct clip_ctx * ctx);
5859

5960
CLIP_API int clip_n_patches (const struct clip_ctx * ctx);
6061
CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img);
@@ -92,7 +93,6 @@ CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
9293
CLIP_API bool clip_is_glm(const struct clip_ctx * ctx);
9394
CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);
9495

95-
CLIP_API size_t get_max_image_grid_pinpoints();
9696
CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx);
9797

9898
CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);

examples/llava/llava.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -353,9 +353,10 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
353353
LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
354354

355355
const int32_t * image_grid = clip_image_grid(ctx_clip);
356+
const size_t num_gridpoints = get_clip_image_grid_size(ctx_clip);
356357

357358
std::vector<std::pair<int, int>> grid_pinpoints;
358-
for (size_t i = 0; i < get_max_image_grid_pinpoints() && image_grid[i] != 0; i += 2) {
359+
for (size_t i = 0; i < num_gridpoints; i += 2) {
359360
grid_pinpoints.push_back({image_grid[i], image_grid[i+1]});
360361
}
361362

0 commit comments

Comments
 (0)