Skip to content

Commit eeea35a

Browse files
committed
add KEY_MM_PATCH_MERGE_TYPE
1 parent 6fe6846 commit eeea35a

File tree

1 file changed

+16
-5
lines changed

1 file changed

+16
-5
lines changed

examples/llava/clip.cpp

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,11 @@ static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u
172172
// clip layers
173173
//
174174

175+
enum patch_merge_type {
176+
PATCH_MERGE_FLAT,
177+
PATCH_MERGE_SPATIAL_UNPAD,
178+
};
179+
175180
struct clip_hparams {
176181
int32_t image_size;
177182
int32_t patch_size;
@@ -181,9 +186,9 @@ struct clip_hparams {
181186
int32_t n_head;
182187
int32_t n_layer;
183188

184-
float eps;
189+
patch_merge_type mm_patch_merge_type = PATCH_MERGE_FLAT;
185190

186-
char mm_patch_merge_type[32] = "flat"; // spatial_unpad or flat (default)
191+
float eps;
187192

188193
std::vector<int32_t> image_grid_pinpoints;
189194
int32_t image_crop_resolution;
@@ -1230,7 +1235,13 @@ struct clip_model_loader {
12301235
get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
12311236
get_arr_int(KEY_IMAGE_GRID_PINPOINTS, hparams.image_grid_pinpoints, false);
12321237

1233-
// TODO @ngxson : missing KEY_MM_PATCH_MERGE_TYPE
1238+
{
1239+
std::string mm_patch_merge_type;
1240+
get_string(KEY_MM_PATCH_MERGE_TYPE, mm_patch_merge_type, false);
1241+
if (mm_patch_merge_type == "spatial_unpad") {
1242+
hparams.mm_patch_merge_type = PATCH_MERGE_SPATIAL_UNPAD;
1243+
}
1244+
}
12341245

12351246
{
12361247
int idx_mean = gguf_find_key(ctx_gguf.get(), KEY_IMAGE_MEAN);
@@ -2115,7 +2126,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
21152126
}
21162127
auto & params = ctx->vision_model.hparams;
21172128
// The model config contains everything we need to decide how to preprocess; here we automatically switch to the new llava-1.6 preprocessing.
2118-
if (strcmp(params.mm_patch_merge_type, "spatial_unpad") == 0) {
2129+
if (params.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD) {
21192130
pad_to_square = false;
21202131
}
21212132
// free the previous res_imgs if any set
@@ -2311,7 +2322,7 @@ int32_t clip_hidden_size(const struct clip_ctx * ctx) {
23112322
}
23122323

23132324
const char * clip_patch_merge_type(const struct clip_ctx * ctx) {
2314-
return ctx->vision_model.hparams.mm_patch_merge_type;
2325+
return ctx->vision_model.hparams.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD ? "spatial_unpad" : "flat";
23152326
}
23162327

23172328
const int32_t * clip_image_grid(const struct clip_ctx * ctx) {

0 commit comments

Comments
 (0)