@@ -172,6 +172,11 @@ static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u
172172// clip layers
173173//
174174
175+ enum patch_merge_type { // patch merge mode; replaces the former char[32] string field in clip_hparams
176+ PATCH_MERGE_FLAT, // default value of clip_hparams::mm_patch_merge_type
177+ PATCH_MERGE_SPATIAL_UNPAD, // set by the loader when KEY_MM_PATCH_MERGE_TYPE reads as spatial_unpad; clip_image_preprocess then turns pad_to_square off
178+ };
179+
175180struct clip_hparams {
176181 int32_t image_size;
177182 int32_t patch_size;
@@ -181,9 +186,9 @@ struct clip_hparams {
181186 int32_t n_head;
182187 int32_t n_layer;
183188
184- float eps ;
189+ patch_merge_type mm_patch_merge_type = PATCH_MERGE_FLAT ;
185190
186- char mm_patch_merge_type[ 32 ] = " flat " ; // spatial_unpad or flat (default)
191+ float eps;
187192
188193 std::vector<int32_t > image_grid_pinpoints;
189194 int32_t image_crop_resolution;
@@ -1230,7 +1235,13 @@ struct clip_model_loader {
12301235 get_u32 (KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution , false );
12311236 get_arr_int (KEY_IMAGE_GRID_PINPOINTS, hparams.image_grid_pinpoints , false );
12321237
1233- // TODO @ngxson : missing KEY_MM_PATCH_MERGE_TYPE
1238+ { // scoped so the temporary string does not outlive the lookup
1239+ std::string mm_patch_merge_type;
1240+ get_string (KEY_MM_PATCH_MERGE_TYPE, mm_patch_merge_type, false ); // NOTE(review): trailing `false` presumably marks the key as optional - confirm against get_string
1241+ if (mm_patch_merge_type == " spatial_unpad" ) { // any other value leaves hparams.mm_patch_merge_type at its in-class default (PATCH_MERGE_FLAT)
1242+ hparams.mm_patch_merge_type = PATCH_MERGE_SPATIAL_UNPAD;
1243+ }
1244+ }
12341245
12351246 {
12361247 int idx_mean = gguf_find_key (ctx_gguf.get (), KEY_IMAGE_MEAN);
@@ -2115,7 +2126,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
21152126 }
21162127 auto & params = ctx->vision_model .hparams ;
21172128 // The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing
2118- if (strcmp ( params.mm_patch_merge_type , " spatial_unpad " ) == 0 ) {
2129+ if (params.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD ) {
21192130 pad_to_square = false ;
21202131 }
21212132 // free the previous res_imgs if any set
@@ -2311,7 +2322,7 @@ int32_t clip_hidden_size(const struct clip_ctx * ctx) {
23112322}
23122323
23132324const char * clip_patch_merge_type (const struct clip_ctx * ctx) { // reports the vision model's patch merge mode as a C string
2314- return ctx->vision_model .hparams .mm_patch_merge_type ; // removed: returned the char array field directly
2325+ return ctx->vision_model .hparams .mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD ? " spatial_unpad " : " flat " ; // maps the enum back to a string literal, so the const char * return type is unchanged
23152326}
23162327
23172328const int32_t * clip_image_grid (const struct clip_ctx * ctx) {
0 commit comments