@@ -1797,9 +1797,9 @@ struct image_manipulation {
17971797 }
17981798
17991799 // llava-1.6 type of resize_and_pad
1800- // if the ratio is not 1:1, padding with fill_color will be applied
1801- // fill_color is single channel, default is 0 (black)
1802- static void resize_and_pad_image (const clip_image_u8 & image, clip_image_u8 & dst, const clip_image_size & target_resolution, std::array<uint8_t , 3 > fill_color = {0 , 0 , 0 }) {
1800+ // if the ratio is not 1:1, padding with pad_color will be applied
1801+ // pad_color is an RGB triple, default is {0, 0, 0} (black)
1802+ static void resize_and_pad_image (const clip_image_u8 & image, clip_image_u8 & dst, const clip_image_size & target_resolution, std::array<uint8_t , 3 > pad_color = {0 , 0 , 0 }) {
18031803 int target_width = target_resolution.width ;
18041804 int target_height = target_resolution.height ;
18051805
@@ -1826,9 +1826,9 @@ struct image_manipulation {
18261826
18271827 // Fill the padded image with the pad color
18281828 for (size_t i = 0 ; i < padded_image.buf .size (); i += 3 ) {
1829- padded_image.buf [i] = fill_color [0 ];
1830- padded_image.buf [i + 1 ] = fill_color [1 ];
1831- padded_image.buf [i + 2 ] = fill_color [2 ];
1829+ padded_image.buf [i] = pad_color [0 ];
1830+ padded_image.buf [i + 1 ] = pad_color [1 ];
1831+ padded_image.buf [i + 2 ] = pad_color [2 ];
18321832 }
18331833
18341834 // Calculate padding offsets
@@ -2154,7 +2154,18 @@ int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) {
21542154// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
21552155// res_imgs memory is being allocated here, previous allocations will be freed if found
21562156bool clip_image_preprocess (struct clip_ctx * ctx, const clip_image_u8 * img, struct clip_image_f32_batch * res_imgs) {
2157+ if (!ctx->has_vision_encoder ) {
2158+ LOG_ERR (" %s: This gguf file seems to have no vision encoder\n " , __func__);
2159+ return false ;
2160+ }
2161+
21572162 clip_image_size original_size{img->nx , img->ny };
2163+ bool pad_to_square = true ;
2164+ auto & params = ctx->vision_model .hparams ;
2165+ // The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing
2166+ if (params.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD) {
2167+ pad_to_square = false ;
2168+ }
21582169
21592170 if (clip_is_minicpmv (ctx)) {
21602171 auto const inst = llava_uhd::get_slice_instructions (ctx, original_size);
@@ -2185,7 +2196,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
21852196
21862197 if (ctx->has_glm_projector || ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
21872198 clip_image_u8 resized_image;
2188- int32_t sz = ctx-> vision_model . hparams .image_size ;
2199+ int sz = params .image_size ;
21892200 image_manipulation::bicubic_resize (*img, resized_image, sz, sz);
21902201 clip_image_f32_ptr img_f32 (clip_image_f32_init ());
21912202 // clip_image_save_to_bmp(resized_image, "resized.bmp");
@@ -2194,137 +2205,47 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
21942205 return true ;
21952206 }
21962207
2197- bool pad_to_square = true ;
2198- if (!ctx->has_vision_encoder ) {
2199- LOG_ERR (" %s: This gguf file seems to have no vision encoder\n " , __func__);
2200- return false ;
2201- }
2202- auto & params = ctx->vision_model .hparams ;
2203- // The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing
2204- if (params.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD) {
2205- pad_to_square = false ;
2206- }
2207- // free the previous res_imgs if any set
2208- res_imgs->entries .clear ();
2209-
22102208 // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
22112209 // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
22122210
22132211 clip_image_u8_ptr temp (clip_image_u8_init ()); // we will keep the input image data here temporarily
2214- if (pad_to_square && img->nx != img->ny ) {
2215- int longer_side = std::max (img->nx , img->ny );
2212+
2213+ if (pad_to_square) {
2214+ // for llava-1.5, we resize image to a square, and pad the shorter side with a background color
2215+ // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
2216+ const int longer_side = std::max (img->nx , img->ny );
22162217 temp->nx = longer_side;
22172218 temp->ny = longer_side;
22182219 temp->buf .resize (3 * longer_side * longer_side);
2219- const uint8_t bc[3 ] = {122 , 116 , 104 }; // background color in RGB from LLaVA (this is the mean rgb color * 255)
2220-
2221- // fill with background color
2222- for (size_t i = 0 ; i < temp->buf .size (); i++) {
2223- temp->buf [i] = bc[i % 3 ];
2224- }
2225-
2226- // copy from the input image
2227- for (int y = 0 ; y < img->ny ; y++) {
2228- for (int x = 0 ; x < img->nx ; x++) {
2229- const int i = 3 * (y * img->nx + x);
2230- const int j = 3 * (y * temp->nx + x);
2231- temp->buf [j] = img->buf [i];
2232- temp->buf [j+1 ] = img->buf [i+1 ];
2233- temp->buf [j+2 ] = img->buf [i+2 ];
2234- }
2235- }
2236- } else {
2237- if (!params.image_grid_pinpoints .empty ()) {
2238- // "spatial_unpad" with "anyres" processing for llava-1.6
2239- auto const inst = llava_uhd::get_slice_instructions (ctx, original_size);
2240- std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image (img, inst);
2241-
2242- for (size_t i = 0 ; i < imgs.size (); ++i) {
2243- // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
2244- clip_image_f32_ptr res (clip_image_f32_init ());
2245- normalize_image_u8_to_f32 (*imgs[i], *res, ctx->image_mean , ctx->image_std );
2246- res_imgs->entries .push_back (std::move (res));
2247- }
2248-
2249- return true ;
2250- } else {
2251- temp->nx = img->nx ;
2252- temp->ny = img->ny ;
2253- temp->buf .resize (img->buf .size ());
2254- memcpy (temp->buf .data (), img->buf .data (), temp->buf .size ());
2255- }
2256- }
2257-
2258- const int nx = temp->nx ;
2259- const int ny = temp->ny ;
2260- // clip_image_save_to_bmp(*temp, "resized_vanilla.bmp");
2261-
2262- const int nx2 = ctx->vision_model .hparams .image_size ;
2263- const int ny2 = ctx->vision_model .hparams .image_size ;
2264- clip_image_f32_ptr res (clip_image_f32_init ());
2265- res->nx = nx2;
2266- res->ny = ny2;
2267- res->buf .resize (3 * nx2 * ny2);
2268-
2269- const float scale = std::max (nx, ny) / (float )ctx->vision_model .hparams .image_size ;
2270-
2271- const int nx3 = int (nx / scale + 0 .5f );
2272- const int ny3 = int (ny / scale + 0 .5f );
2273-
2274- const auto & m3 = ctx->image_mean ; // {0.48145466f, 0.4578275f, 0.40821073f};
2275- const auto & s3 = ctx->image_std ; // {0.26862954f, 0.26130258f, 0.27577711f};
2276-
2277- for (int y = 0 ; y < ny3; y++) {
2278- for (int x = 0 ; x < nx3; x++) {
2279- for (int c = 0 ; c < 3 ; c++) {
2280- // linear interpolation
2281- const float sx = (x + 0 .5f ) * scale - 0 .5f ;
2282- const float sy = (y + 0 .5f ) * scale - 0 .5f ;
2283-
2284- const int x0 = std::max (0 , (int )std::floor (sx));
2285- const int y0 = std::max (0 , (int )std::floor (sy));
2286-
2287- const int x1 = std::min (x0 + 1 , nx - 1 );
2288- const int y1 = std::min (y0 + 1 , ny - 1 );
2289-
2290- const float dx = sx - x0;
2291- const float dy = sy - y0;
2292-
2293- const int j00 = 3 * (y0 * nx + x0) + c;
2294- const int j01 = 3 * (y0 * nx + x1) + c;
2295- const int j10 = 3 * (y1 * nx + x0) + c;
2296- const int j11 = 3 * (y1 * nx + x1) + c;
22972220
2298- const float v00 = temp->buf [j00];
2299- const float v01 = temp->buf [j01];
2300- const float v10 = temp->buf [j10];
2301- const float v11 = temp->buf [j11];
2221+ // background color in RGB from LLaVA (this is the mean rgb color * 255)
2222+ const std::array<uint8_t , 3 > pad_color = {122 , 116 , 104 };
23022223
2303- const float v0 = v00 * ( 1 . 0f - dx) + v01 * dx;
2304- const float v1 = v10 * ( 1 . 0f - dx) + v11 * dx ;
2224+ // resize the image to the target_size
2225+ image_manipulation::resize_and_pad_image (*img, *temp, clip_image_size{params. image_size , params. image_size }, pad_color) ;
23052226
2306- const float v = v0 * (1 .0f - dy) + v1 * dy;
2307-
2308- const uint8_t v2 = std::min (std::max (std::round (v), 0 .0f ), 255 .0f );
2227+ clip_image_f32_ptr res (clip_image_f32_init ());
2228+ normalize_image_u8_to_f32 (*temp, *res, ctx->image_mean , ctx->image_std );
2229+ res_imgs->entries .push_back (std::move (res));
2230+ return true ;
23092231
2310- const int i = 3 * (y * nx3 + x) + c;
2232+ } else if (!params.image_grid_pinpoints .empty ()) {
2233+ // "spatial_unpad" with "anyres" processing for llava-1.6
2234+ auto const inst = llava_uhd::get_slice_instructions (ctx, original_size);
2235+ std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image (img, inst);
23112236
2312- res->buf [i] = ((float (v2) / 255 .0f ) - m3[c]) / s3[c];
2313- }
2237+ for (size_t i = 0 ; i < imgs.size (); ++i) {
2238+ // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
2239+ clip_image_f32_ptr res (clip_image_f32_init ());
2240+ normalize_image_u8_to_f32 (*imgs[i], *res, ctx->image_mean , ctx->image_std );
2241+ res_imgs->entries .push_back (std::move (res));
23142242 }
2315- }
23162243
2317- // {
2318- // clip_image_u8 * temp2 = clip_image_u8_init();
2319- // clip_image_convert_f32_to_u8(*res, *temp2);
2320- // clip_image_save_to_bmp(*temp2, "resized_normalized_f32_vanilla.bmp");
2321- // clip_image_u8_free(temp2);
2322- // }
2323- // res_imgs.push_back(res);
2244+ return true ;
23242245
2325- res_imgs-> entries . push_back ( std::move (res));
2246+ }
23262247
2327- return true ;
2248+ GGML_ASSERT ( false && " Unknown image preprocessing type " ) ;
23282249}
23292250
23302251ggml_tensor * clip_get_newline_tensor (const struct clip_ctx * ctx) {
0 commit comments