@@ -4114,7 +4114,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
41144114 clip_image_u8_ptr temp (clip_image_u8_init ()); // we will keep the input image data here temporarily
41154115
41164116 // The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing
4117- if (params.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD ) { // pad_to_square
4117+ if (params.image_res_candidates . empty () ) { // pad_to_square
41184118 // for llava-1.5, we resize image to a square, and pad the shorter side with a background color
41194119 // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
41204120 const int longer_side = std::max (img->nx , img->ny );
@@ -4131,9 +4131,8 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
41314131 clip_image_f32_ptr res (clip_image_f32_init ());
41324132 normalize_image_u8_to_f32 (*temp, *res, params.image_mean , params.image_std );
41334133 res_imgs->entries .push_back (std::move (res));
4134- return true ;
41354134
4136- } else if (!params. image_res_candidates . empty ()) {
4135+ } else {
41374136 // "spatial_unpad" with "anyres" processing for llava-1.6
41384137 auto const inst = llava_uhd::get_slice_instructions (ctx, original_size);
41394138 std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image (img, inst);
@@ -4144,8 +4143,6 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
41444143 normalize_image_u8_to_f32 (*imgs[i], *res, params.image_mean , params.image_std );
41454144 res_imgs->entries .push_back (std::move (res));
41464145 }
4147-
4148- return true ;
41494146 }
41504147 } break ;
41514148
0 commit comments