Skip to content

Commit 1395a4a

Browse files
committed
simplify logic for llava-1.5
1 parent dd08673 commit 1395a4a

File tree

1 file changed

+43
-122
lines changed

1 file changed

+43
-122
lines changed

examples/llava/clip.cpp

Lines changed: 43 additions & 122 deletions
Original file line number · Diff line number · Diff line change
@@ -1797,9 +1797,9 @@ struct image_manipulation {
17971797
}
17981798

17991799
// llava-1.6 type of resize_and_pad
1800-
// if the ratio is not 1:1, padding with fill_color will be applied
1801-
// fill_color is single channel, default is 0 (black)
1802-
static void resize_and_pad_image(const clip_image_u8 & image, clip_image_u8 & dst, const clip_image_size & target_resolution, std::array<uint8_t, 3> fill_color = {0, 0, 0}) {
1800+
// if the ratio is not 1:1, padding with pad_color will be applied
1801+
// pad_color is single channel, default is 0 (black)
1802+
static void resize_and_pad_image(const clip_image_u8 & image, clip_image_u8 & dst, const clip_image_size & target_resolution, std::array<uint8_t, 3> pad_color = {0, 0, 0}) {
18031803
int target_width = target_resolution.width;
18041804
int target_height = target_resolution.height;
18051805

@@ -1826,9 +1826,9 @@ struct image_manipulation {
18261826

18271827
// Fill the padded image with the fill color
18281828
for (size_t i = 0; i < padded_image.buf.size(); i += 3) {
1829-
padded_image.buf[i] = fill_color[0];
1830-
padded_image.buf[i + 1] = fill_color[1];
1831-
padded_image.buf[i + 2] = fill_color[2];
1829+
padded_image.buf[i] = pad_color[0];
1830+
padded_image.buf[i + 1] = pad_color[1];
1831+
padded_image.buf[i + 2] = pad_color[2];
18321832
}
18331833

18341834
// Calculate padding offsets
@@ -2154,7 +2154,18 @@ int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) {
21542154
// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
21552155
// res_imgs memory is being allocated here, previous allocations will be freed if found
21562156
bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, struct clip_image_f32_batch * res_imgs) {
2157+
if (!ctx->has_vision_encoder) {
2158+
LOG_ERR("%s: This gguf file seems to have no vision encoder\n", __func__);
2159+
return false;
2160+
}
2161+
21572162
clip_image_size original_size{img->nx, img->ny};
2163+
bool pad_to_square = true;
2164+
auto & params = ctx->vision_model.hparams;
2165+
// The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing
2166+
if (params.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD) {
2167+
pad_to_square = false;
2168+
}
21582169

21592170
if (clip_is_minicpmv(ctx)) {
21602171
auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
@@ -2185,7 +2196,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
21852196

21862197
if (ctx->has_glm_projector || ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
21872198
clip_image_u8 resized_image;
2188-
int32_t sz = ctx->vision_model.hparams.image_size;
2199+
int sz = params.image_size;
21892200
image_manipulation::bicubic_resize(*img, resized_image, sz, sz);
21902201
clip_image_f32_ptr img_f32(clip_image_f32_init());
21912202
//clip_image_save_to_bmp(resized_image, "resized.bmp");
@@ -2194,137 +2205,47 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
21942205
return true;
21952206
}
21962207

2197-
bool pad_to_square = true;
2198-
if (!ctx->has_vision_encoder) {
2199-
LOG_ERR("%s: This gguf file seems to have no vision encoder\n", __func__);
2200-
return false;
2201-
}
2202-
auto & params = ctx->vision_model.hparams;
2203-
// The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing
2204-
if (params.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD) {
2205-
pad_to_square = false;
2206-
}
2207-
// free the previous res_imgs if any set
2208-
res_imgs->entries.clear();
2209-
22102208
// the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
22112209
// see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
22122210

22132211
clip_image_u8_ptr temp(clip_image_u8_init()); // we will keep the input image data here temporarily
2214-
if (pad_to_square && img->nx != img->ny) {
2215-
int longer_side = std::max(img->nx, img->ny);
2212+
2213+
if (pad_to_square) {
2214+
// for llava-1.5, we resize image to a square, and pad the shorter side with a background color
2215+
// see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
2216+
const int longer_side = std::max(img->nx, img->ny);
22162217
temp->nx = longer_side;
22172218
temp->ny = longer_side;
22182219
temp->buf.resize(3 * longer_side * longer_side);
2219-
const uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA (this is the mean rgb color * 255)
2220-
2221-
// fill with background color
2222-
for (size_t i = 0; i < temp->buf.size(); i++) {
2223-
temp->buf[i] = bc[i % 3];
2224-
}
2225-
2226-
// copy from the input image
2227-
for (int y = 0; y < img->ny; y++) {
2228-
for (int x = 0; x < img->nx; x++) {
2229-
const int i = 3 * (y * img->nx + x);
2230-
const int j = 3 * (y * temp->nx + x);
2231-
temp->buf[j] = img->buf[i];
2232-
temp->buf[j+1] = img->buf[i+1];
2233-
temp->buf[j+2] = img->buf[i+2];
2234-
}
2235-
}
2236-
} else {
2237-
if (!params.image_grid_pinpoints.empty()) {
2238-
// "spatial_unpad" with "anyres" processing for llava-1.6
2239-
auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
2240-
std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
2241-
2242-
for (size_t i = 0; i < imgs.size(); ++i) {
2243-
// clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
2244-
clip_image_f32_ptr res(clip_image_f32_init());
2245-
normalize_image_u8_to_f32(*imgs[i], *res, ctx->image_mean, ctx->image_std);
2246-
res_imgs->entries.push_back(std::move(res));
2247-
}
2248-
2249-
return true;
2250-
} else {
2251-
temp->nx = img->nx;
2252-
temp->ny = img->ny;
2253-
temp->buf.resize(img->buf.size());
2254-
memcpy(temp->buf.data(), img->buf.data(), temp->buf.size());
2255-
}
2256-
}
2257-
2258-
const int nx = temp->nx;
2259-
const int ny = temp->ny;
2260-
// clip_image_save_to_bmp(*temp, "resized_vanilla.bmp");
2261-
2262-
const int nx2 = ctx->vision_model.hparams.image_size;
2263-
const int ny2 = ctx->vision_model.hparams.image_size;
2264-
clip_image_f32_ptr res(clip_image_f32_init());
2265-
res->nx = nx2;
2266-
res->ny = ny2;
2267-
res->buf.resize(3 * nx2 * ny2);
2268-
2269-
const float scale = std::max(nx, ny) / (float)ctx->vision_model.hparams.image_size;
2270-
2271-
const int nx3 = int(nx / scale + 0.5f);
2272-
const int ny3 = int(ny / scale + 0.5f);
2273-
2274-
const auto & m3 = ctx->image_mean; // {0.48145466f, 0.4578275f, 0.40821073f};
2275-
const auto & s3 = ctx->image_std; // {0.26862954f, 0.26130258f, 0.27577711f};
2276-
2277-
for (int y = 0; y < ny3; y++) {
2278-
for (int x = 0; x < nx3; x++) {
2279-
for (int c = 0; c < 3; c++) {
2280-
// linear interpolation
2281-
const float sx = (x + 0.5f) * scale - 0.5f;
2282-
const float sy = (y + 0.5f) * scale - 0.5f;
2283-
2284-
const int x0 = std::max(0, (int)std::floor(sx));
2285-
const int y0 = std::max(0, (int)std::floor(sy));
2286-
2287-
const int x1 = std::min(x0 + 1, nx - 1);
2288-
const int y1 = std::min(y0 + 1, ny - 1);
2289-
2290-
const float dx = sx - x0;
2291-
const float dy = sy - y0;
2292-
2293-
const int j00 = 3 * (y0 * nx + x0) + c;
2294-
const int j01 = 3 * (y0 * nx + x1) + c;
2295-
const int j10 = 3 * (y1 * nx + x0) + c;
2296-
const int j11 = 3 * (y1 * nx + x1) + c;
22972220

2298-
const float v00 = temp->buf[j00];
2299-
const float v01 = temp->buf[j01];
2300-
const float v10 = temp->buf[j10];
2301-
const float v11 = temp->buf[j11];
2221+
// background color in RGB from LLaVA (this is the mean rgb color * 255)
2222+
const std::array<uint8_t, 3> pad_color = {122, 116, 104};
23022223

2303-
const float v0 = v00 * (1.0f - dx) + v01 * dx;
2304-
const float v1 = v10 * (1.0f - dx) + v11 * dx;
2224+
// resize the image to the target_size
2225+
image_manipulation::resize_and_pad_image(*img, *temp, clip_image_size{params.image_size, params.image_size}, pad_color);
23052226

2306-
const float v = v0 * (1.0f - dy) + v1 * dy;
2307-
2308-
const uint8_t v2 = std::min(std::max(std::round(v), 0.0f), 255.0f);
2227+
clip_image_f32_ptr res(clip_image_f32_init());
2228+
normalize_image_u8_to_f32(*temp, *res, ctx->image_mean, ctx->image_std);
2229+
res_imgs->entries.push_back(std::move(res));
2230+
return true;
23092231

2310-
const int i = 3 * (y * nx3 + x) + c;
2232+
} else if (!params.image_grid_pinpoints.empty()) {
2233+
// "spatial_unpad" with "anyres" processing for llava-1.6
2234+
auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
2235+
std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
23112236

2312-
res->buf[i] = ((float(v2) / 255.0f) - m3[c]) / s3[c];
2313-
}
2237+
for (size_t i = 0; i < imgs.size(); ++i) {
2238+
// clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
2239+
clip_image_f32_ptr res(clip_image_f32_init());
2240+
normalize_image_u8_to_f32(*imgs[i], *res, ctx->image_mean, ctx->image_std);
2241+
res_imgs->entries.push_back(std::move(res));
23142242
}
2315-
}
23162243

2317-
// {
2318-
// clip_image_u8 * temp2 = clip_image_u8_init();
2319-
// clip_image_convert_f32_to_u8(*res, *temp2);
2320-
// clip_image_save_to_bmp(*temp2, "resized_normalized_f32_vanilla.bmp");
2321-
// clip_image_u8_free(temp2);
2322-
// }
2323-
// res_imgs.push_back(res);
2244+
return true;
23242245

2325-
res_imgs->entries.push_back(std::move(res));
2246+
}
23262247

2327-
return true;
2248+
GGML_ASSERT(false && "Unknown image preprocessing type");
23282249
}
23292250

23302251
ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {

0 commit comments

Comments (0)