Skip to content

Commit 2f645bb

Browse files
committed
Pixtral works only on CPU; however, the images are distorted
1 parent f1eb6c4 commit 2f645bb

File tree

3 files changed

+15
-0
lines changed

3 files changed

+15
-0
lines changed

examples/llava/clip.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3560,6 +3560,10 @@ bool clip_is_gemma3(const struct clip_ctx * ctx) {
35603560
return ctx->proj_type == PROJECTOR_TYPE_GEMMA3;
35613561
}
35623562

3563+
// Reports whether this CLIP context's projector type is PROJECTOR_TYPE_PIXTRAL.
// `ctx` is dereferenced unconditionally, so callers must pass a non-null context.
bool clip_is_pixtral(const struct clip_ctx * ctx) {
3564+
return ctx->proj_type == PROJECTOR_TYPE_PIXTRAL;
3565+
}
3566+
35633567
// Determine the number of encoder layers to iterate over
35643568
int get_deepest_feature_layer(const struct clip_ctx * ctx) {
35653569
// Get the index of the second to last layer; this is the

examples/llava/clip.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@ CLIP_API bool clip_is_glm(const struct clip_ctx * ctx);
113113
CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);
114114
CLIP_API bool clip_is_llava(const struct clip_ctx * ctx);
115115
CLIP_API bool clip_is_gemma3(const struct clip_ctx * ctx);
116+
// Returns true when ctx's projector type is PROJECTOR_TYPE_PIXTRAL.
CLIP_API bool clip_is_pixtral(const struct clip_ctx * ctx);
116117

117118
CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx);
118119

examples/llava/llava.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -340,6 +340,16 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
340340
return false;
341341
}
342342
}
343+
else if (clip_is_pixtral(ctx_clip)){
344+
clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
345+
*n_img_pos = clip_n_patches_by_img(ctx_clip, img_res);
346+
bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096
347+
if (!encoded) {
348+
LOG_ERR("Unable to encode image\n");
349+
350+
return false;
351+
}
352+
}
343353
else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
344354
// flat / default llava-1.5 type embedding
345355
*n_img_pos = clip_n_patches(ctx_clip);

0 commit comments

Comments
 (0)