@@ -175,7 +175,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
175175
176176 model.ctx = ggml_init (params);
177177
178- struct ggml_tensor * image_features = ggml_new_tensor_3d (model.ctx , GGML_TYPE_F32, clip_n_mmproj_embd (ctx_clip), clip_n_patches (ctx_clip), num_images - 1 ); // example: 4096 x 576 x 4
178+ struct ggml_tensor * image_features = ggml_new_tensor_3d (model.ctx , GGML_TYPE_F32, clip_n_mmproj_embd (ctx_clip), clip_get_n_output_tokens (ctx_clip), num_images - 1 ); // example: 4096 x 576 x 4
179179 // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
180180 // fill it with the image embeddings, ignoring the base
181181 for (size_t i = 1 ; i < num_images; i++) {
@@ -214,8 +214,8 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
214214
215215 memcpy (image_embd_out, image_embd_v[0 ], clip_embd_nbytes (ctx_clip)); // main image as global context
216216 // append without newline tokens (default behavior in llava_arch when not using unpad ):
217- memcpy (image_embd_out + clip_n_patches (ctx_clip) * clip_n_mmproj_embd (ctx_clip), (float *)result->data , clip_embd_nbytes (ctx_clip) * (num_images-1 )); // grid patches
218- *n_img_pos_out = static_cast <int >(result->ne [1 ]+clip_n_patches (ctx_clip));
217+ memcpy (image_embd_out + clip_get_n_output_tokens (ctx_clip) * clip_n_mmproj_embd (ctx_clip), (float *)result->data , clip_embd_nbytes (ctx_clip) * (num_images-1 )); // grid patches
218+ *n_img_pos_out = static_cast <int >(result->ne [1 ]+clip_get_n_output_tokens (ctx_clip));
219219
220220 // Debug: Test single segments
221221 // Current findings: sending base image, sending a segment embedding all works similar to python
@@ -313,7 +313,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
313313 image_embd + n_img_pos_out * clip_n_mmproj_embd (ctx_clip),
314314 image_embd_v[i],
315315 clip_embd_nbytes_by_img (ctx_clip, nx, ny));
316- n_img_pos_out += clip_n_patches_by_img (ctx_clip, img_res);
316+ n_img_pos_out += clip_img_f32_get_n_output_tokens (ctx_clip, img_res);
317317 }
318318 *n_img_pos = n_img_pos_out;
319319 for (size_t i = 0 ; i < image_embd_v.size (); i++) {
@@ -342,7 +342,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
342342 }
343343 else if (strcmp (mm_patch_merge_type, " spatial_unpad" ) != 0 ) {
344344 // flat / default llava-1.5 type embedding
345- *n_img_pos = clip_n_patches (ctx_clip);
345+ *n_img_pos = clip_get_n_output_tokens (ctx_clip);
346346 clip_image_f32 * img_res = clip_image_f32_get_img (img_res_v.get (), 0 );
347347 bool encoded = clip_image_encode (ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096
348348 if (!encoded) {
0 commit comments