 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 #include "gguf.h"
-#if defined(ENABLE_ANE)
-#include "ane/ane.h"
+#if defined(ENABLE_COREML)
+#include "coreml/mtmd_coreml.h"
 #endif

 #include <cassert>
@@ -392,8 +392,8 @@ struct clip_ctx {
     bool debug_graph = false;
     std::vector<ggml_tensor *> debug_print_tensors;

-    // ANE model path for iOS
-    std::string ane_model_path;
+    // CoreML model path for iOS
+    std::string coreml_model_path;

     clip_ctx(clip_context_params & ctx_params) {
         debug_graph = std::getenv("MTMD_DEBUG_GRAPH") != nullptr;
@@ -914,8 +914,6 @@ struct clip_graph {
     }

     ggml_cgraph * build_minicpmv_embedding() {
-        const int batch_size = 1;
-
         GGML_ASSERT(model.class_embedding == nullptr);
         const int n_pos = n_patches;

@@ -3840,24 +3838,28 @@ static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, co
     return pos_embed_2d;
 }

-#if defined(ENABLE_ANE)
-static bool clip_image_encode_ane(float * data, float * vec, const char * ane_model_path) {
+#if defined(ENABLE_COREML)
+// forward declarations
+static bool coreml_embedding(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec);
+static bool coreml_resampler(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, const float * vit_embedding, float * vec);
+
+static bool clip_image_encode_coreml(float * data, float * vec, const char * coreml_model_path) {

     static int flag = 0;
     static const void * coremlEncoder = NULL;
     static std::string cached_model_path = "";

     // Check if we need to load a new model
-    if (flag == 0 || (ane_model_path && cached_model_path != ane_model_path)) {
+    if (flag == 0 || (coreml_model_path && cached_model_path != coreml_model_path)) {
         if (coremlEncoder) {
             closeModel(coremlEncoder);
         }
-        coremlEncoder = loadModel(ane_model_path);
+        coremlEncoder = loadModel(coreml_model_path);
         if (!coremlEncoder) {
-            printf("Failed to load ANE model from: %s\n", ane_model_path ? ane_model_path : "null");
+            printf("Failed to load CoreML model from: %s\n", coreml_model_path ? coreml_model_path : "null");
             return false;
         }
-        cached_model_path = ane_model_path ? ane_model_path : "";
+        cached_model_path = coreml_model_path ? coreml_model_path : "";
         flag = 1;
     }
     predictWith(coremlEncoder, data, vec);
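Note: `loadModel`, `predictWith`, and `closeModel` come from the new `coreml/mtmd_coreml.h` wrapper included at the top of the file. The declarations below are only a sketch inferred from the call sites in this hunk (an opaque handle plus raw float buffers); the actual header shipped with this change may differ.

```cpp
// Hypothetical shape of coreml/mtmd_coreml.h, inferred from how it is called above.
#pragma once

#ifdef __cplusplus
extern "C" {
#endif

// Load a compiled CoreML model from disk; returns an opaque handle, or NULL on failure.
const void * loadModel(const char * model_path);

// Run the CoreML-accelerated ViT encoder: reads patch embeddings from `data`,
// writes the encoded features into `vec`.
void predictWith(const void * model, float * data, float * vec);

// Release a handle previously returned by loadModel.
void closeModel(const void * model);

#ifdef __cplusplus
}
#endif
```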
@@ -3871,27 +3873,30 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3
     *img_copy = *img;
     imgs.entries.push_back(std::move(img_copy));

-#if defined(ENABLE_ANE)
+#if defined(ENABLE_COREML)
     bool ios_ctx = true;
     if (ios_ctx){
-        printf("clip use ane\n");
-        float * vit_embedding1 = (float *)malloc(1100*1152*sizeof(float));
-        float * vit_embedding2 = (float *)malloc(1100*1152*sizeof(float));
-
-        ane_embedding(ctx, n_threads, &imgs, vit_embedding1);
-        clip_image_encode_ane(vit_embedding1, vit_embedding2, ctx->ane_model_path.c_str());
-        ane_resampler(ctx, n_threads, &imgs, vit_embedding2, vec);
-        free(vit_embedding1);
-        free(vit_embedding2);
+        printf("clip use coreml\n");
+        std::vector<float> vit_embedding1(1100*1152);
+        std::vector<float> vit_embedding2(1100*1152);
+
+        // call CoreML pipeline: embedding -> encoder -> resampler
+        if (!coreml_embedding(ctx, n_threads, &imgs, vit_embedding1.data())) {
+            return false;
+        }
+        clip_image_encode_coreml(vit_embedding1.data(), vit_embedding2.data(), ctx->coreml_model_path.c_str());
+        if (!coreml_resampler(ctx, n_threads, &imgs, vit_embedding2.data(), vec)) {
+            return false;
+        }
         return true;
     }
 #endif

     return clip_image_batch_encode(ctx, n_threads, &imgs, vec);
 }

-#if defined(ENABLE_ANE)
-static bool ane_embedding(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) {
+#if defined(ENABLE_COREML)
+static bool coreml_embedding(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) {
     const clip_image_f32_batch & imgs = *imgs_c_ptr;
     int batch_size = imgs.entries.size();

@@ -3908,7 +3913,7 @@ static bool ane_embedding(clip_ctx * ctx, const int n_threads, const clip_image_
     clip_graph graph(ctx, *imgs.entries[0]);
     ggml_cgraph * gf;
     gf = graph.build_minicpmv_embedding();
-    ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
+    ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);

     // set inputs
     const auto & model = ctx->model;
@@ -3918,8 +3923,6 @@ static bool ane_embedding(clip_ctx * ctx, const int n_threads, const clip_image_
     const int image_size_height = imgs.entries[0]->ny;

     const int patch_size = hparams.patch_size;
-    const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
-    const int n_pos = num_patches + (model.class_embedding ? 1 : 0);
     const int pos_w = image_size_width / patch_size;
     const int pos_h = image_size_height / patch_size;

@@ -4054,16 +4057,13 @@ static bool ane_embedding(clip_ctx * ctx, const int n_threads, const clip_image_
     // the last node is the embedding tensor
     ggml_tensor * embeddings = ggml_graph_node(gf, -1);

-    // sanity check (only support batch size of 1 for now)
-    const int n_tokens_out = embeddings->ne[1];
-
     // copy the embeddings to the location passed by the user
     ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));

     return true;
 }

-static bool ane_resampler(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, const float * vit_embedding, float * vec) {
+static bool coreml_resampler(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, const float * vit_embedding, float * vec) {
     const clip_image_f32_batch & imgs = *imgs_c_ptr;
     int batch_size = imgs.entries.size();

@@ -4090,8 +4090,6 @@ static bool ane_resampler(clip_ctx * ctx, const int n_threads, const clip_image_
     const int image_size_height = imgs.entries[0]->ny;

     const int patch_size = hparams.patch_size;
-    const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
-    const int n_pos = num_patches + (model.class_embedding ? 1 : 0);
     const int pos_w = image_size_width / patch_size;
     const int pos_h = image_size_height / patch_size;

@@ -4113,13 +4111,6 @@ static bool ane_resampler(clip_ctx * ctx, const int n_threads, const clip_image_
         ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur));
     };

-    auto set_input_i32 = [&get_inp_tensor](const char * name, std::vector<int32_t> & values) {
-        ggml_tensor * cur = get_inp_tensor(name);
-        GGML_ASSERT(cur->type == GGML_TYPE_I32);
-        GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size());
-        ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur));
-    };
-
     {
         struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
         ggml_backend_tensor_set(embeddings, vit_embedding, 0, ggml_nbytes(embeddings));
@@ -4674,8 +4665,8 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel
     batch->is_audio = true;
 }

-void clip_set_ane_model_path(struct clip_ctx * ctx, const char * ane_model_path) {
-    if (ctx && ane_model_path) {
-        ctx->ane_model_path = ane_model_path;
+void clip_set_coreml_model_path(struct clip_ctx * ctx, const char * coreml_model_path) {
+    if (ctx && coreml_model_path) {
+        ctx->coreml_model_path = coreml_model_path;
     }
 }
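For reference, a minimal caller-side sketch of how the renamed API might be wired up, assuming the usual clip.h loading and preprocessing flow is unchanged apart from the rename; the helper name, thread count, and output buffer sizing are illustrative only and not part of this change:

```cpp
#include "clip.h"

// Hypothetical helper: point the context at a compiled CoreML model and encode one
// preprocessed image. `ctx` and `img` are assumed to come from the existing clip
// loading / preprocessing APIs, and `out_vec` must be sized for the projector output.
static bool encode_image_with_coreml(struct clip_ctx * ctx, struct clip_image_f32 * img,
                                     const char * coreml_model_path, float * out_vec) {
    // store the model path on the context; read by the ENABLE_COREML branch in clip_image_encode
    clip_set_coreml_model_path(ctx, coreml_model_path);

    // with ENABLE_COREML defined, this routes through
    // coreml_embedding -> clip_image_encode_coreml -> coreml_resampler
    return clip_image_encode(ctx, /*n_threads=*/4, img, out_vec);
}
```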