 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 #include "gguf.h"
-#if defined(ENABLE_ANE)
-#include "ane/ane.h"
+#if defined(ENABLE_COREML)
+#include "coreml/mtmd_coreml.h"
 #endif

 #include <cassert>
@@ -392,8 +392,8 @@ struct clip_ctx {
     bool debug_graph = false;
     std::vector<ggml_tensor *> debug_print_tensors;

-    // ANE model path for iOS
-    std::string ane_model_path;
+    // CoreML model path for iOS
+    std::string coreml_model_path;

     clip_ctx(clip_context_params & ctx_params) {
         debug_graph = std::getenv("MTMD_DEBUG_GRAPH") != nullptr;
@@ -914,8 +914,6 @@ struct clip_graph {
     }

     ggml_cgraph * build_minicpmv_embedding() {
-        const int batch_size = 1;
-
         GGML_ASSERT(model.class_embedding == nullptr);
         const int n_pos = n_patches;

@@ -3840,24 +3838,28 @@ static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, co
     return pos_embed_2d;
 }

-#if defined(ENABLE_ANE)
-static bool clip_image_encode_ane(float * data, float * vec, const char * ane_model_path) {
+#if defined(ENABLE_COREML)
+// forward declarations
+static bool coreml_embedding(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec);
+static bool coreml_resampler(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, const float * vit_embedding, float * vec);
+
+static bool clip_image_encode_coreml(float * data, float * vec, const char * coreml_model_path) {

     static int flag = 0;
     static const void * coremlEncoder = NULL;
     static std::string cached_model_path = "";

     // Check if we need to load a new model
-    if (flag == 0 || (ane_model_path && cached_model_path != ane_model_path)) {
+    if (flag == 0 || (coreml_model_path && cached_model_path != coreml_model_path)) {
         if (coremlEncoder) {
             closeModel(coremlEncoder);
         }
-        coremlEncoder = loadModel(ane_model_path);
+        coremlEncoder = loadModel(coreml_model_path);
         if (!coremlEncoder) {
-            printf("Failed to load ANE model from: %s\n", ane_model_path ? ane_model_path : "null");
+            printf("Failed to load CoreML model from: %s\n", coreml_model_path ? coreml_model_path : "null");
             return false;
         }
-        cached_model_path = ane_model_path ? ane_model_path : "";
+        cached_model_path = coreml_model_path ? coreml_model_path : "";
         flag = 1;
     }
     predictWith(coremlEncoder, data, vec);
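Note: `loadModel`, `predictWith`, and `closeModel` come from the new `coreml/mtmd_coreml.h` wrapper included at the top of the file. The declarations below are only a sketch inferred from the call sites in this hunk (an opaque handle plus raw float buffers); the actual header shipped with this change may differ.

```cpp
// Hypothetical shape of coreml/mtmd_coreml.h, inferred from how it is called above.
#pragma once

#ifdef __cplusplus
extern "C" {
#endif

// Load a compiled CoreML model from disk; returns an opaque handle, or NULL on failure.
const void * loadModel(const char * model_path);

// Run the CoreML-accelerated ViT encoder: reads patch embeddings from `data`,
// writes the encoded features into `vec`.
void predictWith(const void * model, float * data, float * vec);

// Release a handle previously returned by loadModel.
void closeModel(const void * model);

#ifdef __cplusplus
}
#endif
```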
@@ -3871,27 +3873,30 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3
     *img_copy = *img;
     imgs.entries.push_back(std::move(img_copy));

-#if defined(ENABLE_ANE)
+#if defined(ENABLE_COREML)
     bool ios_ctx = true;
     if (ios_ctx){
-        printf("clip use ane\n");
-        float * vit_embedding1 = (float *)malloc(1100*1152*sizeof(float));
-        float * vit_embedding2 = (float *)malloc(1100*1152*sizeof(float));
-
-        ane_embedding(ctx, n_threads, &imgs, vit_embedding1);
-        clip_image_encode_ane(vit_embedding1, vit_embedding2, ctx->ane_model_path.c_str());
-        ane_resampler(ctx, n_threads, &imgs, vit_embedding2, vec);
-        free(vit_embedding1);
-        free(vit_embedding2);
+        printf("clip use coreml\n");
+        std::vector<float> vit_embedding1(1100*1152);
+        std::vector<float> vit_embedding2(1100*1152);
+
+        // call CoreML pipeline: embedding -> encoder -> resampler
+        if (!coreml_embedding(ctx, n_threads, &imgs, vit_embedding1.data())) {
+            return false;
+        }
+        clip_image_encode_coreml(vit_embedding1.data(), vit_embedding2.data(), ctx->coreml_model_path.c_str());
+        if (!coreml_resampler(ctx, n_threads, &imgs, vit_embedding2.data(), vec)) {
+            return false;
+        }
         return true;
     }
 #endif

     return clip_image_batch_encode(ctx, n_threads, &imgs, vec);
 }

-#if defined(ENABLE_ANE)
-static bool ane_embedding(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) {
+#if defined(ENABLE_COREML)
+static bool coreml_embedding(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) {
     const clip_image_f32_batch & imgs = *imgs_c_ptr;
     int batch_size = imgs.entries.size();

@@ -3908,7 +3913,7 @@ static bool ane_embedding(clip_ctx * ctx, const int n_threads, const clip_image_
     clip_graph graph(ctx, *imgs.entries[0]);
     ggml_cgraph * gf;
     gf = graph.build_minicpmv_embedding();
-    ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
+    ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);

     // set inputs
     const auto & model = ctx->model;
@@ -3918,8 +3923,6 @@ static bool ane_embedding(clip_ctx * ctx, const int n_threads, const clip_image_
     const int image_size_height = imgs.entries[0]->ny;

     const int patch_size = hparams.patch_size;
-    const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
-    const int n_pos = num_patches + (model.class_embedding ? 1 : 0);
     const int pos_w = image_size_width / patch_size;
     const int pos_h = image_size_height / patch_size;

@@ -4054,16 +4057,13 @@ static bool ane_embedding(clip_ctx * ctx, const int n_threads, const clip_image_
     // the last node is the embedding tensor
     ggml_tensor * embeddings = ggml_graph_node(gf, -1);

-    // sanity check (only support batch size of 1 for now)
-    const int n_tokens_out = embeddings->ne[1];
-
     // copy the embeddings to the location passed by the user
     ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));

     return true;
 }

-static bool ane_resampler(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, const float * vit_embedding, float * vec) {
+static bool coreml_resampler(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, const float * vit_embedding, float * vec) {
     const clip_image_f32_batch & imgs = *imgs_c_ptr;
     int batch_size = imgs.entries.size();

@@ -4090,8 +4090,6 @@ static bool ane_resampler(clip_ctx * ctx, const int n_threads, const clip_image_
     const int image_size_height = imgs.entries[0]->ny;

     const int patch_size = hparams.patch_size;
-    const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
-    const int n_pos = num_patches + (model.class_embedding ? 1 : 0);
     const int pos_w = image_size_width / patch_size;
     const int pos_h = image_size_height / patch_size;

@@ -4113,13 +4111,6 @@ static bool ane_resampler(clip_ctx * ctx, const int n_threads, const clip_image_
         ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur));
     };

-    auto set_input_i32 = [&get_inp_tensor](const char * name, std::vector<int32_t> & values) {
-        ggml_tensor * cur = get_inp_tensor(name);
-        GGML_ASSERT(cur->type == GGML_TYPE_I32);
-        GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size());
-        ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur));
-    };
-
     {
         struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
         ggml_backend_tensor_set(embeddings, vit_embedding, 0, ggml_nbytes(embeddings));
@@ -4674,8 +4665,8 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel
     batch->is_audio = true;
 }

-void clip_set_ane_model_path(struct clip_ctx * ctx, const char * ane_model_path) {
-    if (ctx && ane_model_path) {
-        ctx->ane_model_path = ane_model_path;
+void clip_set_coreml_model_path(struct clip_ctx * ctx, const char * coreml_model_path) {
+    if (ctx && coreml_model_path) {
+        ctx->coreml_model_path = coreml_model_path;
     }
 }
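For reference, a minimal caller-side sketch of how the renamed API might be wired up, assuming the usual clip.h loading and preprocessing flow is unchanged apart from the rename; the helper name, thread count, and output buffer sizing are illustrative only and not part of this change:

```cpp
#include "clip.h"

// Hypothetical helper: point the context at a compiled CoreML model and encode one
// preprocessed image. `ctx` and `img` are assumed to come from the existing clip
// loading / preprocessing APIs, and `out_vec` must be sized for the projector output.
static bool encode_image_with_coreml(struct clip_ctx * ctx, struct clip_image_f32 * img,
                                     const char * coreml_model_path, float * out_vec) {
    // store the model path on the context; read by the ENABLE_COREML branch in clip_image_encode
    clip_set_coreml_model_path(ctx, coreml_model_path);

    // with ENABLE_COREML defined, this routes through
    // coreml_embedding -> clip_image_encode_coreml -> coreml_resampler
    return clip_image_encode(ctx, /*n_threads=*/4, img, out_vec);
}
```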