Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions build-xcframework.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ TVOS_MIN_OS_VERSION=16.4

BUILD_SHARED_LIBS=OFF
LLAMA_BUILD_EXAMPLES=OFF
LLAMA_BUILD_TOOLS=OFF
LLAMA_BUILD_TOOLS=ON
LLAMA_BUILD_TESTS=OFF
LLAMA_BUILD_SERVER=OFF
GGML_METAL=ON
Expand Down Expand Up @@ -124,6 +124,10 @@ setup_framework_structure() {
cp ggml/include/ggml-cpu.h ${header_path}
cp ggml/include/ggml-blas.h ${header_path}
cp ggml/include/gguf.h ${header_path}
# Copy mtmd-ios headers and dependencies
cp tools/mtmd/mtmd-ios.h ${header_path}
cp tools/mtmd/mtmd.h ${header_path}
cp tools/mtmd/mtmd-helper.h ${header_path}

# Create module map (common for all platforms)
cat > ${module_path}module.modulemap << EOF
Expand All @@ -136,6 +140,9 @@ framework module llama {
header "ggml-cpu.h"
header "ggml-blas.h"
header "gguf.h"
header "mtmd-ios.h"
header "mtmd.h"
header "mtmd-helper.h"

link "c++"
link framework "Accelerate"
Expand Down Expand Up @@ -252,6 +259,8 @@ combine_static_libraries() {
"${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml-cpu.a"
"${base_dir}/${build_dir}/ggml/src/ggml-metal/${release_dir}/libggml-metal.a"
"${base_dir}/${build_dir}/ggml/src/ggml-blas/${release_dir}/libggml-blas.a"
"${base_dir}/${build_dir}/common/${release_dir}/libcommon.a"
"${base_dir}/${build_dir}/tools/mtmd/${release_dir}/libmtmd.a"
)

# Create temporary directory for processing
Expand Down Expand Up @@ -327,7 +336,7 @@ combine_static_libraries() {
$arch_flags \
$min_version_flag \
-Wl,-force_load,"${temp_dir}/combined.a" \
-framework Foundation -framework Metal -framework Accelerate \
-framework Foundation -framework Metal -framework Accelerate -framework CoreML \
-install_name "$install_name" \
-o "${base_dir}/${output_lib}"

Expand Down
10 changes: 10 additions & 0 deletions common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1178,6 +1178,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
for (auto & ex : mmproj_examples) {
if (ctx_arg.ex == ex) {
common_params_handle_model(params.mmproj, params.hf_token, "", params.offline);
common_params_handle_model(params.coreml, params.hf_token, "", params.offline);
break;
}
}
Expand Down Expand Up @@ -2522,6 +2523,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.mmproj_use_gpu = false;
}
).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD"));
// CoreML model path (new)
add_opt(common_arg(
{"--coreml"}, "FILE",
"path to CoreML model file",
[](common_params & params, const std::string & value) {
params.coreml.path = value;
}
).set_examples(mmproj_examples).set_env("LLAMA_ARG_COREML"));

add_opt(common_arg(
{"--image", "--audio"}, "FILE",
"path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n",
Expand Down
3 changes: 3 additions & 0 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -405,6 +405,9 @@ struct common_params {
bool mmproj_use_gpu = true; // use GPU for multimodal model
bool no_mmproj = false; // explicitly disable multimodal model
std::vector<std::string> image; // path to image file(s)

// Apple Neural Engine support
struct common_params_model coreml;

// finetune
struct lr_opt lr;
Expand Down
42 changes: 41 additions & 1 deletion tools/mtmd/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

find_package(Threads REQUIRED)

# CoreML support option
option(ENABLE_COREML "Enable CoreML support" OFF)

add_library(mtmd
mtmd.cpp
mtmd-audio.cpp
Expand All @@ -13,13 +16,43 @@ add_library(mtmd
mtmd-helper.h
)

target_link_libraries (mtmd PUBLIC ggml llama)
# Add CoreML-related sources when enabled.
if(ENABLE_COREML)
    target_sources(mtmd PRIVATE
        coreml/mtmd_coreml.h
        coreml/mtmd_coreml.mm
        coreml/coreml_minicpmv40_vit_f16.h
        coreml/coreml_minicpmv40_vit_f16.m
    )
    # Compile-time macro consumed by the #if defined(ENABLE_COREML) guards in clip.cpp.
    target_compile_definitions(mtmd PRIVATE ENABLE_COREML)

    # Enable ARC for the Objective-C / Objective-C++ sources.
    # COMPILE_OPTIONS (a ;-list) supersedes the legacy single-string COMPILE_FLAGS.
    set_source_files_properties(
        coreml/mtmd_coreml.mm
        coreml/coreml_minicpmv40_vit_f16.m
        PROPERTIES COMPILE_OPTIONS "-fobjc-arc"
    )
endif()

target_link_libraries (mtmd PUBLIC ggml llama common)
target_link_libraries (mtmd PRIVATE Threads::Threads)
target_include_directories(mtmd PUBLIC .)
target_include_directories(mtmd PRIVATE ../..)
target_include_directories(mtmd PRIVATE ../../common)
target_include_directories(mtmd PRIVATE ../../include)
target_include_directories(mtmd PRIVATE ../../ggml/include)
target_include_directories(mtmd PRIVATE ../../src)
target_include_directories(mtmd PRIVATE ../../vendor)
target_compile_features (mtmd PRIVATE cxx_std_17)

# Link Apple frameworks required by the CoreML path.
if(ENABLE_COREML)
    target_link_libraries(mtmd PRIVATE
        "-framework Foundation"
        "-framework CoreML"
        "-framework Accelerate"
    )
    # -ObjC is a linker flag, not a library: it forces all Objective-C classes
    # and categories in static archives to be loaded. Pass it via
    # target_link_options so CMake does not treat it as a library name.
    target_link_options(mtmd PRIVATE "-ObjC")
endif()

if (BUILD_SHARED_LIBS)
set_target_properties (mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(mtmd PRIVATE LLAMA_BUILD)
Expand All @@ -31,6 +64,13 @@ set(MTMD_PUBLIC_HEADERS
${CMAKE_CURRENT_SOURCE_DIR}/mtmd-helper.h
)

# Add CoreML public headers when enabled
if(ENABLE_COREML)
list(APPEND MTMD_PUBLIC_HEADERS
${CMAKE_CURRENT_SOURCE_DIR}/coreml/coreml.h
)
endif()

set_target_properties(mtmd
PROPERTIES
PUBLIC_HEADER "${MTMD_PUBLIC_HEADERS}")
Expand Down
143 changes: 143 additions & 0 deletions tools/mtmd/clip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "gguf.h"
#if defined(ENABLE_COREML)
#include "coreml/mtmd_coreml.h"
#endif

#include <cassert>
#include <cmath>
Expand Down Expand Up @@ -390,6 +393,9 @@ struct clip_ctx {
bool debug_graph = false;
std::vector<ggml_tensor *> debug_print_tensors;

// CoreML model path for iOS
std::string coreml_model_path;

clip_ctx(clip_context_params & ctx_params) {
debug_graph = std::getenv("MTMD_DEBUG_GRAPH") != nullptr;
backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
Expand Down Expand Up @@ -3930,15 +3936,146 @@ static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, co
return pos_embed_2d;
}

#if defined(ENABLE_COREML)
// Run the vision encoder through a cached CoreML model.
//
// The model handle is cached in function-local statics and reloaded only when
// the requested model path changes. NOTE(review): this static cache is not
// thread-safe — confirm callers serialize image encoding.
//
// pixel_values / position_ids / pos_embed are the pre-packed model inputs;
// the encoded embeddings are written to `vec`. Returns false when the model
// path is missing or the model cannot be loaded.
static bool clip_image_encode_coreml(float * pixel_values, int32_t * position_ids, float * pos_embed, float * vec, const char * coreml_model_path) {
    static const void * coreml_encoder = NULL;
    static std::string  cached_model_path;

    // Guard before touching loadModel / std::string comparison with the path.
    if (coreml_model_path == NULL || coreml_model_path[0] == '\0') {
        printf("Failed to load CoreML model from: %s\n", coreml_model_path ? coreml_model_path : "null");
        return false;
    }

    // (Re)load when nothing is cached yet or a different model is requested.
    // A null cached handle doubles as the "never loaded" flag.
    if (coreml_encoder == NULL || cached_model_path != coreml_model_path) {
        if (coreml_encoder) {
            closeModel(coreml_encoder);
            coreml_encoder = NULL;
        }
        coreml_encoder = loadModel(coreml_model_path);
        if (!coreml_encoder) {
            printf("Failed to load CoreML model from: %s\n", coreml_model_path);
            return false;
        }
        cached_model_path = coreml_model_path;
    }

    predictWith(coreml_encoder, pixel_values, position_ids, pos_embed, vec);
    return true;
}
#endif

bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
clip_image_f32_batch imgs;
clip_image_f32_ptr img_copy(clip_image_f32_init());
*img_copy = *img;
imgs.entries.push_back(std::move(img_copy));

#if defined(ENABLE_COREML)
const bool can_use_coreml =
!ctx->coreml_model_path.empty() &&
ctx->model.modality == CLIP_MODALITY_VISION &&
ctx->proj_type() == PROJECTOR_TYPE_MINICPMV;
if (can_use_coreml){
printf("clip use coreml\n");
return clip_image_batch_encode_coreml(ctx, &imgs, vec);
}
#endif
return clip_image_batch_encode(ctx, n_threads, &imgs, vec);
}

// Encode a batch of exactly one pre-processed image with the CoreML encoder.
// Only the MiniCPM-V projector is supported; any other projector aborts, and
// a batch size other than 1 returns false. Packs pixel values, bucketed
// position ids and 2D sin/cos positional embeddings into the fixed-size input
// buffers the exported CoreML model expects, then dispatches to
// clip_image_encode_coreml(). Returns false when built without ENABLE_COREML
// (instead of the previous fall-through into GGML_ABORT).
bool clip_image_batch_encode_coreml(clip_ctx * ctx, const clip_image_f32_batch * imgs_c_ptr, float * vec) {
#if !defined(ENABLE_COREML)
    // Built without CoreML support: nothing to run.
    (void) ctx; (void) imgs_c_ptr; (void) vec;
    return false;
#else
    const clip_image_f32_batch & imgs = *imgs_c_ptr;

    if (imgs.entries.size() != 1) {
        return false; // only support batch size of 1
    }

    if (ctx->model.proj_type != PROJECTOR_TYPE_MINICPMV) {
        GGML_ABORT("Unknown projector type");
    }

    const auto & hparams = ctx->model.hparams;
    const clip_image_f32 & img = *imgs.entries[0];

    const int patch_size = hparams.patch_size;
    const int pos_w = img.nx / patch_size;
    const int pos_h = img.ny / patch_size;

    // The exported CoreML model uses fixed 1024-patch input buffers.
    const int max_patches = 1024;
    if (pos_w <= 0 || pos_h <= 0 || pos_w > max_patches || pos_h > max_patches) {
        // previously overflowed the fixed-size bucket_coords arrays below
        return false;
    }

    // pixel values: patch-major layout, three channel planes of size n
    std::vector<float> inp_raw;
    {
        const int nx = max_patches * patch_size;
        const int ny = patch_size;
        const int n  = nx * ny;
        inp_raw.assign(3 * n, 0.0f);

        int patch_index = 0;
        for (int i = 0; i < img.ny && patch_index < max_patches; i += patch_size) {
            for (int j = 0; j < img.nx && patch_index < max_patches; j += patch_size) {
                for (int pi = 0; pi < patch_size; ++pi) {
                    for (int pj = 0; pj < patch_size; ++pj) {
                        const int src = ((i + pi) * img.nx + (j + pj)) * 3;
                        const int dst = nx * pi + patch_index * patch_size + pj;
                        inp_raw[dst]         = img.buf[src];
                        inp_raw[n + dst]     = img.buf[src + 1];
                        inp_raw[2 * n + dst] = img.buf[src + 2];
                    }
                }
                patch_index++;
            }
        }
    }

    // position ids, bucketed onto a 70x70 grid
    // inspired from siglip:
    // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit
    // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
    std::vector<int32_t> positions(std::max(pos_h * pos_w, max_patches), 0);
    {
        // heap-allocated, sized to the actual grid (was int[1024] on the stack)
        std::vector<int> bucket_coords_h(pos_h);
        std::vector<int> bucket_coords_w(pos_w);
        for (int i = 0; i < pos_h; i++) {
            bucket_coords_h[i] = (int) std::floor(70.0 * i / pos_h);
        }
        for (int i = 0; i < pos_w; i++) {
            bucket_coords_w[i] = (int) std::floor(70.0 * i / pos_w);
        }
        int id = 0;
        for (int i = 0; i < pos_h; i++) {
            for (int j = 0; j < pos_w; j++) {
                positions[id++] = bucket_coords_h[i] * 70 + bucket_coords_w[j];
            }
        }
    }

    // 2D sin/cos positional embedding
    // inspired from resampler of Qwen-VL:
    // -> https://huggingface.co/Qwen/Qwen-VL/tree/main
    // -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
    std::vector<float> pos_embed;
    {
        const int embed_dim = clip_n_mmproj_embd(ctx);

        // TODO @ngxson : this is very inefficient, can we do this using ggml_sin and ggml_cos?
        auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));

        pos_embed.assign(embed_dim * std::max(pos_w * pos_h, max_patches), 0.0f);
        for (int i = 0; i < pos_w * pos_h; ++i) {
            for (int j = 0; j < embed_dim; ++j) {
                pos_embed[i * embed_dim + j] = pos_embed_t[i][j];
            }
        }
    }

    return clip_image_encode_coreml(inp_raw.data(), positions.data(), pos_embed.data(), vec, ctx->coreml_model_path.c_str());
#endif
}

bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) {
const clip_image_f32_batch & imgs = *imgs_c_ptr;
int batch_size = imgs.entries.size();
Expand Down Expand Up @@ -4421,3 +4558,9 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel
batch->entries.push_back(clip_image_f32_ptr(audio));
batch->is_audio = true;
}

// Store the CoreML model path on the clip context.
// Silently ignores a null context or null path (leaves the stored path unchanged).
void clip_set_coreml_model_path(struct clip_ctx * ctx, const char * coreml_model_path) {
    if (ctx == nullptr || coreml_model_path == nullptr) {
        return;
    }
    ctx->coreml_model_path = coreml_model_path;
}
4 changes: 4 additions & 0 deletions tools/mtmd/clip.h
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);

bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
bool clip_image_batch_encode_coreml(struct clip_ctx * ctx, const struct clip_image_f32_batch * imgs, float * vec);

int clip_is_minicpmv(const struct clip_ctx * ctx);
bool clip_is_glm(const struct clip_ctx * ctx);
Expand All @@ -104,3 +105,6 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel
bool clip_has_vision_encoder(const struct clip_ctx * ctx);
bool clip_has_audio_encoder(const struct clip_ctx * ctx);
bool clip_has_whisper_encoder(const struct clip_ctx * ctx);

// CoreML support functions
void clip_set_coreml_model_path(struct clip_ctx * ctx, const char * coreml_model_path);
Loading
Loading