From f37f8a95de4d1bf6e77ccea2785fe32386bad6df Mon Sep 17 00:00:00 2001
From: tc_mb <caitianchi@modelbest.cn>
Date: Mon, 7 Jul 2025 14:58:15 +0800
Subject: [PATCH 01/15] support minicpm-v 4

---
 tools/mtmd/clip.cpp                            | 18 ++++++++++++++++++
 .../minicpmv-convert-image-encoder-to-gguf.py  | 17 ++++++++++++-----
 tools/mtmd/mtmd.cpp                            |  2 +-
 3 files changed, 31 insertions(+), 6 deletions(-)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 9146c9e9c4481..698c38ba2c4ff 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -849,10 +849,16 @@ struct clip_graph {
             int n_head = n_embd/d_head;
             int num_query = 96;
             if (ctx->model.hparams.minicpmv_version == 2) {
+                // MiniCPM-V 2.5
                 num_query = 96;
             } else if (ctx->model.hparams.minicpmv_version == 3) {
+                // MiniCPM-V 2.6
                 num_query = 64;
             } else if (ctx->model.hparams.minicpmv_version == 4) {
+                // MiniCPM-o 2.6
+                num_query = 64;
+            } else if (ctx->model.hparams.minicpmv_version == 5) {
+                // MiniCPM-V 4.0
                 num_query = 64;
             }
 
@@ -3518,10 +3524,16 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
         case PROJECTOR_TYPE_MINICPMV:
             {
                 if (params.minicpmv_version == 2) {
+                    // MiniCPM-V 2.5
                     n_patches_sq = 96;
                 } else if (params.minicpmv_version == 3) {
+                    // MiniCPM-V 2.6
                     n_patches_sq = 64;
                 } else if (params.minicpmv_version == 4) {
+                    // MiniCPM-o 2.6
+                    n_patches_sq = 64;
+                } else if (params.minicpmv_version == 5) {
+                    // MiniCPM-V 4.0
                     n_patches_sq = 64;
                 } else {
                     GGML_ABORT("Unknown minicpmv version");
@@ -4060,11 +4072,17 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
             return ctx->model.mm_3_b->ne[0];
         case PROJECTOR_TYPE_MINICPMV:
             if (hparams.minicpmv_version == 2) {
+                // MiniCPM-V 2.5
                 return 4096;
             } else if (hparams.minicpmv_version == 3) {
+                // MiniCPM-V 2.6
                 return 3584;
             } else if (hparams.minicpmv_version == 4) {
+                // MiniCPM-o 2.6
                 return 3584;
+            } else if (hparams.minicpmv_version == 5) {
+                // MiniCPM-V 4.0
+                return 2560;
             }
             GGML_ABORT("Unknown minicpmv version");
         case PROJECTOR_TYPE_GLM_EDGE:
diff --git a/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py b/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py
index cfe0961f9891a..ff01bbdc01981 100644
--- a/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py
+++ b/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py
@@ -546,19 +546,22 @@ def bytes_to_unicode():
 minicpmv_version = args.minicpmv_version
 emb_dim = 4096
 block_count = 26
-if minicpmv_version == 1:
+if minicpmv_version == 1:  # MiniCPM-V 2.0
     emb_dim = 2304
     block_count = 26
-elif minicpmv_version == 2:
+elif minicpmv_version == 2:  # MiniCPM-V 2.5
     emb_dim = 4096
     block_count = 27
-elif minicpmv_version == 3:
+elif minicpmv_version == 3:  # MiniCPM-V 2.6
     emb_dim = 3584
     block_count = 27
-elif minicpmv_version == 4:
+elif minicpmv_version == 4:  # MiniCPM-o 2.6
     emb_dim = 3584
     block_count = 27
-
+elif minicpmv_version == 5:  # MiniCPM-V 4.0
+    emb_dim = 2560
+    block_count = 27
+    
 default_vision_config = {
         "hidden_size": 1152,
         "image_size": 980,
@@ -577,6 +580,10 @@ def bytes_to_unicode():
 elif minicpmv_version == 4:
     vision_config = SiglipVisionConfig(**default_vision_config)
     model = SiglipVisionTransformer(vision_config)
+elif minicpmv_version == 5:
+    default_vision_config["model_type"] = "siglip_vision_model"
+    vision_config = SiglipVisionConfig(**default_vision_config)
+    model = SiglipVisionTransformer(vision_config)
 
 processor = None
 # if model.attn_pool is not None:
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index e3829738338c3..66553f838bd86 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -207,7 +207,7 @@ struct mtmd_context {
             tok_row_end_trail = false; // no trailing end-of-row token
             ov_img_first      = true;
 
-        } else if (minicpmv_version == 3 || minicpmv_version == 4) {
+        } else if (minicpmv_version == 3 || minicpmv_version == 4 || minicpmv_version == 5) {
             // minicpmv 2.6 format:
             // <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
             slice_tmpl        = MTMD_SLICE_TMPL_MINICPMV_2_6;

From 220ad75fa369e42b2cc98c2da4bd8f1128457311 Mon Sep 17 00:00:00 2001
From: tc_mb <caitianchi@modelbest.cn>
Date: Thu, 10 Jul 2025 15:57:11 +0800
Subject: [PATCH 02/15] ane test

---
 tools/mtmd/CMakeLists.txt |  31 +++
 tools/mtmd/ane.h          |  11 +
 tools/mtmd/ane.mm         |  49 +++++
 tools/mtmd/clip.cpp       | 451 ++++++++++++++++++++++++++++++++++++++
 tools/mtmd/clip.h         |   3 +
 5 files changed, 545 insertions(+)
 create mode 100644 tools/mtmd/ane.h
 create mode 100644 tools/mtmd/ane.mm

diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt
index 4baa15b9609fc..e0e257ed9fd3d 100644
--- a/tools/mtmd/CMakeLists.txt
+++ b/tools/mtmd/CMakeLists.txt
@@ -13,6 +13,20 @@ add_library(mtmd
             mtmd-helper.h
             )
 
+# 在Apple平台上添加ANE相关文件
+if(APPLE)
+    target_sources(mtmd PRIVATE
+        ane.h
+        ane.mm
+        ane_minicpm4v3b_vision_f16_b1.h
+        ane_minicpm4v3b_vision_f16_b1.m
+    )
+    
+    # 为Objective-C文件启用ARC
+    set_source_files_properties(ane.mm PROPERTIES COMPILE_FLAGS "-fobjc-arc")
+    set_source_files_properties(ane_minicpm4v3b_vision_f16_b1.m PROPERTIES COMPILE_FLAGS "-fobjc-arc")
+endif()
+
 target_link_libraries     (mtmd PUBLIC ggml llama)
 target_link_libraries     (mtmd PRIVATE Threads::Threads)
 target_include_directories(mtmd PUBLIC  .)
@@ -20,6 +34,16 @@ target_include_directories(mtmd PRIVATE ../..)
 target_include_directories(mtmd PRIVATE ../../vendor)
 target_compile_features   (mtmd PRIVATE cxx_std_17)
 
+# 在Apple平台上链接CoreML和Accelerate框架
+if(APPLE)
+    target_link_libraries(mtmd PRIVATE 
+        "-framework Foundation" 
+        "-framework CoreML" 
+        "-framework Accelerate"
+        "-ObjC"
+    )
+endif()
+
 if (BUILD_SHARED_LIBS)
     set_target_properties     (mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON)
     target_compile_definitions(mtmd PRIVATE LLAMA_BUILD)
@@ -31,6 +55,13 @@ set(MTMD_PUBLIC_HEADERS
     ${CMAKE_CURRENT_SOURCE_DIR}/mtmd-helper.h
     )
 
+# 在Apple平台上添加ANE公共头文件
+if(APPLE)
+    list(APPEND MTMD_PUBLIC_HEADERS
+        ${CMAKE_CURRENT_SOURCE_DIR}/ane.h
+    )
+endif()
+
 set_target_properties(mtmd
     PROPERTIES
     PUBLIC_HEADER "${MTMD_PUBLIC_HEADERS}")
diff --git a/tools/mtmd/ane.h b/tools/mtmd/ane.h
new file mode 100644
index 0000000000000..7fe7ad5c9a347
--- /dev/null
+++ b/tools/mtmd/ane.h
@@ -0,0 +1,11 @@
+#pragma once
+#ifdef __cplusplus
+extern "C" {
+#endif
+// C bridge to the Core ML vision encoder implemented in ane.mm; the model handle is opaque.
+const void* loadModel(void);   // (void) makes this a real prototype when included from C
+void closeModel(const void* model);
+void predictWith(const void* model, float* embed, float* encoderOutput);
+#ifdef __cplusplus
+}   // extern "C"
+#endif
diff --git a/tools/mtmd/ane.mm b/tools/mtmd/ane.mm
new file mode 100644
index 0000000000000..8f92641068df0
--- /dev/null
+++ b/tools/mtmd/ane.mm
@@ -0,0 +1,49 @@
+#import <CoreML/CoreML.h>
+#import <Accelerate/Accelerate.h>
+#import "ane.h"
+#import "ane_minicpm4v3b_vision_f16_b1.h"
+#include <stdlib.h>
+
+#if __cplusplus
+extern "C" {
+#endif
+
+const void* loadModel() {
+    // Loads the compiled Core ML model from the app's Documents directory.
+    // Returns a retained handle (release it with closeModel), or NULL when loading fails.
+    NSFileManager *fileManager = [NSFileManager defaultManager];
+    NSURL *documentsURL = [[fileManager URLsForDirectory:NSDocumentDirectory inDomains:NSUserDomainMask] firstObject];
+    NSString *pathString = [documentsURL.absoluteString stringByAppendingString:@"ane_minicpm4v3b_vision_f16_b1.mlmodelc"];
+    NSURL *modelURL = [NSURL URLWithString:pathString];
+    // never use the path itself as the format string: its percent-escapes would be parsed as conversions
+    NSLog(@"%@", modelURL.absoluteString);
+    NSError *error = nil;
+    const void* model = CFBridgingRetain([[ane_minicpm4v3b_vision_f16_b1 alloc] initWithContentsOfURL:modelURL error:&error]);
+    if (model == NULL) { NSLog(@"loadModel: failed to load %@: %@", modelURL, error); }
+    return model;
+}
+
+void predictWith(const void* model, float* embed, float* encoderOutput) { // runs the encoder on embed (wrapped as 1x1024x1152 float32) and copies the output floats to encoderOutput
+    NSError *error = nil; // filled in by Core ML when array creation or inference fails
+    MLMultiArray *inMultiArray = [[MLMultiArray alloc] initWithDataPointer: embed
+                                                                      shape: @[@1, @1024, @1152]
+                                                                   dataType: MLMultiArrayDataTypeFloat32
+                                                                    strides: @[@(1179648), @(1152), @1]
+                                                                deallocator: nil
+                                                                      error: &error];
+    ane_minicpm4v3b_vision_f16_b1Output *modelOutput = inMultiArray ? [(__bridge id)model predictionFromInput:inMultiArray error:&error] : nil;
+    MLMultiArray *outMA = modelOutput.output;
+    // on failure encoderOutput is left untouched; report it instead of failing silently
+    if (outMA == nil) { NSLog(@"predictWith: Core ML inference failed: %@", error); return; }
+    cblas_scopy((int)outMA.count,
+                (float*)outMA.dataPointer, 1,
+                encoderOutput, 1);
+}
+
+void closeModel(const void* model) { // balances the retain taken in loadModel
+    if (model != NULL) { CFRelease(model); } // CFRelease(NULL) crashes, and loadModel may have returned NULL
+}
+
+#if __cplusplus
+} //Extern C
+#endif
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 698c38ba2c4ff..502a6582010c0 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -10,6 +10,7 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 #include "gguf.h"
+#include "ane.h"
 
 #include <cassert>
 #include <cmath>
@@ -898,6 +899,126 @@ struct clip_graph {
         return gf;
     }
 
+    ggml_cgraph * build_minicpmv_embedding() { // graph for the input stage only: build_inp() + learned position embeddings + optional pre-layernorm
+        const int batch_size = 1; // NOTE(review): not referenced anywhere in this function
+
+        GGML_ASSERT(model.class_embedding == nullptr);
+        const int n_pos = n_patches;
+
+        // I32 input named "positions": row indices into model.position_embeddings, filled in by the caller
+        struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
+        ggml_set_name(positions, "positions");
+        ggml_set_input(positions);
+
+        ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions);
+
+        ggml_tensor * inp = build_inp();
+        if (learned_pos_embd) {
+            inp = ggml_add(ctx0, inp, learned_pos_embd);
+            cb(inp, "pos_embed", -1);
+        }
+        ggml_tensor * embeddings = inp;
+
+        // pre-layernorm: normalise, then scale by pre_ln_w and shift by pre_ln_b (skipped when pre_ln_w is absent)
+        if (model.pre_ln_w) {
+            embeddings = ggml_norm(ctx0, embeddings, eps);
+            ggml_set_name(embeddings, "pre_ln");
+            embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b);
+        }
+
+        ggml_build_forward_expand(gf, embeddings); // embeddings becomes the graph's final node
+        return gf;
+    }
+
+    ggml_cgraph * build_minicpmv_resampler() { // graph for the projector only: takes encoder output through the "embeddings" input instead of running the ViT
+        const int batch_size = 1;
+
+        GGML_ASSERT(model.class_embedding == nullptr);
+        const int n_pos = n_patches;
+        
+        const int image_size_width  = img.nx;
+        const int image_size_height = img.ny;
+        const int patch_size    = hparams.patch_size;
+        const int num_patches   = ((image_size_width / patch_size) * (image_size_height / patch_size));
+
+        // F32 input "pos_embed" for the projector (not for ViT): n_output_dim x n_pos x batch_size, filled in by the caller
+        int n_output_dim = clip_n_mmproj_embd(ctx);
+        ggml_tensor * pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_output_dim, n_pos, batch_size);
+        ggml_set_name(pos_embed, "pos_embed");
+        ggml_set_input(pos_embed);
+
+        struct ggml_tensor * embeddings = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1152, num_patches); // NOTE(review): 1152 is hard-coded and num_patches is used here while pos_embed uses n_pos — confirm both always agree
+        ggml_set_name(embeddings, "embeddings");
+        ggml_set_input(embeddings);
+
+        // resampler projector (it is just another transformer)
+
+        ggml_tensor * q = model.mm_model_query;
+        ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings);
+
+        // normalise the queries and the projected encoder output with their own weight/bias pairs
+        q = build_norm(q, model.mm_model_ln_q_w, model.mm_model_ln_q_b, NORM_TYPE_NORMAL, eps, -1);
+        v = build_norm(v, model.mm_model_ln_kv_w, model.mm_model_ln_kv_b, NORM_TYPE_NORMAL, eps, -1);
+
+        // k = v + pos_embed
+        ggml_tensor * k = ggml_add(ctx0, v, pos_embed);
+
+        // attention
+        {
+            int n_embd = clip_n_mmproj_embd(ctx);
+            const int d_head = 128;
+            int n_head = n_embd/d_head;
+            int num_query = 96; // default, also used for versions not listed below
+            if (ctx->model.hparams.minicpmv_version == 2) {
+                // MiniCPM-V 2.5
+                num_query = 96;
+            } else if (ctx->model.hparams.minicpmv_version == 3) {
+                // MiniCPM-V 2.6
+                num_query = 64;
+            } else if (ctx->model.hparams.minicpmv_version == 4) {
+                // MiniCPM-o 2.6
+                num_query = 64;
+            } else if (ctx->model.hparams.minicpmv_version == 5) {
+                // MiniCPM-V 4.0
+                num_query = 64;
+            }
+
+            ggml_tensor * Q = ggml_add(ctx0,
+                ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q),
+                model.mm_model_attn_q_b);
+            ggml_tensor * K = ggml_add(ctx0,
+                ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k),
+                model.mm_model_attn_k_b);
+            ggml_tensor * V = ggml_add(ctx0,
+                ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v),
+                model.mm_model_attn_v_b);
+
+            Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_query); // split into heads: d_head x n_head x num_query
+            K = ggml_reshape_3d(ctx0, K, d_head, n_head, n_pos);
+            V = ggml_reshape_3d(ctx0, V, d_head, n_head, n_pos);
+
+            cb(Q, "resampler_Q", -1);
+            cb(K, "resampler_K", -1);
+            cb(V, "resampler_V", -1);
+
+            embeddings = build_attn(
+                model.mm_model_attn_o_w,
+                model.mm_model_attn_o_b,
+                Q, K, V, nullptr, kq_scale, -1);
+            cb(embeddings, "resampler_attn_out", -1);
+        }
+        // post-attention layernorm
+        embeddings = build_norm(embeddings, model.mm_model_ln_post_w, model.mm_model_ln_post_b, NORM_TYPE_NORMAL, eps, -1);
+
+        // final projection with mm_model_proj
+        embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings);
+
+        // build the graph; the projected embeddings become its final node
+        ggml_build_forward_expand(gf, embeddings);
+
+        return gf;
+    }
+
     ggml_cgraph * build_internvl() {
         GGML_ASSERT(model.class_embedding != nullptr);
         GGML_ASSERT(model.position_embeddings != nullptr);
@@ -3678,15 +3799,345 @@ static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, co
     return pos_embed_2d;
 }
 
+static bool clip_image_encode_ane(float * data, float * vec) {
+    // Runs the CoreML encoder on `data` into `vec`; the model is loaded on first use and then kept.
+    static const void* coremlEncoder = NULL;
+    if (coremlEncoder == NULL) {
+        coremlEncoder = loadModel();
+    }
+    if (coremlEncoder == NULL) { return false; } // model missing or failed to load
+    predictWith(coremlEncoder, data, vec);
+    return true; // previously control fell off the end of this non-void function (undefined behavior)
+}
+
 bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
     clip_image_f32_batch imgs;
     clip_image_f32_ptr img_copy(clip_image_f32_init());
     *img_copy = *img;
     imgs.entries.push_back(std::move(img_copy));
 
+    bool ios_ctx = true;
+    if (ios_ctx){
+        printf("clip use ane\n");
+        // Scratch buffers between the three stages; vectors are zero-filled and released on every path.
+        // NOTE(review): 1100*1152 is a fixed capacity — confirm it always bounds n_patches * 1152.
+        std::vector<float> vit_embedding1(1100*1152);
+        std::vector<float> vit_embedding2(1100*1152);
+        // stop at the first failing stage instead of unconditionally reporting success
+        bool ok = ane_embedding(ctx, n_threads, &imgs, vit_embedding1.data());
+        ok = ok && clip_image_encode_ane(vit_embedding1.data(), vit_embedding2.data());
+        ok = ok && ane_resampler(ctx, n_threads, &imgs, vit_embedding2.data(), vec);
+        return ok;
+    }
+
     return clip_image_batch_encode(ctx, n_threads, &imgs, vec);
 }
 
+bool ane_embedding(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) { // computes the build_minicpmv_embedding() graph for one image and copies its last node into vec
+    const clip_image_f32_batch & imgs = *imgs_c_ptr;
+    int batch_size = imgs.entries.size();
+
+    // TODO @ngxson : implement batch size > 1 as a loop
+    //                we don't need true batching support because the cgraph will gonna be big anyway
+    if (batch_size != 1) {
+        return false; // only support batch size of 1
+    }
+
+    // build the inference graph (embedding stage only) and allocate it on the scheduler
+    ctx->debug_print_tensors.clear();
+    ggml_backend_sched_reset(ctx->sched.get());
+    GGML_ASSERT(imgs.entries.size() == 1 && "n_batch > 1 is not supported");
+    clip_graph graph(ctx, *imgs.entries[0]);
+    ggml_cgraph * gf;
+    gf = graph.build_minicpmv_embedding();
+    ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
+
+    // set inputs
+    const auto & model   = ctx->model;
+    const auto & hparams = model.hparams;
+
+    const int image_size_width  = imgs.entries[0]->nx;
+    const int image_size_height = imgs.entries[0]->ny;
+
+    const int patch_size    = hparams.patch_size;
+    const int num_patches   = ((image_size_width / patch_size) * (image_size_height / patch_size));
+    const int n_pos = num_patches + (model.class_embedding ? 1 : 0);
+    const int pos_w = image_size_width  / patch_size;
+    const int pos_h = image_size_height / patch_size;
+
+    auto get_inp_tensor = [&gf](const char * name) { // aborts unless the graph has an input-flagged tensor with this name
+        ggml_tensor * inp = ggml_graph_get_tensor(gf, name);
+        if (inp == nullptr) {
+            GGML_ABORT("Failed to get tensor %s", name);
+        }
+        if (!(inp->flags & GGML_TENSOR_FLAG_INPUT)) {
+            GGML_ABORT("Tensor %s is not an input tensor", name);
+        }
+        return inp;
+    };
+
+    auto set_input_f32 = [&get_inp_tensor](const char * name, std::vector<float> & values) { // uploads values after asserting type and element count
+        ggml_tensor * cur = get_inp_tensor(name);
+        GGML_ASSERT(cur->type == GGML_TYPE_F32);
+        GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size());
+        ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur));
+    };
+
+    auto set_input_i32 = [&get_inp_tensor](const char * name, std::vector<int32_t> & values) {
+        ggml_tensor * cur = get_inp_tensor(name);
+        GGML_ASSERT(cur->type == GGML_TYPE_I32);
+        GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size());
+        ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur));
+    };
+    // set input pixel values: interleaved per-pixel channels in buf are rewritten as three planes
+    if (!imgs.is_audio) {
+        size_t nelem = 0;
+        for (const auto & img : imgs.entries) {
+            nelem += img->nx * img->ny * 3;
+        }
+        std::vector<float> inp_raw(nelem);
+
+        // layout of data (note: the channel dim is unrolled to better visualize the layout):
+        //
+        // ┌──W──┐
+        // │     H │  channel = R
+        // ├─────┤ │
+        // │     H │  channel = G
+        // ├─────┤ │
+        // │     H │  channel = B
+        // └─────┘ │
+        //   ──────┘ x B
+
+        for (size_t i = 0; i < imgs.entries.size(); i++) {
+            const int nx = imgs.entries[i]->nx;
+            const int ny = imgs.entries[i]->ny;
+            const int n = nx * ny;
+
+            for (int b = 0; b < batch_size; b++) {
+                float * batch_entry = inp_raw.data() + b * (3*n);
+                for (int y = 0; y < ny; y++) {
+                    for (int x = 0; x < nx; x++) {
+                        size_t base_src = 3*(y * nx + x); // idx of the first channel
+                        size_t base_dst =    y * nx + x;  // idx of the first channel
+                        batch_entry[      base_dst] = imgs.entries[b]->buf[base_src    ];
+                        batch_entry[1*n + base_dst] = imgs.entries[b]->buf[base_src + 1];
+                        batch_entry[2*n + base_dst] = imgs.entries[b]->buf[base_src + 2];
+                    }
+                }
+            }
+        }
+        set_input_f32("inp_raw", inp_raw);
+
+    } else {
+        // audio input: buf is copied unchanged as nx * ny floats
+        GGML_ASSERT(imgs.entries.size() == 1);
+        const auto & mel_inp = imgs.entries[0];
+        const int n_step = mel_inp->nx;
+        const int n_mel  = mel_inp->ny;
+        std::vector<float> inp_raw(n_step * n_mel);
+        std::memcpy(inp_raw.data(), mel_inp->buf.data(), n_step * n_mel * sizeof(float));
+        set_input_f32("inp_raw", inp_raw);
+    }
+
+    switch (ctx->model.proj_type) {
+        case PROJECTOR_TYPE_MINICPMV:
+            {
+                // inspired from siglip:
+                //    -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit
+                //    -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
+                std::vector<int32_t> positions(pos_h * pos_w);
+                int bucket_coords_h[1024]; // NOTE(review): fixed capacity, indexed up to pos_h - 1 with no bound check
+                int bucket_coords_w[1024];
+                for (int i = 0; i < pos_h; i++){
+                    bucket_coords_h[i] = std::floor(70.0*i/pos_h);
+                }
+                for (int i = 0; i < pos_w; i++){
+                    bucket_coords_w[i] = std::floor(70.0*i/pos_w);
+                }
+                for (int i = 0, id = 0; i < pos_h; i++){
+                    for (int j = 0; j < pos_w; j++){
+                        positions[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j];
+                    }
+                }
+                set_input_i32("positions", positions);
+            } break;
+            default:
+            GGML_ABORT("Unknown projector type");
+    }
+
+    // set the CPU backend thread count through the registry lookup (equivalent direct call kept for reference):
+    // ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads);
+    ggml_backend_dev_t dev = ggml_backend_get_device(ctx->backend_cpu);
+    ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr;
+    if (reg) {
+        auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
+        if (ggml_backend_set_n_threads_fn) {
+            ggml_backend_set_n_threads_fn(ctx->backend_cpu, n_threads);
+        }
+    }
+
+    auto status = ggml_backend_sched_graph_compute(ctx->sched.get(), gf);
+    if (status != GGML_STATUS_SUCCESS) {
+        LOG_ERR("%s: ggml_backend_sched_graph_compute failed with error %d\n", __func__, status);
+        return false;
+    }
+
+    // print debug nodes
+    if (ctx->debug_graph) {
+        LOG_INF("\n\n---\n\n");
+        LOG_INF("\n\nDebug graph:\n\n");
+        for (ggml_tensor * t : ctx->debug_print_tensors) {
+            std::vector<uint8_t> data(ggml_nbytes(t));
+            ggml_backend_tensor_get(t, data.data(), 0, ggml_nbytes(t));
+            print_tensor_shape(t);
+            print_tensor_data(t, data.data(), 3);
+        }
+    }
+
+    // the last node is the embedding tensor
+    ggml_tensor * embeddings = ggml_graph_node(gf, -1);
+
+    // NOTE(review): n_tokens_out is read but never checked or used in this function
+    const int n_tokens_out = embeddings->ne[1];
+
+    // copy the embeddings to the location passed by the user (vec must hold ggml_nbytes(embeddings) bytes)
+    ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
+
+    return true;
+}
+
+bool ane_resampler(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, const float * vit_embedding, float * vec) { // feeds vit_embedding through the build_minicpmv_resampler() graph and copies the result into vec
+    const clip_image_f32_batch & imgs = *imgs_c_ptr;
+    int batch_size = imgs.entries.size();
+
+    // TODO @ngxson : implement batch size > 1 as a loop
+    //                we don't need true batching support because the cgraph will gonna be big anyway
+    if (batch_size != 1) {
+        return false; // only support batch size of 1
+    }
+
+    // build the inference graph (projector stage only) and allocate it on the scheduler
+    ctx->debug_print_tensors.clear();
+    ggml_backend_sched_reset(ctx->sched.get());
+    GGML_ASSERT(imgs.entries.size() == 1 && "n_batch > 1 is not supported");
+    clip_graph graph(ctx, *imgs.entries[0]);
+    ggml_cgraph * gf;
+    gf = graph.build_minicpmv_resampler();
+    ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
+
+    // set inputs
+    const auto & model   = ctx->model;
+    const auto & hparams = model.hparams;
+
+    const int image_size_width  = imgs.entries[0]->nx;
+    const int image_size_height = imgs.entries[0]->ny;
+
+    const int patch_size    = hparams.patch_size;
+    const int num_patches   = ((image_size_width / patch_size) * (image_size_height / patch_size));
+    const int n_pos = num_patches + (model.class_embedding ? 1 : 0);
+    const int pos_w = image_size_width  / patch_size;
+    const int pos_h = image_size_height / patch_size;
+
+    auto get_inp_tensor = [&gf](const char * name) { // aborts unless the graph has an input-flagged tensor with this name
+        ggml_tensor * inp = ggml_graph_get_tensor(gf, name);
+        if (inp == nullptr) {
+            GGML_ABORT("Failed to get tensor %s", name);
+        }
+        if (!(inp->flags & GGML_TENSOR_FLAG_INPUT)) {
+            GGML_ABORT("Tensor %s is not an input tensor", name);
+        }
+        return inp;
+    };
+
+    auto set_input_f32 = [&get_inp_tensor](const char * name, std::vector<float> & values) { // uploads values after asserting type and element count
+        ggml_tensor * cur = get_inp_tensor(name);
+        GGML_ASSERT(cur->type == GGML_TYPE_F32);
+        GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size());
+        ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur));
+    };
+
+    auto set_input_i32 = [&get_inp_tensor](const char * name, std::vector<int32_t> & values) {
+        ggml_tensor * cur = get_inp_tensor(name);
+        GGML_ASSERT(cur->type == GGML_TYPE_I32);
+        GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size());
+        ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur));
+    };
+
+    {
+        // go through get_inp_tensor like every other input, so a missing tensor aborts with a message instead of a null dereference
+        struct ggml_tensor * embeddings = get_inp_tensor("embeddings");
+        ggml_backend_tensor_set(embeddings, vit_embedding, 0, ggml_nbytes(embeddings));
+    }
+
+    switch (ctx->model.proj_type) {
+        case PROJECTOR_TYPE_MINICPMV:
+            {
+                // inspired from resampler of Qwen-VL:
+                //    -> https://huggingface.co/Qwen/Qwen-VL/tree/main
+                //    -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
+                int embed_dim = clip_n_mmproj_embd(ctx);
+
+                // TODO @ngxson : this is very inefficient, can we do this using ggml_sin and ggml_cos?
+                auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
+
+                std::vector<float> pos_embed(embed_dim * pos_w * pos_h); // flattened copy: one embed_dim-sized row per position
+                for(int i = 0; i < pos_w * pos_h; ++i){
+                    for(int j = 0; j < embed_dim; ++j){
+                        pos_embed[i * embed_dim + j] = pos_embed_t[i][j];
+                    }
+                }
+
+                set_input_f32("pos_embed", pos_embed);
+            } break;
+            default:
+            GGML_ABORT("Unknown projector type");
+    }
+
+    // set the CPU backend thread count through the registry lookup (equivalent direct call kept for reference):
+    // ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads);
+    ggml_backend_dev_t dev = ggml_backend_get_device(ctx->backend_cpu);
+    ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr;
+    if (reg) {
+        auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
+        if (ggml_backend_set_n_threads_fn) {
+            ggml_backend_set_n_threads_fn(ctx->backend_cpu, n_threads);
+        }
+    }
+
+    auto status = ggml_backend_sched_graph_compute(ctx->sched.get(), gf);
+    if (status != GGML_STATUS_SUCCESS) {
+        LOG_ERR("%s: ggml_backend_sched_graph_compute failed with error %d\n", __func__, status);
+        return false;
+    }
+
+    // print debug nodes
+    if (ctx->debug_graph) {
+        LOG_INF("\n\n---\n\n");
+        LOG_INF("\n\nDebug graph:\n\n");
+        for (ggml_tensor * t : ctx->debug_print_tensors) {
+            std::vector<uint8_t> data(ggml_nbytes(t));
+            ggml_backend_tensor_get(t, data.data(), 0, ggml_nbytes(t));
+            print_tensor_shape(t);
+            print_tensor_data(t, data.data(), 3);
+        }
+    }
+
+    // the last node is the embedding tensor
+    ggml_tensor * embeddings = ggml_graph_node(gf, -1);
+
+    // sanity check (only support batch size of 1 for now): output token count must match clip_n_output_tokens
+    const int n_tokens_out = embeddings->ne[1];
+    const int expected_n_tokens_out = clip_n_output_tokens(ctx, imgs.entries[0].get());
+    if (n_tokens_out != expected_n_tokens_out) {
+        LOG_ERR("%s: expected output %d tokens, got %d\n", __func__, expected_n_tokens_out, n_tokens_out);
+        GGML_ABORT("Invalid number of output tokens");
+    }
+
+    // copy the embeddings to the location passed by the user (vec must hold ggml_nbytes(embeddings) bytes)
+    ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
+
+    return true;
+}
+
+
 bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) {
     const clip_image_f32_batch & imgs = *imgs_c_ptr;
     int batch_size = imgs.entries.size();
diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h
index 08f3efb7b1daf..25ad15fd53687 100644
--- a/tools/mtmd/clip.h
+++ b/tools/mtmd/clip.h
@@ -95,6 +95,9 @@ struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
 bool clip_image_encode      (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
 bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
 
+bool ane_embedding(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
+bool ane_resampler(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, const float * vit_embedding, float * vec);
+
 int clip_is_minicpmv(const struct clip_ctx * ctx);
 bool clip_is_glm(const struct clip_ctx * ctx);
 bool clip_is_qwen2vl(const struct clip_ctx * ctx);

From 88988625e6bbc61c26b35e1295cceac597575093 Mon Sep 17 00:00:00 2001
From: tc_mb <caitianchi@modelbest.cn>
Date: Thu, 10 Jul 2025 16:08:44 +0800
Subject: [PATCH 03/15] test code for ios

---
 tools/mtmd/CMakeLists.txt |   3 +
 tools/mtmd/mtmd-ios.cpp   | 254 ++++++++++++++++++++++++++++++++++++++
 tools/mtmd/mtmd-ios.h     |  64 ++++++++++
 3 files changed, 321 insertions(+)
 create mode 100644 tools/mtmd/mtmd-ios.cpp
 create mode 100644 tools/mtmd/mtmd-ios.h

diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt
index e0e257ed9fd3d..58e47f20aae30 100644
--- a/tools/mtmd/CMakeLists.txt
+++ b/tools/mtmd/CMakeLists.txt
@@ -11,6 +11,8 @@ add_library(mtmd
             clip-impl.h
             mtmd-helper.cpp
             mtmd-helper.h
+            mtmd-ios.cpp
+            mtmd-ios.h
             )
 
 # 在Apple平台上添加ANE相关文件
@@ -53,6 +55,7 @@ endif()
 set(MTMD_PUBLIC_HEADERS
     ${CMAKE_CURRENT_SOURCE_DIR}/mtmd.h
     ${CMAKE_CURRENT_SOURCE_DIR}/mtmd-helper.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/mtmd-ios.h
     )
 
 # 在Apple平台上添加ANE公共头文件
diff --git a/tools/mtmd/mtmd-ios.cpp b/tools/mtmd/mtmd-ios.cpp
new file mode 100644
index 0000000000000..6a76d2e5abdcb
--- /dev/null
+++ b/tools/mtmd/mtmd-ios.cpp
@@ -0,0 +1,254 @@
+#include "mtmd-ios.h"
+#include "arg.h"
+#include "log.h"
+#include "common.h"
+#include "sampling.h"
+#include "llama.h"
+#include "ggml.h"
+#include "chat.h"
+#include "mtmd.h"
+#include "mtmd-helper.h"
+
+#include <vector>
+#include <string>
+#include <limits.h>
+#include <cinttypes>
+#include <memory>
+#include <cstring>
+#include <cstdlib>
+
+struct mtmd_ios_context {
+    mtmd::context_ptr ctx_vision;    // multimodal (mmproj) context
+    common_init_result llama_init;   // owns the model/context that the raw pointers below alias
+    // Raw views are filled in by mtmd_ios_init(); everything defaults to "not allocated".
+    llama_model* model = nullptr;
+    llama_context* lctx = nullptr;
+    const llama_vocab* vocab = nullptr;
+    common_sampler* smpl = nullptr;  // released in the destructor
+    llama_batch batch = {};          // released in the destructor once batch.token is set
+    
+    mtmd::bitmaps bitmaps;           // media queued for the next tokenize call
+    common_chat_templates_ptr tmpls;
+    
+    int n_threads = 0;
+    llama_pos n_past = 0;            // next position to decode at
+    int n_predict = 0;
+    
+    std::string last_error;          // exposed through mtmd_ios_get_last_error()
+    // The default member initializers keep these checks well-defined however the object was created.
+    ~mtmd_ios_context() {
+        if (batch.token) {
+            llama_batch_free(batch);
+        }
+        if (smpl) {
+            common_sampler_free(smpl);
+        }
+    }
+};
+
+// Releases a string handed out by mtmd_ios_generate(); a null pointer is accepted.
+void mtmd_ios_string_free(char* str) {
+    // free() on a null pointer is a no-op, so no explicit guard is needed.
+    free(str);
+}
+
+static void set_error(mtmd_ios_context* ctx, const std::string& error) {
+    ctx->last_error.assign(error); // read back later through mtmd_ios_get_last_error()
+}
+
+static bool load_media_from_buffer(mtmd_ios_context* ctx, const unsigned char* buffer, size_t size) {
+    mtmd::bitmap decoded(mtmd_helper_bitmap_init_from_buf(ctx->ctx_vision.get(), buffer, size));
+    const bool ok = static_cast<bool>(decoded.ptr); // an empty handle is reported as failure
+    if (ok) {
+        ctx->bitmaps.entries.push_back(std::move(decoded)); // queued for the next mtmd_tokenize() call
+    }
+    return ok;
+}
+
+static int eval_message_internal(mtmd_ios_context* ctx, const common_chat_msg& msg, bool add_bos = false) {
+    common_chat_templates_inputs tmpl_inputs;
+    tmpl_inputs.messages = {msg};
+    tmpl_inputs.add_generation_prompt = true;
+    tmpl_inputs.use_jinja = false;
+    // Formats one message, tokenizes it with the queued media and evaluates it; returns 0 on success, 1 on error.
+    auto formatted_chat = common_chat_templates_apply(ctx->tmpls.get(), tmpl_inputs);
+    
+    mtmd_input_text text;
+    text.text = formatted_chat.prompt.c_str();
+    text.add_special = add_bos;
+    text.parse_special = true;
+    
+    mtmd::input_chunks chunks(mtmd_input_chunks_init());
+    auto bitmaps_c_ptr = ctx->bitmaps.c_ptr();
+    int32_t res = mtmd_tokenize(ctx->ctx_vision.get(),
+                               chunks.ptr.get(),
+                               &text,
+                               bitmaps_c_ptr.data(),
+                               bitmaps_c_ptr.size());
+    ctx->bitmaps.entries.clear(); // cleared on failure too, so stale media never leaks into the next message
+    if (res != 0) {
+        set_error(ctx, "Unable to tokenize prompt, res = " + std::to_string(res));
+        return 1;
+    }
+    // Use the batch size the context was really created with; it can be smaller than the requested 2048.
+    const int32_t n_batch = (int32_t) llama_n_batch(ctx->lctx);
+    llama_pos new_n_past;
+    if (mtmd_helper_eval_chunks(ctx->ctx_vision.get(),
+                               ctx->lctx,
+                               chunks.ptr.get(),
+                               ctx->n_past,
+                               0,
+                               n_batch,
+                               true,
+                               &new_n_past)) {
+        set_error(ctx, "Unable to eval prompt");
+        return 1;
+    }
+    
+    ctx->n_past = new_n_past;
+    return 0;
+}
+
+mtmd_ios_params mtmd_ios_params_default(void) {
+    mtmd_ios_params defaults = {}; // start from an all-zero struct
+    defaults.mmproj_use_gpu = true;
+    defaults.use_gpu = true;
+    defaults.temperature = 0.2f;
+    defaults.n_threads = 4;
+    defaults.n_ctx = 4096;
+    defaults.n_predict = -1; // negative: mtmd_ios_generate() applies no explicit token cap
+    defaults.mmproj_path = nullptr; // both paths must be supplied by the caller
+    defaults.model_path = nullptr;
+    return defaults;
+}
+
+mtmd_ios_context* mtmd_ios_init(const mtmd_ios_params* params) {
+    if (!params || !params->model_path || !params->mmproj_path) {
+        return nullptr;
+    }
+    // Returns a heap-allocated context (release with mtmd_ios_free) or nullptr on any failure.
+    ggml_time_init();
+    common_init();
+    
+    auto ctx = std::make_unique<mtmd_ios_context>();
+    
+    ctx->n_predict = params->n_predict;
+    ctx->n_threads = params->n_threads;
+    ctx->n_past = 0;
+    
+    common_params common_params;
+    common_params.model.path = params->model_path;
+    common_params.mmproj.path = params->mmproj_path;
+    common_params.n_ctx = params->n_ctx;
+    common_params.n_batch = 2048;
+    common_params.cpuparams.n_threads = params->n_threads;
+    common_params.sampling.temp = params->temperature;
+    common_params.mmproj_use_gpu = params->mmproj_use_gpu;
+    if (!params->use_gpu) { common_params.n_gpu_layers = 0; } // honor a CPU-only request for the text model
+    ctx->llama_init = common_init_from_params(common_params);
+    ctx->model = ctx->llama_init.model.get();
+    ctx->lctx = ctx->llama_init.context.get();
+    // Validate first: the vocab lookup and sampler setup below dereference the model.
+    if (!ctx->model || !ctx->lctx) {
+        set_error(ctx.get(), "Failed to load model or create context"); // NOTE: discarded together with ctx
+        return nullptr;
+    }
+    
+    ctx->vocab = llama_model_get_vocab(ctx->model);
+    ctx->smpl = common_sampler_init(ctx->model, common_params.sampling);
+    ctx->batch = llama_batch_init(1, 0, 1);
+    if (!llama_model_chat_template(ctx->model, nullptr)) {
+        set_error(ctx.get(), "Model does not have chat template");
+        return nullptr;
+    }
+    
+    ctx->tmpls = common_chat_templates_init(ctx->model, "");
+    
+    mtmd_context_params mparams = mtmd_context_params_default();
+    mparams.use_gpu = params->mmproj_use_gpu;
+    mparams.print_timings = false;
+    mparams.n_threads = params->n_threads;
+    mparams.verbosity = GGML_LOG_LEVEL_INFO;
+    
+    ctx->ctx_vision.reset(mtmd_init_from_file(params->mmproj_path, ctx->model, mparams));
+    if (!ctx->ctx_vision.get()) {
+        set_error(ctx.get(), "Failed to load vision model from " + std::string(params->mmproj_path));
+        return nullptr;
+    }
+    
+    return ctx.release();
+}
+
+// Destroys a context created by mtmd_ios_init(); a null pointer is accepted.
+void mtmd_ios_free(mtmd_ios_context* ctx) {
+    // delete on a null pointer does nothing, so the explicit guard is unnecessary.
+    delete ctx;
+}
+
+char* mtmd_ios_generate(mtmd_ios_context* ctx, const mtmd_ios_message* message) {
+    // role and content are copied into std::string below, which must never be handed a null pointer.
+    if (!ctx || !message || !message->role || !message->content) {
+        return nullptr;
+    }
+    ctx->bitmaps.entries.clear(); // discard media queued by an earlier call that failed before tokenizing
+    for (int i = 0; i < message->n_images; i++) {
+        if (!load_media_from_buffer(ctx, message->image_buffers[i], message->image_sizes[i])) {
+            set_error(ctx, "Failed to load image");
+            return nullptr;
+        }
+    }
+    for (int i = 0; i < message->n_audios; i++) {
+        if (!load_media_from_buffer(ctx, message->audio_buffers[i], message->audio_sizes[i])) {
+            set_error(ctx, "Failed to load audio");
+            return nullptr;
+        }
+    }
+    // If the text carries no media marker at all, append one marker per attachment.
+    std::string prompt = message->content;
+    if (prompt.find(mtmd_default_marker()) == std::string::npos) {
+        for (int i = 0; i < message->n_images + message->n_audios; i++) {
+            prompt += mtmd_default_marker();
+        }
+    }
+    
+    common_chat_msg msg;
+    msg.role = message->role;
+    msg.content = prompt;
+    // Request BOS only while nothing has been evaluated yet, i.e. for the first message.
+    if (eval_message_internal(ctx, msg, ctx->n_past == 0)) {
+        return nullptr;
+    }
+    
+    std::string response;
+    int n_predict = ctx->n_predict < 0 ? INT_MAX : ctx->n_predict; // negative means "no explicit cap"
+    
+    for (int i = 0; i < n_predict; i++) {
+        llama_token token_id = common_sampler_sample(ctx->smpl, ctx->lctx, -1);
+        common_sampler_accept(ctx->smpl, token_id, true);
+        
+        if (llama_vocab_is_eog(ctx->vocab, token_id)) {
+            break;
+        }
+        
+        response += common_token_to_piece(ctx->lctx, token_id);
+        // Decode the sampled token at the next position before sampling again.
+        common_batch_clear(ctx->batch);
+        common_batch_add(ctx->batch, token_id, ctx->n_past++, {0}, true);
+        if (llama_decode(ctx->lctx, ctx->batch)) {
+            set_error(ctx, "failed to decode token");
+            return nullptr;
+        }
+    }
+    // The caller owns the returned buffer and releases it with mtmd_ios_string_free().
+    char* result_cstr = (char*)malloc(response.length() + 1);
+    if (result_cstr) {
+        strcpy(result_cstr, response.c_str());
+    } else {
+        set_error(ctx, "Failed to allocate response buffer");
+    }
+    return result_cstr;
+}
+
+const char* mtmd_ios_get_last_error(mtmd_ios_context* ctx) {
+    return !ctx ? nullptr : ctx->last_error.c_str(); // borrowed from ctx; not to be passed to mtmd_ios_string_free()
+}
diff --git a/tools/mtmd/mtmd-ios.h b/tools/mtmd/mtmd-ios.h
new file mode 100644
index 0000000000000..20dc7cca4c909
--- /dev/null
+++ b/tools/mtmd/mtmd-ios.h
@@ -0,0 +1,64 @@
+#ifndef MTMD_IOS_H
+#define MTMD_IOS_H
+
+#include "mtmd.h"
+#include "mtmd-helper.h"
+#include "common.h"
+#include "sampling.h"
+#include "llama.h"
+#include "ggml.h"
+#include "chat.h"
+
+#include <string>
+#include <vector>
+#include <functional>
+#include <memory>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct mtmd_ios_context;
+struct mtmd_ios_params;
+struct mtmd_ios_message;
+
+struct mtmd_ios_params {
+    const char* model_path;   // text model file; mtmd_ios_init() fails when null
+    const char* mmproj_path;  // multimodal projector file; mtmd_ios_init() fails when null
+    
+    int n_predict;            // token cap per mtmd_ios_generate() call; negative means no explicit cap
+    int n_ctx;                // context size requested for the llama context
+    int n_threads;            // thread count used for both the text model and the projector
+    float temperature;        // sampling temperature
+    
+    bool use_gpu;
+    bool mmproj_use_gpu;      // whether the projector may use the GPU
+};
+
+struct mtmd_ios_message {
+    const char* role;                     // chat role, copied into the chat message
+    const char* content;                  // prompt text
+    const unsigned char** image_buffers;  // n_images buffers, each passed to mtmd_helper_bitmap_init_from_buf()
+    size_t* image_sizes;                  // byte size of image_buffers[i]
+    int n_images;
+    const unsigned char** audio_buffers;  // n_audios buffers, loaded the same way as the images
+    size_t* audio_sizes;                  // byte size of audio_buffers[i]
+    int n_audios;
+};
+
+mtmd_ios_context* mtmd_ios_init(const mtmd_ios_params* params); // returns nullptr on failure
+void mtmd_ios_free(mtmd_ios_context* ctx); // accepts nullptr
+// Parameter set with built-in defaults; both paths are left null for the caller to fill in.
+mtmd_ios_params mtmd_ios_params_default(void);
+// Returns a malloc'ed reply (release with mtmd_ios_string_free) or nullptr on failure.
+char* mtmd_ios_generate(mtmd_ios_context* ctx, const mtmd_ios_message* message);
+// Pointer borrowed from ctx; nullptr when ctx is null.
+const char* mtmd_ios_get_last_error(mtmd_ios_context* ctx);
+// Frees a string returned by mtmd_ios_generate().
+void mtmd_ios_string_free(char* str);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif 
\ No newline at end of file

From 999a87dd97f62f560b40119dea0e6878d751aa71 Mon Sep 17 00:00:00 2001
From: tc_mb <caitianchi@modelbest.cn>
Date: Wed, 16 Jul 2025 15:02:53 +0800
Subject: [PATCH 04/15] support app s1

---
 tools/mtmd/CMakeLists.txt                     |  36 ++-
 tools/mtmd/ane_minicpm4v3b_vision_f16_b1.h    | 154 +++++++++++
 tools/mtmd/ane_minicpm4v3b_vision_f16_b1.m    | 222 +++++++++++++++
 tools/mtmd/clip.cpp                           |   2 +-
 .../minicpmv-convert-image-encoder-to-gguf.py |   4 +-
 tools/mtmd/mtmd-ios-example.cpp               | 121 +++++++++
 tools/mtmd/mtmd-ios.cpp                       | 254 ++++++++++--------
 tools/mtmd/mtmd-ios.h                         |  23 +-
 8 files changed, 693 insertions(+), 123 deletions(-)
 create mode 100644 tools/mtmd/ane_minicpm4v3b_vision_f16_b1.h
 create mode 100644 tools/mtmd/ane_minicpm4v3b_vision_f16_b1.m
 create mode 100644 tools/mtmd/mtmd-ios-example.cpp

diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt
index 58e47f20aae30..56d57656302d6 100644
--- a/tools/mtmd/CMakeLists.txt
+++ b/tools/mtmd/CMakeLists.txt
@@ -29,10 +29,14 @@ if(APPLE)
     set_source_files_properties(ane_minicpm4v3b_vision_f16_b1.m PROPERTIES COMPILE_FLAGS "-fobjc-arc")
 endif()
 
-target_link_libraries     (mtmd PUBLIC ggml llama)
+target_link_libraries     (mtmd PUBLIC ggml llama common)
 target_link_libraries     (mtmd PRIVATE Threads::Threads)
 target_include_directories(mtmd PUBLIC  .)
 target_include_directories(mtmd PRIVATE ../..)
+target_include_directories(mtmd PRIVATE ../../common)
+target_include_directories(mtmd PRIVATE ../../include)
+target_include_directories(mtmd PRIVATE ../../ggml/include)
+target_include_directories(mtmd PRIVATE ../../src)
 target_include_directories(mtmd PRIVATE ../../vendor)
 target_compile_features   (mtmd PRIVATE cxx_std_17)
 
@@ -92,3 +96,33 @@ set_target_properties  (${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli)
 install                (TARGETS ${TARGET} RUNTIME)
 target_link_libraries  (${TARGET} PRIVATE common mtmd Threads::Threads)
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+# iOS interface test
+set(TARGET_IOS llama-mtmd-ios-test)
+add_executable         (${TARGET_IOS} mtmd-ios-example.cpp)
+set_target_properties  (${TARGET_IOS} PROPERTIES OUTPUT_NAME llama-mtmd-ios-test)
+target_include_directories(${TARGET_IOS} PRIVATE .)
+target_include_directories(${TARGET_IOS} PRIVATE ../..)
+target_include_directories(${TARGET_IOS} PRIVATE ../../common)
+target_include_directories(${TARGET_IOS} PRIVATE ../../include)
+target_include_directories(${TARGET_IOS} PRIVATE ../../ggml/include)
+target_include_directories(${TARGET_IOS} PRIVATE ../../src)
+target_include_directories(${TARGET_IOS} PRIVATE ../../vendor)
+target_link_libraries  (${TARGET_IOS} PRIVATE mtmd common llama ggml Threads::Threads)
+target_compile_features(${TARGET_IOS} PRIVATE cxx_std_17)
+
+# Make sure the dependency libraries are built first
+add_dependencies(${TARGET_IOS} mtmd)
+if (TARGET common)
+    add_dependencies(${TARGET_IOS} common)
+endif()
+
+# On Apple platforms, add the extra link settings needed by the iOS test
+if(APPLE)
+    target_link_libraries(${TARGET_IOS} PRIVATE 
+        "-framework Foundation" 
+        "-framework CoreML" 
+        "-framework Accelerate"
+        "-ObjC"
+    )
+endif()
diff --git a/tools/mtmd/ane_minicpm4v3b_vision_f16_b1.h b/tools/mtmd/ane_minicpm4v3b_vision_f16_b1.h
new file mode 100644
index 0000000000000..214b9c5c134c9
--- /dev/null
+++ b/tools/mtmd/ane_minicpm4v3b_vision_f16_b1.h
@@ -0,0 +1,154 @@
+//
+// ane_minicpm4v3b_vision_f16_b1.h
+//
+// This file was automatically generated and should not be edited.
+//
+
+#import <Foundation/Foundation.h>
+#import <CoreML/CoreML.h>
+#include <stdint.h>
+#include <os/log.h>
+
+NS_ASSUME_NONNULL_BEGIN
+
+/// Model Prediction Input Type
+API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden")))
+@interface ane_minicpm4v3b_vision_f16_b1Input : NSObject<MLFeatureProvider>
+
+/// input as 1 × 1024 × 1152 3-dimensional array of floats
+@property (readwrite, nonatomic, strong) MLMultiArray * input;
+- (instancetype)init NS_UNAVAILABLE;
+- (instancetype)initWithInput:(MLMultiArray *)input NS_DESIGNATED_INITIALIZER;
+
+@end
+
+/// Model Prediction Output Type
+API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden")))
+@interface ane_minicpm4v3b_vision_f16_b1Output : NSObject<MLFeatureProvider>
+
+/// output as 1 × 1024 × 1152 3-dimensional array of floats
+@property (readwrite, nonatomic, strong) MLMultiArray * output;
+- (instancetype)init NS_UNAVAILABLE;
+- (instancetype)initWithOutput:(MLMultiArray *)output NS_DESIGNATED_INITIALIZER;
+
+@end
+
+/// Class for model loading and prediction
+API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden")))
+@interface ane_minicpm4v3b_vision_f16_b1 : NSObject
+@property (readonly, nonatomic, nullable) MLModel * model;
+
+/**
+    URL of the underlying .mlmodelc directory.
+*/
++ (nullable NSURL *)URLOfModelInThisBundle;
+
+/**
+    Initialize ane_minicpm4v3b_vision_f16_b1 instance from an existing MLModel object.
+
+    Usually the application does not use this initializer unless it makes a subclass of ane_minicpm4v3b_vision_f16_b1.
+    Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in.
+*/
+- (instancetype)initWithMLModel:(MLModel *)model NS_DESIGNATED_INITIALIZER;
+
+/**
+    Initialize ane_minicpm4v3b_vision_f16_b1 instance with the model in this bundle.
+*/
+- (nullable instancetype)init;
+
+/**
+    Initialize ane_minicpm4v3b_vision_f16_b1 instance with the model in this bundle.
+
+    @param configuration The model configuration object
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+*/
+- (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+
+/**
+    Initialize ane_minicpm4v3b_vision_f16_b1 instance from the model URL.
+
+    @param modelURL URL to the .mlmodelc directory for ane_minicpm4v3b_vision_f16_b1.
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+*/
+- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+
+/**
+    Initialize ane_minicpm4v3b_vision_f16_b1 instance from the model URL.
+
+    @param modelURL URL to the .mlmodelc directory for ane_minicpm4v3b_vision_f16_b1.
+    @param configuration The model configuration object
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+*/
+- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+
+/**
+    Construct ane_minicpm4v3b_vision_f16_b1 instance asynchronously with configuration.
+    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
+
+    @param configuration The model configuration
+    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid ane_minicpm4v3b_vision_f16_b1 instance or NSError object.
+*/
++ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(ane_minicpm4v3b_vision_f16_b1 * _Nullable model, NSError * _Nullable error))handler;
+
+/**
+    Construct ane_minicpm4v3b_vision_f16_b1 instance asynchronously with URL of .mlmodelc directory and optional configuration.
+
+    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
+
+    @param modelURL The model URL.
+    @param configuration The model configuration
+    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid ane_minicpm4v3b_vision_f16_b1 instance or NSError object.
+*/
++ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(ane_minicpm4v3b_vision_f16_b1 * _Nullable model, NSError * _Nullable error))handler;
+
+/**
+    Make a prediction using the standard interface
+    @param input an instance of ane_minicpm4v3b_vision_f16_b1Input to predict from
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+    @return the prediction as ane_minicpm4v3b_vision_f16_b1Output
+*/
+- (nullable ane_minicpm4v3b_vision_f16_b1Output *)predictionFromFeatures:(ane_minicpm4v3b_vision_f16_b1Input *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+
+/**
+    Make a prediction using the standard interface
+    @param input an instance of ane_minicpm4v3b_vision_f16_b1Input to predict from
+    @param options prediction options
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+    @return the prediction as ane_minicpm4v3b_vision_f16_b1Output
+*/
+- (nullable ane_minicpm4v3b_vision_f16_b1Output *)predictionFromFeatures:(ane_minicpm4v3b_vision_f16_b1Input *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+
+/**
+    Make an asynchronous prediction using the standard interface
+    @param input an instance of ane_minicpm4v3b_vision_f16_b1Input to predict from
+    @param completionHandler a block that will be called upon completion of the prediction. error will be nil if no error occurred.
+*/
+- (void)predictionFromFeatures:(ane_minicpm4v3b_vision_f16_b1Input *)input completionHandler:(void (^)(ane_minicpm4v3b_vision_f16_b1Output * _Nullable output, NSError * _Nullable error))completionHandler API_AVAILABLE(macos(14.0), ios(17.0), watchos(10.0), tvos(17.0)) __attribute__((visibility("hidden")));
+
+/**
+    Make an asynchronous prediction using the standard interface
+    @param input an instance of ane_minicpm4v3b_vision_f16_b1Input to predict from
+    @param options prediction options
+    @param completionHandler a block that will be called upon completion of the prediction. error will be nil if no error occurred.
+*/
+- (void)predictionFromFeatures:(ane_minicpm4v3b_vision_f16_b1Input *)input options:(MLPredictionOptions *)options completionHandler:(void (^)(ane_minicpm4v3b_vision_f16_b1Output * _Nullable output, NSError * _Nullable error))completionHandler API_AVAILABLE(macos(14.0), ios(17.0), watchos(10.0), tvos(17.0)) __attribute__((visibility("hidden")));
+
+/**
+    Make a prediction using the convenience interface
+    @param input 1 × 1024 × 1152 3-dimensional array of floats
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+    @return the prediction as ane_minicpm4v3b_vision_f16_b1Output
+*/
+- (nullable ane_minicpm4v3b_vision_f16_b1Output *)predictionFromInput:(MLMultiArray *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+
+/**
+    Batch prediction
+    @param inputArray array of ane_minicpm4v3b_vision_f16_b1Input instances to obtain predictions from
+    @param options prediction options
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+    @return the predictions as NSArray<ane_minicpm4v3b_vision_f16_b1Output *>
+*/
+- (nullable NSArray<ane_minicpm4v3b_vision_f16_b1Output *> *)predictionsFromInputs:(NSArray<ane_minicpm4v3b_vision_f16_b1Input*> *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tools/mtmd/ane_minicpm4v3b_vision_f16_b1.m b/tools/mtmd/ane_minicpm4v3b_vision_f16_b1.m
new file mode 100644
index 0000000000000..14cb4e84ee169
--- /dev/null
+++ b/tools/mtmd/ane_minicpm4v3b_vision_f16_b1.m
@@ -0,0 +1,222 @@
+//
+// ane_minicpm4v3b_vision_f16_b1.m
+//
+// This file was automatically generated and should not be edited.
+//
+
+#if !__has_feature(objc_arc)
+#error This file must be compiled with automatic reference counting enabled (-fobjc-arc)
+#endif
+
+#import "ane_minicpm4v3b_vision_f16_b1.h"
+
+@implementation ane_minicpm4v3b_vision_f16_b1Input
+
+- (instancetype)initWithInput:(MLMultiArray *)input {
+    self = [super init];
+    if (self) {
+        _input = input;
+    }
+    return self;
+}
+
+- (NSSet<NSString *> *)featureNames {
+    return [NSSet setWithArray:@[@"input"]];
+}
+
+- (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName {
+    if ([featureName isEqualToString:@"input"]) {
+        return [MLFeatureValue featureValueWithMultiArray:self.input];
+    }
+    return nil;
+}
+
+@end
+
+@implementation ane_minicpm4v3b_vision_f16_b1Output
+
+- (instancetype)initWithOutput:(MLMultiArray *)output {
+    self = [super init];
+    if (self) {
+        _output = output;
+    }
+    return self;
+}
+
+- (NSSet<NSString *> *)featureNames {
+    return [NSSet setWithArray:@[@"output"]];
+}
+
+- (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName {
+    if ([featureName isEqualToString:@"output"]) {
+        return [MLFeatureValue featureValueWithMultiArray:self.output];
+    }
+    return nil;
+}
+
+@end
+
+@implementation ane_minicpm4v3b_vision_f16_b1
+
+
+/**
+    URL of the underlying .mlmodelc directory.
+*/
++ (nullable NSURL *)URLOfModelInThisBundle {
+    NSString *assetPath = [[NSBundle bundleForClass:[self class]] pathForResource:@"ane_minicpm4v3b_vision_f16_b1" ofType:@"mlmodelc"];
+    if (nil == assetPath) { os_log_error(OS_LOG_DEFAULT, "Could not load ane_minicpm4v3b_vision_f16_b1.mlmodelc in the bundle resource"); return nil; }
+    return [NSURL fileURLWithPath:assetPath];
+}
+
+
+/**
+    Initialize ane_minicpm4v3b_vision_f16_b1 instance from an existing MLModel object.
+
+    Usually the application does not use this initializer unless it makes a subclass of ane_minicpm4v3b_vision_f16_b1.
+    Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in.
+*/
+- (instancetype)initWithMLModel:(MLModel *)model {
+    if (model == nil) {
+        return nil;
+    }
+    self = [super init];
+    if (self != nil) {
+        _model = model;
+    }
+    return self;
+}
+
+
+/**
+    Initialize ane_minicpm4v3b_vision_f16_b1 instance with the model in this bundle.
+*/
+- (nullable instancetype)init {
+    return [self initWithContentsOfURL:(NSURL * _Nonnull)self.class.URLOfModelInThisBundle error:nil];
+}
+
+
+/**
+    Initialize ane_minicpm4v3b_vision_f16_b1 instance with the model in this bundle.
+
+    @param configuration The model configuration object
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+*/
+- (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    return [self initWithContentsOfURL:(NSURL * _Nonnull)self.class.URLOfModelInThisBundle configuration:configuration error:error];
+}
+
+
+/**
+    Initialize ane_minicpm4v3b_vision_f16_b1 instance from the model URL.
+
+    @param modelURL URL to the .mlmodelc directory for ane_minicpm4v3b_vision_f16_b1.
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+*/
+- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    MLModel *model = [MLModel modelWithContentsOfURL:modelURL error:error];
+    if (model == nil) { return nil; }
+    return [self initWithMLModel:model];
+}
+
+
+/**
+    Initialize ane_minicpm4v3b_vision_f16_b1 instance from the model URL.
+
+    @param modelURL URL to the .mlmodelc directory for ane_minicpm4v3b_vision_f16_b1.
+    @param configuration The model configuration object
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+*/
+- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    MLModel *model = [MLModel modelWithContentsOfURL:modelURL configuration:configuration error:error];
+    if (model == nil) { return nil; }
+    return [self initWithMLModel:model];
+}
+
+
+/**
+    Construct ane_minicpm4v3b_vision_f16_b1 instance asynchronously with configuration.
+    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
+
+    @param configuration The model configuration
+    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid ane_minicpm4v3b_vision_f16_b1 instance or NSError object.
+*/
++ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(ane_minicpm4v3b_vision_f16_b1 * _Nullable model, NSError * _Nullable error))handler {
+    [self loadContentsOfURL:(NSURL * _Nonnull)[self URLOfModelInThisBundle]
+              configuration:configuration
+          completionHandler:handler];
+}
+
+
+/**
+    Construct ane_minicpm4v3b_vision_f16_b1 instance asynchronously with URL of .mlmodelc directory and optional configuration.
+
+    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
+
+    @param modelURL The model URL.
+    @param configuration The model configuration
+    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid ane_minicpm4v3b_vision_f16_b1 instance or NSError object.
+*/
++ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(ane_minicpm4v3b_vision_f16_b1 * _Nullable model, NSError * _Nullable error))handler {
+    [MLModel loadContentsOfURL:modelURL
+                 configuration:configuration
+             completionHandler:^(MLModel *model, NSError *error) {
+        if (model != nil) {
+            ane_minicpm4v3b_vision_f16_b1 *typedModel = [[ane_minicpm4v3b_vision_f16_b1 alloc] initWithMLModel:model];
+            handler(typedModel, nil);
+        } else {
+            handler(nil, error);
+        }
+    }];
+}
+
+- (nullable ane_minicpm4v3b_vision_f16_b1Output *)predictionFromFeatures:(ane_minicpm4v3b_vision_f16_b1Input *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    return [self predictionFromFeatures:input options:[[MLPredictionOptions alloc] init] error:error];
+}
+
+- (nullable ane_minicpm4v3b_vision_f16_b1Output *)predictionFromFeatures:(ane_minicpm4v3b_vision_f16_b1Input *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    id<MLFeatureProvider> outFeatures = [self.model predictionFromFeatures:input options:options error:error];
+    if (!outFeatures) { return nil; }
+    return [[ane_minicpm4v3b_vision_f16_b1Output alloc] initWithOutput:(MLMultiArray *)[outFeatures featureValueForName:@"output"].multiArrayValue];
+}
+
+- (void)predictionFromFeatures:(ane_minicpm4v3b_vision_f16_b1Input *)input completionHandler:(void (^)(ane_minicpm4v3b_vision_f16_b1Output * _Nullable output, NSError * _Nullable error))completionHandler {
+    [self.model predictionFromFeatures:input completionHandler:^(id<MLFeatureProvider> prediction, NSError *predictionError) {
+        if (prediction != nil) {
+            ane_minicpm4v3b_vision_f16_b1Output *output = [[ane_minicpm4v3b_vision_f16_b1Output alloc] initWithOutput:(MLMultiArray *)[prediction featureValueForName:@"output"].multiArrayValue];
+            completionHandler(output, predictionError);
+        } else {
+            completionHandler(nil, predictionError);
+        }
+    }];
+}
+
+- (void)predictionFromFeatures:(ane_minicpm4v3b_vision_f16_b1Input *)input options:(MLPredictionOptions *)options completionHandler:(void (^)(ane_minicpm4v3b_vision_f16_b1Output * _Nullable output, NSError * _Nullable error))completionHandler {
+    [self.model predictionFromFeatures:input options:options completionHandler:^(id<MLFeatureProvider> prediction, NSError *predictionError) {
+        if (prediction != nil) {
+            ane_minicpm4v3b_vision_f16_b1Output *output = [[ane_minicpm4v3b_vision_f16_b1Output alloc] initWithOutput:(MLMultiArray *)[prediction featureValueForName:@"output"].multiArrayValue];
+            completionHandler(output, predictionError);
+        } else {
+            completionHandler(nil, predictionError);
+        }
+    }];
+}
+
+- (nullable ane_minicpm4v3b_vision_f16_b1Output *)predictionFromInput:(MLMultiArray *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    ane_minicpm4v3b_vision_f16_b1Input *input_ = [[ane_minicpm4v3b_vision_f16_b1Input alloc] initWithInput:input];
+    return [self predictionFromFeatures:input_ error:error];
+}
+
+- (nullable NSArray<ane_minicpm4v3b_vision_f16_b1Output *> *)predictionsFromInputs:(NSArray<ane_minicpm4v3b_vision_f16_b1Input*> *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    id<MLBatchProvider> inBatch = [[MLArrayBatchProvider alloc] initWithFeatureProviderArray:inputArray];
+    id<MLBatchProvider> outBatch = [self.model predictionsFromBatch:inBatch options:options error:error];
+    if (!outBatch) { return nil; }
+    NSMutableArray<ane_minicpm4v3b_vision_f16_b1Output*> *results = [NSMutableArray arrayWithCapacity:(NSUInteger)outBatch.count];
+    for (NSInteger i = 0; i < outBatch.count; i++) {
+        id<MLFeatureProvider> resultProvider = [outBatch featuresAtIndex:i];
+        ane_minicpm4v3b_vision_f16_b1Output * result = [[ane_minicpm4v3b_vision_f16_b1Output alloc] initWithOutput:(MLMultiArray *)[resultProvider featureValueForName:@"output"].multiArrayValue];
+        [results addObject:result];
+    }
+    return results;
+}
+
+@end
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 502a6582010c0..e422019055ec0 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -3817,7 +3817,7 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3
     imgs.entries.push_back(std::move(img_copy));
 
     bool ios_ctx = true;
-    if (ios_ctx){   
+    if (ios_ctx){
         printf("clip use ane\n");
         float * vit_embedding1 = (float *)malloc(1100*1152*sizeof(float));
         float * vit_embedding2 = (float *)malloc(1100*1152*sizeof(float));
diff --git a/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py b/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py
index ff01bbdc01981..de08bb5000ceb 100644
--- a/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py
+++ b/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py
@@ -497,8 +497,8 @@ def bytes_to_unicode():
 ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
 # Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
 # Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5
-default_image_mean = [0.48145466, 0.4578275, 0.40821073]
-default_image_std = [0.26862954, 0.26130258, 0.27577711]
+default_image_mean = [0.5, 0.5, 0.5]
+default_image_std = [0.5, 0.5, 0.5]
 ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None)
 ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)
 ap.add_argument('--minicpmv_version', type=int, help='minicpmv_version: MiniCPM-V-2 use 1; MiniCPM-V-2.5 use 2; MiniCPM-V-2.6 use 3; MiniCPM-o-2.6 use 4', default=2)
diff --git a/tools/mtmd/mtmd-ios-example.cpp b/tools/mtmd/mtmd-ios-example.cpp
new file mode 100644
index 0000000000000..f4ba74c0ff0e3
--- /dev/null
+++ b/tools/mtmd/mtmd-ios-example.cpp
@@ -0,0 +1,121 @@
+#include "mtmd-ios.h"
+#include <iostream>
+#include <string>
+#include <cstdlib>
+
+void example_multiple_images_progressive() {  // Interactive demo: prefill images/text via commands, then stream a reply token by token on /generate.
+    mtmd_ios_params params = mtmd_ios_params_default();
+    params.model_path = "/Users/tianchi/code/project/4o/3b/MiniCPM-4v-3b/model/ggml-model-Q4_0.gguf";
+    params.mmproj_path = "/Users/tianchi/code/project/4o/3b/MiniCPM-4v-3b/mmproj-model-f16.gguf";
+    params.n_predict = 100;  // increase the generation length
+    params.temperature = 0.6f;
+    
+    mtmd_ios_context* ctx = mtmd_ios_init(&params);
+    if (!ctx) {
+        std::cerr << "Failed to initialize context\n";
+        return;
+    }
+    
+    std::cout << "=== 多轮多模态对话示例 ===\n";
+    std::cout << "命令说明：\n";
+    std::cout << "  /image <路径>  - 添加图片\n";
+    std::cout << "  /text <内容>   - 添加文本\n";
+    std::cout << "  /generate     - 生成响应\n";
+    std::cout << "  /quit         - 退出\n";
+    std::cout << "=============================\n\n";
+    
+    std::string input;
+    bool has_content = false;  // tracks whether anything has been prefilled since the last generation
+    
+    while (true) {
+        std::cout << "> ";
+        if (!std::getline(std::cin, input)) { break; }  // EOF or stream error: leave the loop instead of spinning forever on an empty line
+        
+        if (input.empty()) {
+            continue;
+        }
+        
+        if (input == "/quit") {
+            break;
+        }
+        
+        if (input == "/generate") {
+            if (!has_content) {
+                std::cout << "请先添加图片或文本内容\n";
+                continue;
+            }
+            
+            std::cout << "Assistant: ";
+            int token_count = 0;
+            while (true) {  // pull one token per call until the library reports the end
+                mtmd_ios_token result = mtmd_ios_loop(ctx);
+                
+                if (result.is_end) {
+                    std::cout << "\n[生成完成 - " << token_count << " tokens]\n\n";
+                    break;
+                }
+                
+                if (result.token) {
+                    std::cout << result.token;
+                    std::cout.flush();
+                    mtmd_ios_string_free(result.token);  // the token string is owned by the caller and must be released
+                    token_count++;
+                }
+            }  
+            
+            has_content = false;  // reset the content flag
+            continue;
+        }
+        
+        if (input.find("/image ") == 0) {
+            std::string image_path = input.substr(7);  // everything after "/image "
+            if (image_path.empty()) {
+                std::cout << "请提供图片路径\n";
+                continue;
+            }
+            
+            std::cout << "正在加载图片: " << image_path << "\n";
+            if (mtmd_ios_prefill_image(ctx, image_path.c_str()) != 0) {
+                std::cerr << "Failed to load image: " << mtmd_ios_get_last_error(ctx) << "\n";
+            } else {
+                std::cout << "图片加载成功\n";
+                has_content = true;
+            }
+            continue;
+        }
+        
+        if (input.find("/text ") == 0) {
+            std::string text = input.substr(6);  // everything after "/text "
+            if (text.empty()) {
+                std::cout << "请提供文本内容\n";
+                continue;
+            }
+            
+            std::cout << "正在添加文本: " << text << "\n";
+            if (mtmd_ios_prefill_text(ctx, text.c_str(), "user") != 0) {
+                std::cerr << "Failed to add text: " << mtmd_ios_get_last_error(ctx) << "\n";
+            } else {
+                std::cout << "文本添加成功\n";
+                has_content = true;
+            }
+            continue;
+        }
+        
+        // Not a command: treat the whole line as user text
+        std::cout << "正在添加文本: " << input << "\n";
+        if (mtmd_ios_prefill_text(ctx, input.c_str(), "user") != 0) {
+            std::cerr << "Failed to add text: " << mtmd_ios_get_last_error(ctx) << "\n";
+        } else {
+            std::cout << "文本添加成功\n";
+            has_content = true;
+        }
+    }
+    
+    std::cout << "对话结束\n";
+    mtmd_ios_free(ctx);
+}
+
+int main() {  // Entry point: runs the interactive example and returns 0.
+    example_multiple_images_progressive(); 
+    return 0;
+} 
\ No newline at end of file
diff --git a/tools/mtmd/mtmd-ios.cpp b/tools/mtmd/mtmd-ios.cpp
index 6a76d2e5abdcb..cabe0b5b5ad3b 100644
--- a/tools/mtmd/mtmd-ios.cpp
+++ b/tools/mtmd/mtmd-ios.cpp
@@ -16,6 +16,7 @@
 #include <memory>
 #include <cstring>
 #include <cstdlib>
+#include <iostream>
 
 struct mtmd_ios_context {
     mtmd::context_ptr ctx_vision;
@@ -56,59 +57,6 @@ static void set_error(mtmd_ios_context* ctx, const std::string& error) {
     ctx->last_error = error;
 }
 
-static bool load_media_from_buffer(mtmd_ios_context* ctx, const unsigned char* buffer, size_t size) {
-    mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(ctx->ctx_vision.get(), buffer, size));
-    if (!bmp.ptr) {
-        return false;
-    }
-    ctx->bitmaps.entries.push_back(std::move(bmp));
-    return true;
-}
-
-static int eval_message_internal(mtmd_ios_context* ctx, const common_chat_msg& msg, bool add_bos = false) {
-    common_chat_templates_inputs tmpl_inputs;
-    tmpl_inputs.messages = {msg};
-    tmpl_inputs.add_generation_prompt = true;
-    tmpl_inputs.use_jinja = false;
-    
-    auto formatted_chat = common_chat_templates_apply(ctx->tmpls.get(), tmpl_inputs);
-    
-    mtmd_input_text text;
-    text.text = formatted_chat.prompt.c_str();
-    text.add_special = add_bos;
-    text.parse_special = true;
-    
-    mtmd::input_chunks chunks(mtmd_input_chunks_init());
-    auto bitmaps_c_ptr = ctx->bitmaps.c_ptr();
-    int32_t res = mtmd_tokenize(ctx->ctx_vision.get(),
-                               chunks.ptr.get(),
-                               &text,
-                               bitmaps_c_ptr.data(),
-                               bitmaps_c_ptr.size());
-    if (res != 0) {
-        set_error(ctx, "Unable to tokenize prompt, res = " + std::to_string(res));
-        return 1;
-    }
-    
-    ctx->bitmaps.entries.clear();
-    
-    llama_pos new_n_past;
-    if (mtmd_helper_eval_chunks(ctx->ctx_vision.get(),
-                               ctx->lctx,
-                               chunks.ptr.get(),
-                               ctx->n_past,
-                               0,
-                               2048,
-                               true,
-                               &new_n_past)) {
-        set_error(ctx, "Unable to eval prompt");
-        return 1;
-    }
-    
-    ctx->n_past = new_n_past;
-    return 0;
-}
-
 mtmd_ios_params mtmd_ios_params_default(void) {
     mtmd_ios_params params = {};
     params.model_path = nullptr;
@@ -118,7 +66,6 @@ mtmd_ios_params mtmd_ios_params_default(void) {
     params.n_threads = 4;
     params.temperature = 0.2f;
     params.use_gpu = true;
-    params.mmproj_use_gpu = true;
     return params;
 }
 
@@ -140,29 +87,45 @@ mtmd_ios_context* mtmd_ios_init(const mtmd_ios_params* params) {
     common_params.model.path = params->model_path;
     common_params.mmproj.path = params->mmproj_path;
     common_params.n_ctx = params->n_ctx;
-    common_params.n_batch = 2048;
+    common_params.n_batch = 2048;  // 增加batch大小，与标准mtmd保持一致
     common_params.cpuparams.n_threads = params->n_threads;
     common_params.sampling.temp = params->temperature;
     common_params.mmproj_use_gpu = params->mmproj_use_gpu;
-    
+
     ctx->llama_init = common_init_from_params(common_params);
+    
     ctx->model = ctx->llama_init.model.get();
     ctx->lctx = ctx->llama_init.context.get();
-    ctx->vocab = llama_model_get_vocab(ctx->model);
-    ctx->smpl = common_sampler_init(ctx->model, common_params.sampling);
-    ctx->batch = llama_batch_init(1, 0, 1);
     
     if (!ctx->model || !ctx->lctx) {
         set_error(ctx.get(), "Failed to load model or create context");
         return nullptr;
     }
     
-    if (!llama_model_chat_template(ctx->model, nullptr)) {
-        set_error(ctx.get(), "Model does not have chat template");
+    ctx->vocab = llama_model_get_vocab(ctx->model);
+    
+    ctx->smpl = common_sampler_init(ctx->model, common_params.sampling);
+    if (!ctx->smpl) {
+        set_error(ctx.get(), "Failed to initialize sampler");
         return nullptr;
     }
     
-    ctx->tmpls = common_chat_templates_init(ctx->model, "");
+    ctx->batch = llama_batch_init(2048, 0, 1);
+    if (!ctx->batch.token) {
+        set_error(ctx.get(), "Failed to initialize batch");
+        return nullptr;
+    }
+    
+    std::string chat_template = "";
+    if (!llama_model_chat_template(ctx->model, nullptr)) {
+        chat_template = "chatml";
+    }
+    
+    ctx->tmpls = common_chat_templates_init(ctx->model, chat_template);
+    if (!ctx->tmpls) {
+        set_error(ctx.get(), "Failed to initialize chat templates");
+        return nullptr;
+    }
     
     mtmd_context_params mparams = mtmd_context_params_default();
     mparams.use_gpu = params->mmproj_use_gpu;
@@ -185,68 +148,145 @@ void mtmd_ios_free(mtmd_ios_context* ctx) {
     }
 }
 
-char* mtmd_ios_generate(mtmd_ios_context* ctx, const mtmd_ios_message* message) {
-    if (!ctx || !message) {
-        return nullptr;
+int mtmd_ios_prefill_image(mtmd_ios_context* ctx, const char* image_path) {
+    if (!ctx || !image_path) {
+        return -1;
     }
     
-    for (int i = 0; i < message->n_images; i++) {
-        if (!load_media_from_buffer(ctx, message->image_buffers[i], message->image_sizes[i])) {
-            set_error(ctx, "Failed to load image");
-            return nullptr;
-        }
+    mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx->ctx_vision.get(), image_path));
+    if (!bmp.ptr) {
+        set_error(ctx, "Failed to load image from file: " + std::string(image_path));
+        return -1;
     }
+    ctx->bitmaps.entries.push_back(std::move(bmp));
     
-    for (int i = 0; i < message->n_audios; i++) {
-        if (!load_media_from_buffer(ctx, message->audio_buffers[i], message->audio_sizes[i])) {
-            set_error(ctx, "Failed to load audio");
-            return nullptr;
-        }
+    mtmd_input_text text;
+    text.text = mtmd_default_marker();
+    text.add_special = ctx->n_past == 0;
+    text.parse_special = true;
+    
+    mtmd::input_chunks chunks(mtmd_input_chunks_init());
+    auto bitmaps_c_ptr = ctx->bitmaps.c_ptr();
+    int32_t res = mtmd_tokenize(ctx->ctx_vision.get(),
+                        chunks.ptr.get(),
+                        &text,
+                        bitmaps_c_ptr.data(),
+                        bitmaps_c_ptr.size());
+    if (res != 0) {
+        set_error(ctx, "Failed to tokenize image");
+        return -1;
     }
     
-    std::string prompt = message->content;
-    if (prompt.find(mtmd_default_marker()) == std::string::npos) {
-        for (int i = 0; i < message->n_images + message->n_audios; i++) {
-            prompt += mtmd_default_marker();
-        }
+    ctx->bitmaps.entries.clear();
+    
+    llama_pos new_n_past;
+    if (mtmd_helper_eval_chunks(ctx->ctx_vision.get(),
+                ctx->lctx,
+                chunks.ptr.get(),
+                ctx->n_past,
+                0,
+                1024,
+                false,
+                &new_n_past)) {
+        set_error(ctx, "Failed to eval image");
+        return -1;
+    }
+    
+    ctx->n_past = new_n_past;
+    
+    return 0;
+}
+
+
+
+int mtmd_ios_prefill_text(mtmd_ios_context* ctx, const char* text, const char* role) {
+    if (!ctx || !text || !role) {
+        return -1;
     }
     
     common_chat_msg msg;
-    msg.role = message->role;
-    msg.content = prompt;
+    msg.role = role;
+    msg.content = text;
     
-    if (eval_message_internal(ctx, msg, true)) {
-        return nullptr;
+    common_chat_templates_inputs tmpl_inputs;
+    tmpl_inputs.messages = {msg};
+    tmpl_inputs.add_generation_prompt = false;
+    tmpl_inputs.use_jinja = false;
+    auto formatted_chat = common_chat_templates_apply(ctx->tmpls.get(), tmpl_inputs);
+    
+    mtmd_input_text input_text;
+    input_text.text = formatted_chat.prompt.c_str();
+    input_text.add_special = ctx->n_past == 0;
+    input_text.parse_special = true;
+    
+    mtmd::input_chunks chunks(mtmd_input_chunks_init());
+    int32_t res = mtmd_tokenize(ctx->ctx_vision.get(),
+                        chunks.ptr.get(),
+                        &input_text,
+                        nullptr,
+                        0);
+    if (res != 0) {
+        set_error(ctx, "Failed to tokenize text");
+        return -1;
     }
     
-    std::string response;
-    int n_predict = ctx->n_predict < 0 ? INT_MAX : ctx->n_predict;
+    llama_pos new_n_past;
+    if (mtmd_helper_eval_chunks(ctx->ctx_vision.get(),
+                ctx->lctx,
+                chunks.ptr.get(),
+                ctx->n_past,
+                0,
+                1024,
+                true,
+                &new_n_past)) {
+        set_error(ctx, "Failed to eval text");
+        return -1;
+    }
     
-    for (int i = 0; i < n_predict; i++) {
-        llama_token token_id = common_sampler_sample(ctx->smpl, ctx->lctx, -1);
-        common_sampler_accept(ctx->smpl, token_id, true);
-        
-        if (llama_vocab_is_eog(ctx->vocab, token_id)) {
-            break;
-        }
-        
-        std::string token_str = common_token_to_piece(ctx->lctx, token_id);
-        response += token_str;
-        
-        common_batch_clear(ctx->batch);
-        common_batch_add(ctx->batch, token_id, ctx->n_past++, {0}, true);
-        if (llama_decode(ctx->lctx, ctx->batch)) {
-            set_error(ctx, "failed to decode token");
-            return nullptr;
-        }
+    ctx->n_past = new_n_past;
+    return 0;
+}
+
+
+
+mtmd_ios_token mtmd_ios_loop(mtmd_ios_context* ctx) {
+    mtmd_ios_token result = {nullptr, true};
+    
+    if (!ctx) {
+        return result;
+    }
+    
+    llama_token token_id = common_sampler_sample(ctx->smpl, ctx->lctx, -1);
+    common_sampler_accept(ctx->smpl, token_id, true);
+    
+    if (llama_vocab_is_eog(ctx->vocab, token_id)) {
+        result.is_end = true;
+        return result;
+    }
+    
+    std::string token_str = common_token_to_piece(ctx->lctx, token_id);
+    
+    common_batch_clear(ctx->batch);
+    common_batch_add(ctx->batch, token_id, ctx->n_past, {0}, true);
+    
+    if (ctx->batch.n_tokens > 0) {
+        ctx->batch.logits[ctx->batch.n_tokens - 1] = true;
+    }
+    
+    ctx->n_past++;
+    if (llama_decode(ctx->lctx, ctx->batch)) {
+        set_error(ctx, "failed to decode token");
+        result.is_end = true;
+        return result;
     }
     
-    char* result_cstr = (char*)malloc(response.length() + 1);
-    if (result_cstr) {
-        strcpy(result_cstr, response.c_str());
+    result.token = (char*)malloc(token_str.length() + 1);
+    if (result.token) {
+        strcpy(result.token, token_str.c_str());
     }
+    result.is_end = false;
     
-    return result_cstr;
+    return result;
 }
 
 const char* mtmd_ios_get_last_error(mtmd_ios_context* ctx) {
diff --git a/tools/mtmd/mtmd-ios.h b/tools/mtmd/mtmd-ios.h
index 20dc7cca4c909..60aa83cd5b0ff 100644
--- a/tools/mtmd/mtmd-ios.h
+++ b/tools/mtmd/mtmd-ios.h
@@ -20,7 +20,6 @@ extern "C" {
 
 struct mtmd_ios_context;
 struct mtmd_ios_params;
-struct mtmd_ios_message;
 
 struct mtmd_ios_params {
     const char* model_path;
@@ -33,25 +32,25 @@ struct mtmd_ios_params {
     
     bool use_gpu;
     bool mmproj_use_gpu;
+    bool warmup;
 };
 
-struct mtmd_ios_message {
-    const char* role;
-    const char* content;
-    const unsigned char** image_buffers;
-    size_t* image_sizes;
-    int n_images;
-    const unsigned char** audio_buffers;
-    size_t* audio_sizes;
-    int n_audios;
-};
+
 
 mtmd_ios_context* mtmd_ios_init(const mtmd_ios_params* params);
 void mtmd_ios_free(mtmd_ios_context* ctx);
 
 mtmd_ios_params mtmd_ios_params_default(void);
 
-char* mtmd_ios_generate(mtmd_ios_context* ctx, const mtmd_ios_message* message);
+int mtmd_ios_prefill_image(mtmd_ios_context* ctx, const char* image_path);
+int mtmd_ios_prefill_text(mtmd_ios_context* ctx, const char* text, const char* role);
+
+typedef struct {
+    char* token;
+    bool is_end;
+} mtmd_ios_token;
+
+mtmd_ios_token mtmd_ios_loop(mtmd_ios_context* ctx);
 
 const char* mtmd_ios_get_last_error(mtmd_ios_context* ctx);
 

From 4d32fd29c54417f7f2037efbcc38f77b3772d223 Mon Sep 17 00:00:00 2001
From: shiguoliang <shiguoliang@modelbest.cn>
Date: Fri, 18 Jul 2025 11:10:11 +0800
Subject: [PATCH 05/15] feat: mtmd support xcframework

---
 build-xcframework.sh                   |  13 +++-
 tools/batched-bench/CMakeLists.txt     |   2 +-
 tools/cvector-generator/CMakeLists.txt |   2 +-
 tools/export-lora/CMakeLists.txt       |   2 +-
 tools/gguf-split/CMakeLists.txt        |   2 +-
 tools/imatrix/CMakeLists.txt           |   2 +-
 tools/llama-bench/CMakeLists.txt       |   2 +-
 tools/main/CMakeLists.txt              |   2 +-
 tools/mtmd/CMakeLists.txt              |   2 +-
 tools/mtmd/mtmd-ios.cpp                |  24 +++---
 tools/mtmd/mtmd-ios.h                  | 101 +++++++++++++++----------
 tools/perplexity/CMakeLists.txt        |   2 +-
 tools/quantize/CMakeLists.txt          |   2 +-
 tools/run/CMakeLists.txt               |   2 +-
 tools/tokenize/CMakeLists.txt          |   2 +-
 tools/tts/CMakeLists.txt               |   2 +-
 16 files changed, 96 insertions(+), 68 deletions(-)

diff --git a/build-xcframework.sh b/build-xcframework.sh
index f813984db9dbd..8a8f2af41df56 100755
--- a/build-xcframework.sh
+++ b/build-xcframework.sh
@@ -8,7 +8,7 @@ TVOS_MIN_OS_VERSION=16.4
 
 BUILD_SHARED_LIBS=OFF
 LLAMA_BUILD_EXAMPLES=OFF
-LLAMA_BUILD_TOOLS=OFF
+LLAMA_BUILD_TOOLS=ON
 LLAMA_BUILD_TESTS=OFF
 LLAMA_BUILD_SERVER=OFF
 GGML_METAL=ON
@@ -124,6 +124,10 @@ setup_framework_structure() {
     cp ggml/include/ggml-cpu.h     ${header_path}
     cp ggml/include/ggml-blas.h    ${header_path}
     cp ggml/include/gguf.h         ${header_path}
+    # Copy mtmd-ios headers and dependencies
+    cp tools/mtmd/mtmd-ios.h        ${header_path}
+    cp tools/mtmd/mtmd.h            ${header_path}
+    cp tools/mtmd/mtmd-helper.h     ${header_path}
 
     # Create module map (common for all platforms)
     cat > ${module_path}module.modulemap << EOF
@@ -136,6 +140,9 @@ framework module llama {
     header "ggml-cpu.h"
     header "ggml-blas.h"
     header "gguf.h"
+    header "mtmd-ios.h"
+    header "mtmd.h"
+    header "mtmd-helper.h"
 
     link "c++"
     link framework "Accelerate"
@@ -252,6 +259,8 @@ combine_static_libraries() {
         "${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml-cpu.a"
         "${base_dir}/${build_dir}/ggml/src/ggml-metal/${release_dir}/libggml-metal.a"
         "${base_dir}/${build_dir}/ggml/src/ggml-blas/${release_dir}/libggml-blas.a"
+        "${base_dir}/${build_dir}/common/${release_dir}/libcommon.a"
+        "${base_dir}/${build_dir}/tools/mtmd/${release_dir}/libmtmd.a"
     )
 
     # Create temporary directory for processing
@@ -327,7 +336,7 @@ combine_static_libraries() {
         $arch_flags \
         $min_version_flag \
         -Wl,-force_load,"${temp_dir}/combined.a" \
-        -framework Foundation -framework Metal -framework Accelerate \
+        -framework Foundation -framework Metal -framework Accelerate -framework CoreML \
         -install_name "$install_name" \
         -o "${base_dir}/${output_lib}"
 
diff --git a/tools/batched-bench/CMakeLists.txt b/tools/batched-bench/CMakeLists.txt
index 68ad707f32c98..b8e652c979f13 100644
--- a/tools/batched-bench/CMakeLists.txt
+++ b/tools/batched-bench/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-batched-bench)
 add_executable(${TARGET} batched-bench.cpp)
-install(TARGETS ${TARGET} RUNTIME)
+install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/tools/cvector-generator/CMakeLists.txt b/tools/cvector-generator/CMakeLists.txt
index 49ad9561c82ea..e70a76523d8c4 100644
--- a/tools/cvector-generator/CMakeLists.txt
+++ b/tools/cvector-generator/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-cvector-generator)
 add_executable(${TARGET} cvector-generator.cpp pca.hpp)
-install(TARGETS ${TARGET} RUNTIME)
+install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/tools/export-lora/CMakeLists.txt b/tools/export-lora/CMakeLists.txt
index 310455787a7ef..69330896fd940 100644
--- a/tools/export-lora/CMakeLists.txt
+++ b/tools/export-lora/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-export-lora)
 add_executable(${TARGET} export-lora.cpp)
-install(TARGETS ${TARGET} RUNTIME)
+install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/tools/gguf-split/CMakeLists.txt b/tools/gguf-split/CMakeLists.txt
index c407e2f0af44a..bef3fb86f00fd 100644
--- a/tools/gguf-split/CMakeLists.txt
+++ b/tools/gguf-split/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-gguf-split)
 add_executable(${TARGET} gguf-split.cpp)
-install(TARGETS ${TARGET} RUNTIME)
+install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/tools/imatrix/CMakeLists.txt b/tools/imatrix/CMakeLists.txt
index 412696c47c31c..73d7696dce6bb 100644
--- a/tools/imatrix/CMakeLists.txt
+++ b/tools/imatrix/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-imatrix)
 add_executable(${TARGET} imatrix.cpp)
-install(TARGETS ${TARGET} RUNTIME)
+install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/tools/llama-bench/CMakeLists.txt b/tools/llama-bench/CMakeLists.txt
index 17e3b9b87bae4..de81c0bc5d460 100644
--- a/tools/llama-bench/CMakeLists.txt
+++ b/tools/llama-bench/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-bench)
 add_executable(${TARGET} llama-bench.cpp)
-install(TARGETS ${TARGET} RUNTIME)
+install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/tools/main/CMakeLists.txt b/tools/main/CMakeLists.txt
index af3d9150f8640..f380fcae3c2c8 100644
--- a/tools/main/CMakeLists.txt
+++ b/tools/main/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-cli)
 add_executable(${TARGET} main.cpp)
-install(TARGETS ${TARGET} RUNTIME)
+install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt
index 56d57656302d6..d8ecb6ade93ce 100644
--- a/tools/mtmd/CMakeLists.txt
+++ b/tools/mtmd/CMakeLists.txt
@@ -93,7 +93,7 @@ add_executable(llama-qwen2vl-cli  deprecation-warning.cpp)
 set(TARGET llama-mtmd-cli)
 add_executable         (${TARGET} mtmd-cli.cpp)
 set_target_properties  (${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli)
-install                (TARGETS ${TARGET} RUNTIME)
+install                (TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .)
 target_link_libraries  (${TARGET} PRIVATE common mtmd Threads::Threads)
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
diff --git a/tools/mtmd/mtmd-ios.cpp b/tools/mtmd/mtmd-ios.cpp
index cabe0b5b5ad3b..6c75e6b27bdfa 100644
--- a/tools/mtmd/mtmd-ios.cpp
+++ b/tools/mtmd/mtmd-ios.cpp
@@ -59,8 +59,8 @@ static void set_error(mtmd_ios_context* ctx, const std::string& error) {
 
 mtmd_ios_params mtmd_ios_params_default(void) {
     mtmd_ios_params params = {};
-    params.model_path = nullptr;
-    params.mmproj_path = nullptr;
+    params.model_path = "";
+    params.mmproj_path = "";
     params.n_predict = -1;
     params.n_ctx = 4096;
     params.n_threads = 4;
@@ -70,7 +70,7 @@ mtmd_ios_params mtmd_ios_params_default(void) {
 }
 
 mtmd_ios_context* mtmd_ios_init(const mtmd_ios_params* params) {
-    if (!params || !params->model_path || !params->mmproj_path) {
+    if (!params || params->model_path.empty() || params->mmproj_path.empty()) {
         return nullptr;
     }
     
@@ -133,7 +133,7 @@ mtmd_ios_context* mtmd_ios_init(const mtmd_ios_params* params) {
     mparams.n_threads = params->n_threads;
     mparams.verbosity = GGML_LOG_LEVEL_INFO;
     
-    ctx->ctx_vision.reset(mtmd_init_from_file(params->mmproj_path, ctx->model, mparams));
+    ctx->ctx_vision.reset(mtmd_init_from_file(params->mmproj_path.c_str(), ctx->model, mparams));
     if (!ctx->ctx_vision.get()) {
         set_error(ctx.get(), "Failed to load vision model from " + std::string(params->mmproj_path));
         return nullptr;
@@ -148,14 +148,14 @@ void mtmd_ios_free(mtmd_ios_context* ctx) {
     }
 }
 
-int mtmd_ios_prefill_image(mtmd_ios_context* ctx, const char* image_path) {
-    if (!ctx || !image_path) {
+int mtmd_ios_prefill_image(mtmd_ios_context* ctx, const std::string& image_path) {
+    if (!ctx || image_path.empty()) {
         return -1;
     }
     
-    mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx->ctx_vision.get(), image_path));
+    mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx->ctx_vision.get(), image_path.c_str()));
     if (!bmp.ptr) {
-        set_error(ctx, "Failed to load image from file: " + std::string(image_path));
+        set_error(ctx, "Failed to load image from file: " + image_path);
         return -1;
     }
     ctx->bitmaps.entries.push_back(std::move(bmp));
@@ -199,14 +199,14 @@ int mtmd_ios_prefill_image(mtmd_ios_context* ctx, const char* image_path) {
 
 
 
-int mtmd_ios_prefill_text(mtmd_ios_context* ctx, const char* text, const char* role) {
-    if (!ctx || !text || !role) {
+int mtmd_ios_prefill_text(mtmd_ios_context* ctx, const std::string& text, const std::string& role) {
+    if (!ctx || text.empty() || role.empty()) {
         return -1;
     }
     
     common_chat_msg msg;
-    msg.role = role;
-    msg.content = text;
+    msg.role = role.c_str();
+    msg.content = text.c_str();
     
     common_chat_templates_inputs tmpl_inputs;
     tmpl_inputs.messages = {msg};
diff --git a/tools/mtmd/mtmd-ios.h b/tools/mtmd/mtmd-ios.h
index 60aa83cd5b0ff..b71453b4b9e4e 100644
--- a/tools/mtmd/mtmd-ios.h
+++ b/tools/mtmd/mtmd-ios.h
@@ -1,63 +1,82 @@
 #ifndef MTMD_IOS_H
 #define MTMD_IOS_H
 
-#include "mtmd.h"
-#include "mtmd-helper.h"
-#include "common.h"
-#include "sampling.h"
-#include "llama.h"
-#include "ggml.h"
-#include "chat.h"
-
 #include <string>
-#include <vector>
-#include <functional>
-#include <memory>
+
+#include "ggml.h"
+#include "llama.h"
+#include "mtmd-helper.h"
+#include "mtmd.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-struct mtmd_ios_context;
-struct mtmd_ios_params;
-
-struct mtmd_ios_params {
-    const char* model_path;
-    const char* mmproj_path;
-    
-    int n_predict;
-    int n_ctx;
-    int n_threads;
-    float temperature;
-    
-    bool use_gpu;
-    bool mmproj_use_gpu;
-    bool warmup;
-};
-
-
-
-mtmd_ios_context* mtmd_ios_init(const mtmd_ios_params* params);
-void mtmd_ios_free(mtmd_ios_context* ctx);
-
+// Context structure
+typedef struct mtmd_ios_context mtmd_ios_context;
+
+// Parameters structure
+typedef struct mtmd_ios_params {
+    std::string model_path;
+    std::string mmproj_path;
+    int         n_predict;
+    int         n_ctx;
+    int         n_threads;
+    float       temperature;
+    bool        use_gpu;
+    bool        mmproj_use_gpu;
+    bool        warmup;
+} mtmd_ios_params;
+
+// Initialize; returns a new context on success, NULL on failure
+// Parameters:
+// params: parameters
+mtmd_ios_context * mtmd_ios_init(const mtmd_ios_params * params);
+
+// Free resources
+// Parameters:
+// ctx: context
+void mtmd_ios_free(mtmd_ios_context * ctx);
+
+// Get default parameters
 mtmd_ios_params mtmd_ios_params_default(void);
 
-int mtmd_ios_prefill_image(mtmd_ios_context* ctx, const char* image_path);
-int mtmd_ios_prefill_text(mtmd_ios_context* ctx, const char* text, const char* role);
+// Prefill image, returns 0 on success, -1 on failure
+// Parameters:
+// ctx: context
+// image_path: image path
+int mtmd_ios_prefill_image(mtmd_ios_context * ctx, const std::string & image_path);
 
+// Prefill text, returns 0 on success, -1 on failure
+// Parameters:
+// ctx: context
+// text: text
+// role: role
+int mtmd_ios_prefill_text(mtmd_ios_context * ctx, const std::string & text, const std::string & role);
+
+// Loop return value structure
 typedef struct {
-    char* token;
-    bool is_end;
+    char * token;
+    bool   is_end;
 } mtmd_ios_token;
 
-mtmd_ios_token mtmd_ios_loop(mtmd_ios_context* ctx);
+// Loop: returns the next token (free it with mtmd_ios_string_free); is_end is set on end of generation or on failure
+// Parameters:
+// ctx: context
+mtmd_ios_token mtmd_ios_loop(mtmd_ios_context * ctx);
 
-const char* mtmd_ios_get_last_error(mtmd_ios_context* ctx);
+// Get last error message
+// Parameters:
+// ctx: context
+const char * mtmd_ios_get_last_error(mtmd_ios_context * ctx);
 
-void mtmd_ios_string_free(char* str);
+// Free string
+// Parameters:
+// str: string
+void mtmd_ios_string_free(char * str);
 
 #ifdef __cplusplus
 }
 #endif
 
-#endif 
\ No newline at end of file
+#endif
diff --git a/tools/perplexity/CMakeLists.txt b/tools/perplexity/CMakeLists.txt
index 3e68640933afb..ed0825d8eda69 100644
--- a/tools/perplexity/CMakeLists.txt
+++ b/tools/perplexity/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-perplexity)
 add_executable(${TARGET} perplexity.cpp)
-install(TARGETS ${TARGET} RUNTIME)
+install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/tools/quantize/CMakeLists.txt b/tools/quantize/CMakeLists.txt
index 47e5cbe30cfe3..a5575124aef20 100644
--- a/tools/quantize/CMakeLists.txt
+++ b/tools/quantize/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(TARGET llama-quantize)
 add_executable(${TARGET} quantize.cpp)
-install(TARGETS ${TARGET} RUNTIME)
+install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_include_directories(${TARGET} PRIVATE ../../common)
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/tools/run/CMakeLists.txt b/tools/run/CMakeLists.txt
index 7cff188ca69f0..e86e1113e1128 100644
--- a/tools/run/CMakeLists.txt
+++ b/tools/run/CMakeLists.txt
@@ -11,6 +11,6 @@ if (LLAMA_CURL)
     set(LLAMA_RUN_EXTRA_LIBS ${LLAMA_RUN_EXTRA_LIBS} ${CURL_LIBRARY})
 endif ()
 
-install(TARGETS ${TARGET} RUNTIME)
+install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT} ${LLAMA_RUN_EXTRA_LIBS})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/tools/tokenize/CMakeLists.txt b/tools/tokenize/CMakeLists.txt
index 1690b53e5d52b..f9dcd270d5f05 100644
--- a/tools/tokenize/CMakeLists.txt
+++ b/tools/tokenize/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-tokenize)
 add_executable(${TARGET} tokenize.cpp)
-install(TARGETS ${TARGET} RUNTIME)
+install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/tools/tts/CMakeLists.txt b/tools/tts/CMakeLists.txt
index c72bd814c3b31..da50e9bf848a8 100644
--- a/tools/tts/CMakeLists.txt
+++ b/tools/tts/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-tts)
 add_executable(${TARGET} tts.cpp)
-install(TARGETS ${TARGET} RUNTIME)
+install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .)
 target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)

From 8775bd5bb02039218f78df534d5f3e740ebb5277 Mon Sep 17 00:00:00 2001
From: tc_mb <caitianchi@modelbest.cn>
Date: Sun, 20 Jul 2025 14:47:55 +0800
Subject: [PATCH 06/15] fix for app

---
 build-xcframework.sh                   |  13 +++-
 tools/batched-bench/CMakeLists.txt     |   2 +-
 tools/cvector-generator/CMakeLists.txt |   2 +-
 tools/export-lora/CMakeLists.txt       |   2 +-
 tools/gguf-split/CMakeLists.txt        |   2 +-
 tools/imatrix/CMakeLists.txt           |   2 +-
 tools/llama-bench/CMakeLists.txt       |   2 +-
 tools/main/CMakeLists.txt              |   2 +-
 tools/mtmd/CMakeLists.txt              |   2 +-
 tools/mtmd/mtmd-ios.cpp                |  24 +++---
 tools/mtmd/mtmd-ios.h                  | 101 +++++++++++++++----------
 tools/perplexity/CMakeLists.txt        |   2 +-
 tools/quantize/CMakeLists.txt          |   2 +-
 tools/run/CMakeLists.txt               |   2 +-
 tools/tokenize/CMakeLists.txt          |   2 +-
 tools/tts/CMakeLists.txt               |   2 +-
 16 files changed, 96 insertions(+), 68 deletions(-)

diff --git a/build-xcframework.sh b/build-xcframework.sh
index f813984db9dbd..8a8f2af41df56 100755
--- a/build-xcframework.sh
+++ b/build-xcframework.sh
@@ -8,7 +8,7 @@ TVOS_MIN_OS_VERSION=16.4
 
 BUILD_SHARED_LIBS=OFF
 LLAMA_BUILD_EXAMPLES=OFF
-LLAMA_BUILD_TOOLS=OFF
+LLAMA_BUILD_TOOLS=ON
 LLAMA_BUILD_TESTS=OFF
 LLAMA_BUILD_SERVER=OFF
 GGML_METAL=ON
@@ -124,6 +124,10 @@ setup_framework_structure() {
     cp ggml/include/ggml-cpu.h     ${header_path}
     cp ggml/include/ggml-blas.h    ${header_path}
     cp ggml/include/gguf.h         ${header_path}
+    # Copy mtmd-ios headers and dependencies
+    cp tools/mtmd/mtmd-ios.h        ${header_path}
+    cp tools/mtmd/mtmd.h            ${header_path}
+    cp tools/mtmd/mtmd-helper.h     ${header_path}
 
     # Create module map (common for all platforms)
     cat > ${module_path}module.modulemap << EOF
@@ -136,6 +140,9 @@ framework module llama {
     header "ggml-cpu.h"
     header "ggml-blas.h"
     header "gguf.h"
+    header "mtmd-ios.h"
+    header "mtmd.h"
+    header "mtmd-helper.h"
 
     link "c++"
     link framework "Accelerate"
@@ -252,6 +259,8 @@ combine_static_libraries() {
         "${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml-cpu.a"
         "${base_dir}/${build_dir}/ggml/src/ggml-metal/${release_dir}/libggml-metal.a"
         "${base_dir}/${build_dir}/ggml/src/ggml-blas/${release_dir}/libggml-blas.a"
+        "${base_dir}/${build_dir}/common/${release_dir}/libcommon.a"
+        "${base_dir}/${build_dir}/tools/mtmd/${release_dir}/libmtmd.a"
     )
 
     # Create temporary directory for processing
@@ -327,7 +336,7 @@ combine_static_libraries() {
         $arch_flags \
         $min_version_flag \
         -Wl,-force_load,"${temp_dir}/combined.a" \
-        -framework Foundation -framework Metal -framework Accelerate \
+        -framework Foundation -framework Metal -framework Accelerate -framework CoreML \
         -install_name "$install_name" \
         -o "${base_dir}/${output_lib}"
 
diff --git a/tools/batched-bench/CMakeLists.txt b/tools/batched-bench/CMakeLists.txt
index 68ad707f32c98..b8e652c979f13 100644
--- a/tools/batched-bench/CMakeLists.txt
+++ b/tools/batched-bench/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-batched-bench)
 add_executable(${TARGET} batched-bench.cpp)
-install(TARGETS ${TARGET} RUNTIME)
+install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/tools/cvector-generator/CMakeLists.txt b/tools/cvector-generator/CMakeLists.txt
index 49ad9561c82ea..e70a76523d8c4 100644
--- a/tools/cvector-generator/CMakeLists.txt
+++ b/tools/cvector-generator/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-cvector-generator)
 add_executable(${TARGET} cvector-generator.cpp pca.hpp)
-install(TARGETS ${TARGET} RUNTIME)
+install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/tools/export-lora/CMakeLists.txt b/tools/export-lora/CMakeLists.txt
index 310455787a7ef..69330896fd940 100644
--- a/tools/export-lora/CMakeLists.txt
+++ b/tools/export-lora/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-export-lora)
 add_executable(${TARGET} export-lora.cpp)
-install(TARGETS ${TARGET} RUNTIME)
+install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/tools/gguf-split/CMakeLists.txt b/tools/gguf-split/CMakeLists.txt
index c407e2f0af44a..bef3fb86f00fd 100644
--- a/tools/gguf-split/CMakeLists.txt
+++ b/tools/gguf-split/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-gguf-split)
 add_executable(${TARGET} gguf-split.cpp)
-install(TARGETS ${TARGET} RUNTIME)
+install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/tools/imatrix/CMakeLists.txt b/tools/imatrix/CMakeLists.txt
index 412696c47c31c..73d7696dce6bb 100644
--- a/tools/imatrix/CMakeLists.txt
+++ b/tools/imatrix/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-imatrix)
 add_executable(${TARGET} imatrix.cpp)
-install(TARGETS ${TARGET} RUNTIME)
+install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/tools/llama-bench/CMakeLists.txt b/tools/llama-bench/CMakeLists.txt
index 17e3b9b87bae4..de81c0bc5d460 100644
--- a/tools/llama-bench/CMakeLists.txt
+++ b/tools/llama-bench/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-bench)
 add_executable(${TARGET} llama-bench.cpp)
-install(TARGETS ${TARGET} RUNTIME)
+install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/tools/main/CMakeLists.txt b/tools/main/CMakeLists.txt
index af3d9150f8640..f380fcae3c2c8 100644
--- a/tools/main/CMakeLists.txt
+++ b/tools/main/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-cli)
 add_executable(${TARGET} main.cpp)
-install(TARGETS ${TARGET} RUNTIME)
+install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt
index 56d57656302d6..d8ecb6ade93ce 100644
--- a/tools/mtmd/CMakeLists.txt
+++ b/tools/mtmd/CMakeLists.txt
@@ -93,7 +93,7 @@ add_executable(llama-qwen2vl-cli  deprecation-warning.cpp)
 set(TARGET llama-mtmd-cli)
 add_executable         (${TARGET} mtmd-cli.cpp)
 set_target_properties  (${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli)
-install                (TARGETS ${TARGET} RUNTIME)
+install                (TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .)
 target_link_libraries  (${TARGET} PRIVATE common mtmd Threads::Threads)
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
diff --git a/tools/mtmd/mtmd-ios.cpp b/tools/mtmd/mtmd-ios.cpp
index cabe0b5b5ad3b..6c75e6b27bdfa 100644
--- a/tools/mtmd/mtmd-ios.cpp
+++ b/tools/mtmd/mtmd-ios.cpp
@@ -59,8 +59,8 @@ static void set_error(mtmd_ios_context* ctx, const std::string& error) {
 
 mtmd_ios_params mtmd_ios_params_default(void) {
     mtmd_ios_params params = {};
-    params.model_path = nullptr;
-    params.mmproj_path = nullptr;
+    params.model_path = "";
+    params.mmproj_path = "";
     params.n_predict = -1;
     params.n_ctx = 4096;
     params.n_threads = 4;
@@ -70,7 +70,7 @@ mtmd_ios_params mtmd_ios_params_default(void) {
 }
 
 mtmd_ios_context* mtmd_ios_init(const mtmd_ios_params* params) {
-    if (!params || !params->model_path || !params->mmproj_path) {
+    if (!params || params->model_path.empty() || params->mmproj_path.empty()) {
         return nullptr;
     }
     
@@ -133,7 +133,7 @@ mtmd_ios_context* mtmd_ios_init(const mtmd_ios_params* params) {
     mparams.n_threads = params->n_threads;
     mparams.verbosity = GGML_LOG_LEVEL_INFO;
     
-    ctx->ctx_vision.reset(mtmd_init_from_file(params->mmproj_path, ctx->model, mparams));
+    ctx->ctx_vision.reset(mtmd_init_from_file(params->mmproj_path.c_str(), ctx->model, mparams));
     if (!ctx->ctx_vision.get()) {
         set_error(ctx.get(), "Failed to load vision model from " + std::string(params->mmproj_path));
         return nullptr;
@@ -148,14 +148,14 @@ void mtmd_ios_free(mtmd_ios_context* ctx) {
     }
 }
 
-int mtmd_ios_prefill_image(mtmd_ios_context* ctx, const char* image_path) {
-    if (!ctx || !image_path) {
+int mtmd_ios_prefill_image(mtmd_ios_context* ctx, const std::string& image_path) {
+    if (!ctx || image_path.empty()) {
         return -1;
     }
     
-    mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx->ctx_vision.get(), image_path));
+    mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx->ctx_vision.get(), image_path.c_str()));
     if (!bmp.ptr) {
-        set_error(ctx, "Failed to load image from file: " + std::string(image_path));
+        set_error(ctx, "Failed to load image from file: " + image_path);
         return -1;
     }
     ctx->bitmaps.entries.push_back(std::move(bmp));
@@ -199,14 +199,14 @@ int mtmd_ios_prefill_image(mtmd_ios_context* ctx, const char* image_path) {
 
 
 
-int mtmd_ios_prefill_text(mtmd_ios_context* ctx, const char* text, const char* role) {
-    if (!ctx || !text || !role) {
+int mtmd_ios_prefill_text(mtmd_ios_context* ctx, const std::string& text, const std::string& role) {
+    if (!ctx || text.empty() || role.empty()) {
         return -1;
     }
     
     common_chat_msg msg;
-    msg.role = role;
-    msg.content = text;
+    msg.role = role.c_str();
+    msg.content = text.c_str();
     
     common_chat_templates_inputs tmpl_inputs;
     tmpl_inputs.messages = {msg};
diff --git a/tools/mtmd/mtmd-ios.h b/tools/mtmd/mtmd-ios.h
index 60aa83cd5b0ff..b71453b4b9e4e 100644
--- a/tools/mtmd/mtmd-ios.h
+++ b/tools/mtmd/mtmd-ios.h
@@ -1,63 +1,82 @@
 #ifndef MTMD_IOS_H
 #define MTMD_IOS_H
 
-#include "mtmd.h"
-#include "mtmd-helper.h"
-#include "common.h"
-#include "sampling.h"
-#include "llama.h"
-#include "ggml.h"
-#include "chat.h"
-
 #include <string>
-#include <vector>
-#include <functional>
-#include <memory>
+
+#include "ggml.h"
+#include "llama.h"
+#include "mtmd-helper.h"
+#include "mtmd.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-struct mtmd_ios_context;
-struct mtmd_ios_params;
-
-struct mtmd_ios_params {
-    const char* model_path;
-    const char* mmproj_path;
-    
-    int n_predict;
-    int n_ctx;
-    int n_threads;
-    float temperature;
-    
-    bool use_gpu;
-    bool mmproj_use_gpu;
-    bool warmup;
-};
-
-
-
-mtmd_ios_context* mtmd_ios_init(const mtmd_ios_params* params);
-void mtmd_ios_free(mtmd_ios_context* ctx);
-
+// Context structure
+typedef struct mtmd_ios_context mtmd_ios_context;
+
+// Parameters structure
+typedef struct mtmd_ios_params {
+    std::string model_path;
+    std::string mmproj_path;
+    int         n_predict;
+    int         n_ctx;
+    int         n_threads;
+    float       temperature;
+    bool        use_gpu;
+    bool        mmproj_use_gpu;
+    bool        warmup;
+} mtmd_ios_params;
+
+// Initialize, returns 0 on success, -1 on failure
+// Parameters:
+// params: parameters
+mtmd_ios_context * mtmd_ios_init(const mtmd_ios_params * params);
+
+// Free resources
+// Parameters:
+// ctx: context
+void mtmd_ios_free(mtmd_ios_context * ctx);
+
+// Get default parameters
 mtmd_ios_params mtmd_ios_params_default(void);
 
-int mtmd_ios_prefill_image(mtmd_ios_context* ctx, const char* image_path);
-int mtmd_ios_prefill_text(mtmd_ios_context* ctx, const char* text, const char* role);
+// Prefill image, returns 0 on success, -1 on failure
+// Parameters:
+// ctx: context
+// image_path: image path
+int mtmd_ios_prefill_image(mtmd_ios_context * ctx, const std::string & image_path);
 
+// Prefill text, returns 0 on success, -1 on failure
+// Parameters:
+// ctx: context
+// text: text
+// role: role
+int mtmd_ios_prefill_text(mtmd_ios_context * ctx, const std::string & text, const std::string & role);
+
+// Loop return value structure
 typedef struct {
-    char* token;
-    bool is_end;
+    char * token;
+    bool   is_end;
 } mtmd_ios_token;
 
-mtmd_ios_token mtmd_ios_loop(mtmd_ios_context* ctx);
+// Loop, returns 0 on success, -1 on failure
+// Parameters:
+// ctx: context
+mtmd_ios_token mtmd_ios_loop(mtmd_ios_context * ctx);
 
-const char* mtmd_ios_get_last_error(mtmd_ios_context* ctx);
+// Get last error message
+// Parameters:
+// ctx: context
+const char * mtmd_ios_get_last_error(mtmd_ios_context * ctx);
 
-void mtmd_ios_string_free(char* str);
+// Free string
+// Parameters:
+// str: string
+void mtmd_ios_string_free(char * str);
 
 #ifdef __cplusplus
 }
 #endif
 
-#endif 
\ No newline at end of file
+#endif
diff --git a/tools/perplexity/CMakeLists.txt b/tools/perplexity/CMakeLists.txt
index 3e68640933afb..ed0825d8eda69 100644
--- a/tools/perplexity/CMakeLists.txt
+++ b/tools/perplexity/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-perplexity)
 add_executable(${TARGET} perplexity.cpp)
-install(TARGETS ${TARGET} RUNTIME)
+install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/tools/quantize/CMakeLists.txt b/tools/quantize/CMakeLists.txt
index 47e5cbe30cfe3..a5575124aef20 100644
--- a/tools/quantize/CMakeLists.txt
+++ b/tools/quantize/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(TARGET llama-quantize)
 add_executable(${TARGET} quantize.cpp)
-install(TARGETS ${TARGET} RUNTIME)
+install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_include_directories(${TARGET} PRIVATE ../../common)
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/tools/run/CMakeLists.txt b/tools/run/CMakeLists.txt
index 7cff188ca69f0..e86e1113e1128 100644
--- a/tools/run/CMakeLists.txt
+++ b/tools/run/CMakeLists.txt
@@ -11,6 +11,6 @@ if (LLAMA_CURL)
     set(LLAMA_RUN_EXTRA_LIBS ${LLAMA_RUN_EXTRA_LIBS} ${CURL_LIBRARY})
 endif ()
 
-install(TARGETS ${TARGET} RUNTIME)
+install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT} ${LLAMA_RUN_EXTRA_LIBS})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/tools/tokenize/CMakeLists.txt b/tools/tokenize/CMakeLists.txt
index 1690b53e5d52b..f9dcd270d5f05 100644
--- a/tools/tokenize/CMakeLists.txt
+++ b/tools/tokenize/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-tokenize)
 add_executable(${TARGET} tokenize.cpp)
-install(TARGETS ${TARGET} RUNTIME)
+install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/tools/tts/CMakeLists.txt b/tools/tts/CMakeLists.txt
index c72bd814c3b31..da50e9bf848a8 100644
--- a/tools/tts/CMakeLists.txt
+++ b/tools/tts/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-tts)
 add_executable(${TARGET} tts.cpp)
-install(TARGETS ${TARGET} RUNTIME)
+install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .)
 target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)

From 2e7bcd36ae9b4824e05ee5ebcecb81e52c91d711 Mon Sep 17 00:00:00 2001
From: tc_mb <caitianchi@modelbest.cn>
Date: Sun, 20 Jul 2025 14:58:13 +0800
Subject: [PATCH 07/15] temp no use slice

---
 tools/mtmd/clip.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index e422019055ec0..730429fa65b05 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -3186,6 +3186,7 @@ struct llava_uhd {
         const bool has_slices    = original_size.width > slice_size || original_size.height > slice_size;
         const bool has_pinpoints = !ctx->model.hparams.image_res_candidates.empty();
 
+        has_slices = false;
         if (!has_slices) {
             // skip slicing logic
             res.overview_size = clip_image_size{slice_size, slice_size};

From d4f0cfe14653bad0faecdcf877803e12bb84fd62 Mon Sep 17 00:00:00 2001
From: tc_mb <caitianchi@modelbest.cn>
Date: Sun, 20 Jul 2025 15:04:25 +0800
Subject: [PATCH 08/15] fix no use slice

---
 tools/mtmd/clip.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 730429fa65b05..5b8285190d7e7 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -3183,7 +3183,7 @@ struct llava_uhd {
         const int original_width  = original_size.width;
         const int original_height = original_size.height;
 
-        const bool has_slices    = original_size.width > slice_size || original_size.height > slice_size;
+        bool has_slices    = original_size.width > slice_size || original_size.height > slice_size;
         const bool has_pinpoints = !ctx->model.hparams.image_res_candidates.empty();
 
         has_slices = false;

From ea6910375d636b78e55438792cff64c378f09509 Mon Sep 17 00:00:00 2001
From: shiguoliang <shiguoliang@modelbest.cn>
Date: Mon, 21 Jul 2025 19:42:09 +0800
Subject: [PATCH 09/15] feat: add clean kv cache

---
 tools/mtmd/mtmd-ios.cpp | 16 ++++++++++++++++
 tools/mtmd/mtmd-ios.h   | 27 ++++++++++++++++-----------
 2 files changed, 32 insertions(+), 11 deletions(-)

diff --git a/tools/mtmd/mtmd-ios.cpp b/tools/mtmd/mtmd-ios.cpp
index 6c75e6b27bdfa..bf8de2ac1cb65 100644
--- a/tools/mtmd/mtmd-ios.cpp
+++ b/tools/mtmd/mtmd-ios.cpp
@@ -149,6 +149,7 @@ void mtmd_ios_free(mtmd_ios_context* ctx) {
 }
 
 int mtmd_ios_prefill_image(mtmd_ios_context* ctx, const std::string& image_path) {
+
     if (!ctx || image_path.empty()) {
         return -1;
     }
@@ -292,3 +293,18 @@ mtmd_ios_token mtmd_ios_loop(mtmd_ios_context* ctx) {
 const char* mtmd_ios_get_last_error(mtmd_ios_context* ctx) {
     return ctx ? ctx->last_error.c_str() : nullptr;
 }
+
+bool mtmd_ios_clean_kv_cache(mtmd_ios_context* ctx) {
+    if (!ctx) {
+        return false;
+    }
+ 
+    // Clear the kv-cache and reset the sequence position
+    ctx->n_past = 0;
+    llama_kv_self_seq_rm(ctx->lctx, 0, 0, -1);
+
+    // Clear the batch state
+    common_batch_clear(ctx->batch);
+    
+    return true;
+}
\ No newline at end of file
diff --git a/tools/mtmd/mtmd-ios.h b/tools/mtmd/mtmd-ios.h
index b71453b4b9e4e..92a1f0e47c627 100644
--- a/tools/mtmd/mtmd-ios.h
+++ b/tools/mtmd/mtmd-ios.h
@@ -8,14 +8,10 @@
 #include "mtmd-helper.h"
 #include "mtmd.h"
 
-#ifdef __cplusplus
-extern "C" {
-#endif
-
 // Context structure
 typedef struct mtmd_ios_context mtmd_ios_context;
 
-// Parameters structure
+// Parameters structure (C++ only)
 typedef struct mtmd_ios_params {
     std::string model_path;
     std::string mmproj_path;
@@ -28,6 +24,16 @@ typedef struct mtmd_ios_params {
     bool        warmup;
 } mtmd_ios_params;
 
+// Loop return value structure (C++ only)
+typedef struct {
+    char * token;
+    bool   is_end;
+} mtmd_ios_token;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 // Initialize, returns 0 on success, -1 on failure
 // Parameters:
 // params: parameters
@@ -54,12 +60,6 @@ int mtmd_ios_prefill_image(mtmd_ios_context * ctx, const std::string & image_pat
 // role: role
 int mtmd_ios_prefill_text(mtmd_ios_context * ctx, const std::string & text, const std::string & role);
 
-// Loop return value structure
-typedef struct {
-    char * token;
-    bool   is_end;
-} mtmd_ios_token;
-
 // Loop, returns 0 on success, -1 on failure
 // Parameters:
 // ctx: context
@@ -75,6 +75,11 @@ const char * mtmd_ios_get_last_error(mtmd_ios_context * ctx);
 // str: string
 void mtmd_ios_string_free(char * str);
 
+// Clean kv-cache
+// Parameters:
+// ctx: context
+bool mtmd_ios_clean_kv_cache(mtmd_ios_context * ctx);
+
 #ifdef __cplusplus
 }
 #endif

From 2fd1ef7f814de5aeb99c814153d48c2419f80db2 Mon Sep 17 00:00:00 2001
From: tc-mb <caitianchi@modelbest.cn>
Date: Thu, 31 Jul 2025 18:44:33 +0800
Subject: [PATCH 10/15] rename ane

---
 tools/mtmd/CMakeLists.txt                     | 14 ++--
 tools/mtmd/{ => ane}/ane.h                    |  0
 tools/mtmd/{ => ane}/ane.mm                   |  8 +--
 .../ane_minicpmv4_vit_f16.h}                  | 66 +++++++++---------
 .../ane_minicpmv4_vit_f16.m}                  | 68 +++++++++----------
 tools/mtmd/clip.cpp                           | 13 +++-
 .../minicpmv-convert-image-encoder-to-gguf.py |  8 ++-
 tools/mtmd/mtmd-ios-example.cpp               |  4 +-
 8 files changed, 96 insertions(+), 85 deletions(-)
 rename tools/mtmd/{ => ane}/ane.h (100%)
 rename tools/mtmd/{ => ane}/ane.mm (81%)
 rename tools/mtmd/{ane_minicpm4v3b_vision_f16_b1.h => ane/ane_minicpmv4_vit_f16.h} (60%)
 rename tools/mtmd/{ane_minicpm4v3b_vision_f16_b1.m => ane/ane_minicpmv4_vit_f16.m} (63%)

diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt
index d8ecb6ade93ce..c2643875664e0 100644
--- a/tools/mtmd/CMakeLists.txt
+++ b/tools/mtmd/CMakeLists.txt
@@ -18,15 +18,15 @@ add_library(mtmd
 # 在Apple平台上添加ANE相关文件
 if(APPLE)
     target_sources(mtmd PRIVATE
-        ane.h
-        ane.mm
-        ane_minicpm4v3b_vision_f16_b1.h
-        ane_minicpm4v3b_vision_f16_b1.m
+        ane/ane.h
+        ane/ane.mm
+        ane/ane_minicpmv4_vit_f16.h
+        ane/ane_minicpmv4_vit_f16.m
     )
     
     # 为Objective-C文件启用ARC
-    set_source_files_properties(ane.mm PROPERTIES COMPILE_FLAGS "-fobjc-arc")
-    set_source_files_properties(ane_minicpm4v3b_vision_f16_b1.m PROPERTIES COMPILE_FLAGS "-fobjc-arc")
+    set_source_files_properties(ane/ane.mm PROPERTIES COMPILE_FLAGS "-fobjc-arc")
+    set_source_files_properties(ane/ane_minicpmv4_vit_f16.m PROPERTIES COMPILE_FLAGS "-fobjc-arc")
 endif()
 
 target_link_libraries     (mtmd PUBLIC ggml llama common)
@@ -65,7 +65,7 @@ set(MTMD_PUBLIC_HEADERS
 # 在Apple平台上添加ANE公共头文件
 if(APPLE)
     list(APPEND MTMD_PUBLIC_HEADERS
-        ${CMAKE_CURRENT_SOURCE_DIR}/ane.h
+        ${CMAKE_CURRENT_SOURCE_DIR}/ane/ane.h
     )
 endif()
 
diff --git a/tools/mtmd/ane.h b/tools/mtmd/ane/ane.h
similarity index 100%
rename from tools/mtmd/ane.h
rename to tools/mtmd/ane/ane.h
diff --git a/tools/mtmd/ane.mm b/tools/mtmd/ane/ane.mm
similarity index 81%
rename from tools/mtmd/ane.mm
rename to tools/mtmd/ane/ane.mm
index 8f92641068df0..0063604341a0d 100644
--- a/tools/mtmd/ane.mm
+++ b/tools/mtmd/ane/ane.mm
@@ -1,7 +1,7 @@
 #import <CoreML/CoreML.h>
 #import <Accelerate/Accelerate.h>
 #import "ane.h"
-#import "ane_minicpm4v3b_vision_f16_b1.h"
+#import "ane_minicpmv4_vit_f16.h"
 #include <stdlib.h>
 
 #if __cplusplus
@@ -14,12 +14,12 @@
     NSFileManager *fileManager = [NSFileManager defaultManager];
     // 获取应用的 Documents 目录的 URL
     NSURL *documentsURL = [[fileManager URLsForDirectory:NSDocumentDirectory inDomains:NSUserDomainMask] firstObject];
-    NSString *pathString = [documentsURL.absoluteString stringByAppendingString:@"ane_minicpm4v3b_vision_f16_b1.mlmodelc"];
+    NSString *pathString = [documentsURL.absoluteString stringByAppendingString:@"ane_minicpmv4_vit_f16.mlmodelc"];
     NSURL *modelURL = [NSURL URLWithString:pathString];
 
     NSLog(modelURL.absoluteString);
 
-    const void* model = CFBridgingRetain([[ane_minicpm4v3b_vision_f16_b1 alloc] initWithContentsOfURL:modelURL error:nil]);
+    const void* model = CFBridgingRetain([[ane_minicpmv4_vit_f16 alloc] initWithContentsOfURL:modelURL error:nil]);
     return model;
 }
 
@@ -31,7 +31,7 @@ void predictWith(const void* model, float* embed, float* encoderOutput) {
                                                                 deallocator: nil
                                                                       error: nil];
 
-    ane_minicpm4v3b_vision_f16_b1Output *modelOutput = [(__bridge id)model predictionFromInput:inMultiArray error:nil];
+    ane_minicpmv4_vit_f16Output *modelOutput = [(__bridge id)model predictionFromInput:inMultiArray error:nil];
 
     MLMultiArray *outMA = modelOutput.output;
 
diff --git a/tools/mtmd/ane_minicpm4v3b_vision_f16_b1.h b/tools/mtmd/ane/ane_minicpmv4_vit_f16.h
similarity index 60%
rename from tools/mtmd/ane_minicpm4v3b_vision_f16_b1.h
rename to tools/mtmd/ane/ane_minicpmv4_vit_f16.h
index 214b9c5c134c9..d6b9a29e857ab 100644
--- a/tools/mtmd/ane_minicpm4v3b_vision_f16_b1.h
+++ b/tools/mtmd/ane/ane_minicpmv4_vit_f16.h
@@ -1,5 +1,5 @@
 //
-// ane_minicpm4v3b_vision_f16_b1.h
+// ane_minicpmv4_vit_f16.h
 //
 // This file was automatically generated and should not be edited.
 //
@@ -13,7 +13,7 @@ NS_ASSUME_NONNULL_BEGIN
 
 /// Model Prediction Input Type
 API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden")))
-@interface ane_minicpm4v3b_vision_f16_b1Input : NSObject<MLFeatureProvider>
+@interface ane_minicpmv4_vit_f16Input : NSObject<MLFeatureProvider>
 
 /// input as 1 × 1024 × 1152 3-dimensional array of floats
 @property (readwrite, nonatomic, strong) MLMultiArray * input;
@@ -24,7 +24,7 @@ API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((v
 
 /// Model Prediction Output Type
 API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden")))
-@interface ane_minicpm4v3b_vision_f16_b1Output : NSObject<MLFeatureProvider>
+@interface ane_minicpmv4_vit_f16Output : NSObject<MLFeatureProvider>
 
 /// output as 1 × 1024 × 1152 3-dimensional array of floats
 @property (readwrite, nonatomic, strong) MLMultiArray * output;
@@ -35,7 +35,7 @@ API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((v
 
 /// Class for model loading and prediction
 API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden")))
-@interface ane_minicpm4v3b_vision_f16_b1 : NSObject
+@interface ane_minicpmv4_vit_f16 : NSObject
 @property (readonly, nonatomic, nullable) MLModel * model;
 
 /**
@@ -44,20 +44,20 @@ API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((v
 + (nullable NSURL *)URLOfModelInThisBundle;
 
 /**
-    Initialize ane_minicpm4v3b_vision_f16_b1 instance from an existing MLModel object.
+    Initialize ane_minicpmv4_vit_f16 instance from an existing MLModel object.
 
-    Usually the application does not use this initializer unless it makes a subclass of ane_minicpm4v3b_vision_f16_b1.
+    Usually the application does not use this initializer unless it makes a subclass of ane_minicpmv4_vit_f16.
     Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in.
 */
 - (instancetype)initWithMLModel:(MLModel *)model NS_DESIGNATED_INITIALIZER;
 
 /**
-    Initialize ane_minicpm4v3b_vision_f16_b1 instance with the model in this bundle.
+    Initialize ane_minicpmv4_vit_f16 instance with the model in this bundle.
 */
 - (nullable instancetype)init;
 
 /**
-    Initialize ane_minicpm4v3b_vision_f16_b1 instance with the model in this bundle.
+    Initialize ane_minicpmv4_vit_f16 instance with the model in this bundle.
 
     @param configuration The model configuration object
     @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
@@ -65,90 +65,90 @@ API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((v
 - (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error;
 
 /**
-    Initialize ane_minicpm4v3b_vision_f16_b1 instance from the model URL.
+    Initialize ane_minicpmv4_vit_f16 instance from the model URL.
 
-    @param modelURL URL to the .mlmodelc directory for ane_minicpm4v3b_vision_f16_b1.
+    @param modelURL URL to the .mlmodelc directory for ane_minicpmv4_vit_f16.
     @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
 */
 - (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error;
 
 /**
-    Initialize ane_minicpm4v3b_vision_f16_b1 instance from the model URL.
+    Initialize ane_minicpmv4_vit_f16 instance from the model URL.
 
-    @param modelURL URL to the .mlmodelc directory for ane_minicpm4v3b_vision_f16_b1.
+    @param modelURL URL to the .mlmodelc directory for ane_minicpmv4_vit_f16.
     @param configuration The model configuration object
     @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
 */
 - (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error;
 
 /**
-    Construct ane_minicpm4v3b_vision_f16_b1 instance asynchronously with configuration.
+    Construct ane_minicpmv4_vit_f16 instance asynchronously with configuration.
     Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
 
     @param configuration The model configuration
-    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid ane_minicpm4v3b_vision_f16_b1 instance or NSError object.
+    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid ane_minicpmv4_vit_f16 instance or NSError object.
 */
-+ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(ane_minicpm4v3b_vision_f16_b1 * _Nullable model, NSError * _Nullable error))handler;
++ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(ane_minicpmv4_vit_f16 * _Nullable model, NSError * _Nullable error))handler;
 
 /**
-    Construct ane_minicpm4v3b_vision_f16_b1 instance asynchronously with URL of .mlmodelc directory and optional configuration.
+    Construct ane_minicpmv4_vit_f16 instance asynchronously with URL of .mlmodelc directory and optional configuration.
 
     Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
 
     @param modelURL The model URL.
     @param configuration The model configuration
-    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid ane_minicpm4v3b_vision_f16_b1 instance or NSError object.
+    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid ane_minicpmv4_vit_f16 instance or NSError object.
 */
-+ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(ane_minicpm4v3b_vision_f16_b1 * _Nullable model, NSError * _Nullable error))handler;
++ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(ane_minicpmv4_vit_f16 * _Nullable model, NSError * _Nullable error))handler;
 
 /**
     Make a prediction using the standard interface
-    @param input an instance of ane_minicpm4v3b_vision_f16_b1Input to predict from
+    @param input an instance of ane_minicpmv4_vit_f16Input to predict from
     @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-    @return the prediction as ane_minicpm4v3b_vision_f16_b1Output
+    @return the prediction as ane_minicpmv4_vit_f16Output
 */
-- (nullable ane_minicpm4v3b_vision_f16_b1Output *)predictionFromFeatures:(ane_minicpm4v3b_vision_f16_b1Input *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+- (nullable ane_minicpmv4_vit_f16Output *)predictionFromFeatures:(ane_minicpmv4_vit_f16Input *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error;
 
 /**
     Make a prediction using the standard interface
-    @param input an instance of ane_minicpm4v3b_vision_f16_b1Input to predict from
+    @param input an instance of ane_minicpmv4_vit_f16Input to predict from
     @param options prediction options
     @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-    @return the prediction as ane_minicpm4v3b_vision_f16_b1Output
+    @return the prediction as ane_minicpmv4_vit_f16Output
 */
-- (nullable ane_minicpm4v3b_vision_f16_b1Output *)predictionFromFeatures:(ane_minicpm4v3b_vision_f16_b1Input *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+- (nullable ane_minicpmv4_vit_f16Output *)predictionFromFeatures:(ane_minicpmv4_vit_f16Input *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error;
 
 /**
     Make an asynchronous prediction using the standard interface
-    @param input an instance of ane_minicpm4v3b_vision_f16_b1Input to predict from
+    @param input an instance of ane_minicpmv4_vit_f16Input to predict from
     @param completionHandler a block that will be called upon completion of the prediction. error will be nil if no error occurred.
 */
-- (void)predictionFromFeatures:(ane_minicpm4v3b_vision_f16_b1Input *)input completionHandler:(void (^)(ane_minicpm4v3b_vision_f16_b1Output * _Nullable output, NSError * _Nullable error))completionHandler API_AVAILABLE(macos(14.0), ios(17.0), watchos(10.0), tvos(17.0)) __attribute__((visibility("hidden")));
+- (void)predictionFromFeatures:(ane_minicpmv4_vit_f16Input *)input completionHandler:(void (^)(ane_minicpmv4_vit_f16Output * _Nullable output, NSError * _Nullable error))completionHandler API_AVAILABLE(macos(14.0), ios(17.0), watchos(10.0), tvos(17.0)) __attribute__((visibility("hidden")));
 
 /**
     Make an asynchronous prediction using the standard interface
-    @param input an instance of ane_minicpm4v3b_vision_f16_b1Input to predict from
+    @param input an instance of ane_minicpmv4_vit_f16Input to predict from
     @param options prediction options
     @param completionHandler a block that will be called upon completion of the prediction. error will be nil if no error occurred.
 */
-- (void)predictionFromFeatures:(ane_minicpm4v3b_vision_f16_b1Input *)input options:(MLPredictionOptions *)options completionHandler:(void (^)(ane_minicpm4v3b_vision_f16_b1Output * _Nullable output, NSError * _Nullable error))completionHandler API_AVAILABLE(macos(14.0), ios(17.0), watchos(10.0), tvos(17.0)) __attribute__((visibility("hidden")));
+- (void)predictionFromFeatures:(ane_minicpmv4_vit_f16Input *)input options:(MLPredictionOptions *)options completionHandler:(void (^)(ane_minicpmv4_vit_f16Output * _Nullable output, NSError * _Nullable error))completionHandler API_AVAILABLE(macos(14.0), ios(17.0), watchos(10.0), tvos(17.0)) __attribute__((visibility("hidden")));
 
 /**
     Make a prediction using the convenience interface
     @param input 1 × 1024 × 1152 3-dimensional array of floats
     @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-    @return the prediction as ane_minicpm4v3b_vision_f16_b1Output
+    @return the prediction as ane_minicpmv4_vit_f16Output
 */
-- (nullable ane_minicpm4v3b_vision_f16_b1Output *)predictionFromInput:(MLMultiArray *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+- (nullable ane_minicpmv4_vit_f16Output *)predictionFromInput:(MLMultiArray *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error;
 
 /**
     Batch prediction
-    @param inputArray array of ane_minicpm4v3b_vision_f16_b1Input instances to obtain predictions from
+    @param inputArray array of ane_minicpmv4_vit_f16Input instances to obtain predictions from
     @param options prediction options
     @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-    @return the predictions as NSArray<ane_minicpm4v3b_vision_f16_b1Output *>
+    @return the predictions as NSArray<ane_minicpmv4_vit_f16Output *>
 */
-- (nullable NSArray<ane_minicpm4v3b_vision_f16_b1Output *> *)predictionsFromInputs:(NSArray<ane_minicpm4v3b_vision_f16_b1Input*> *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+- (nullable NSArray<ane_minicpmv4_vit_f16Output *> *)predictionsFromInputs:(NSArray<ane_minicpmv4_vit_f16Input*> *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error;
 @end
 
 NS_ASSUME_NONNULL_END
diff --git a/tools/mtmd/ane_minicpm4v3b_vision_f16_b1.m b/tools/mtmd/ane/ane_minicpmv4_vit_f16.m
similarity index 63%
rename from tools/mtmd/ane_minicpm4v3b_vision_f16_b1.m
rename to tools/mtmd/ane/ane_minicpmv4_vit_f16.m
index 14cb4e84ee169..75337a7b5b150 100644
--- a/tools/mtmd/ane_minicpm4v3b_vision_f16_b1.m
+++ b/tools/mtmd/ane/ane_minicpmv4_vit_f16.m
@@ -1,5 +1,5 @@
 //
-// ane_minicpm4v3b_vision_f16_b1.m
+// ane_minicpmv4_vit_f16.m
 //
 // This file was automatically generated and should not be edited.
 //
@@ -8,9 +8,9 @@
 #error This file must be compiled with automatic reference counting enabled (-fobjc-arc)
 #endif
 
-#import "ane_minicpm4v3b_vision_f16_b1.h"
+#import "ane_minicpmv4_vit_f16.h"
 
-@implementation ane_minicpm4v3b_vision_f16_b1Input
+@implementation ane_minicpmv4_vit_f16Input
 
 - (instancetype)initWithInput:(MLMultiArray *)input {
     self = [super init];
@@ -33,7 +33,7 @@ - (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName {
 
 @end
 
-@implementation ane_minicpm4v3b_vision_f16_b1Output
+@implementation ane_minicpmv4_vit_f16Output
 
 - (instancetype)initWithOutput:(MLMultiArray *)output {
     self = [super init];
@@ -56,23 +56,23 @@ - (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName {
 
 @end
 
-@implementation ane_minicpm4v3b_vision_f16_b1
+@implementation ane_minicpmv4_vit_f16
 
 
 /**
     URL of the underlying .mlmodelc directory.
 */
 + (nullable NSURL *)URLOfModelInThisBundle {
-    NSString *assetPath = [[NSBundle bundleForClass:[self class]] pathForResource:@"ane_minicpm4v3b_vision_f16_b1" ofType:@"mlmodelc"];
-    if (nil == assetPath) { os_log_error(OS_LOG_DEFAULT, "Could not load ane_minicpm4v3b_vision_f16_b1.mlmodelc in the bundle resource"); return nil; }
+    NSString *assetPath = [[NSBundle bundleForClass:[self class]] pathForResource:@"ane_minicpmv4_vit_f16" ofType:@"mlmodelc"];
+    if (nil == assetPath) { os_log_error(OS_LOG_DEFAULT, "Could not load ane_minicpmv4_vit_f16.mlmodelc in the bundle resource"); return nil; }
     return [NSURL fileURLWithPath:assetPath];
 }
 
 
 /**
-    Initialize ane_minicpm4v3b_vision_f16_b1 instance from an existing MLModel object.
+    Initialize ane_minicpmv4_vit_f16 instance from an existing MLModel object.
 
-    Usually the application does not use this initializer unless it makes a subclass of ane_minicpm4v3b_vision_f16_b1.
+    Usually the application does not use this initializer unless it makes a subclass of ane_minicpmv4_vit_f16.
     Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in.
 */
 - (instancetype)initWithMLModel:(MLModel *)model {
@@ -88,7 +88,7 @@ - (instancetype)initWithMLModel:(MLModel *)model {
 
 
 /**
-    Initialize ane_minicpm4v3b_vision_f16_b1 instance with the model in this bundle.
+    Initialize ane_minicpmv4_vit_f16 instance with the model in this bundle.
 */
 - (nullable instancetype)init {
     return [self initWithContentsOfURL:(NSURL * _Nonnull)self.class.URLOfModelInThisBundle error:nil];
@@ -96,7 +96,7 @@ - (nullable instancetype)init {
 
 
 /**
-    Initialize ane_minicpm4v3b_vision_f16_b1 instance with the model in this bundle.
+    Initialize ane_minicpmv4_vit_f16 instance with the model in this bundle.
 
     @param configuration The model configuration object
     @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
@@ -107,9 +107,9 @@ - (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configura
 
 
 /**
-    Initialize ane_minicpm4v3b_vision_f16_b1 instance from the model URL.
+    Initialize ane_minicpmv4_vit_f16 instance from the model URL.
 
-    @param modelURL URL to the .mlmodelc directory for ane_minicpm4v3b_vision_f16_b1.
+    @param modelURL URL to the .mlmodelc directory for ane_minicpmv4_vit_f16.
     @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
 */
 - (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error {
@@ -120,9 +120,9 @@ - (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError
 
 
 /**
-    Initialize ane_minicpm4v3b_vision_f16_b1 instance from the model URL.
+    Initialize ane_minicpmv4_vit_f16 instance from the model URL.
 
-    @param modelURL URL to the .mlmodelc directory for ane_minicpm4v3b_vision_f16_b1.
+    @param modelURL URL to the .mlmodelc directory for ane_minicpmv4_vit_f16.
     @param configuration The model configuration object
     @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
 */
@@ -134,13 +134,13 @@ - (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(
 
 
 /**
-    Construct ane_minicpm4v3b_vision_f16_b1 instance asynchronously with configuration.
+    Construct ane_minicpmv4_vit_f16 instance asynchronously with configuration.
     Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
 
     @param configuration The model configuration
-    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid ane_minicpm4v3b_vision_f16_b1 instance or NSError object.
+    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid ane_minicpmv4_vit_f16 instance or NSError object.
 */
-+ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(ane_minicpm4v3b_vision_f16_b1 * _Nullable model, NSError * _Nullable error))handler {
++ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(ane_minicpmv4_vit_f16 * _Nullable model, NSError * _Nullable error))handler {
     [self loadContentsOfURL:(NSURL * _Nonnull)[self URLOfModelInThisBundle]
               configuration:configuration
           completionHandler:handler];
@@ -148,20 +148,20 @@ + (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHa
 
 
 /**
-    Construct ane_minicpm4v3b_vision_f16_b1 instance asynchronously with URL of .mlmodelc directory and optional configuration.
+    Construct ane_minicpmv4_vit_f16 instance asynchronously with URL of .mlmodelc directory and optional configuration.
 
     Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
 
     @param modelURL The model URL.
     @param configuration The model configuration
-    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid ane_minicpm4v3b_vision_f16_b1 instance or NSError object.
+    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid ane_minicpmv4_vit_f16 instance or NSError object.
 */
-+ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(ane_minicpm4v3b_vision_f16_b1 * _Nullable model, NSError * _Nullable error))handler {
++ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(ane_minicpmv4_vit_f16 * _Nullable model, NSError * _Nullable error))handler {
     [MLModel loadContentsOfURL:modelURL
                  configuration:configuration
              completionHandler:^(MLModel *model, NSError *error) {
         if (model != nil) {
-            ane_minicpm4v3b_vision_f16_b1 *typedModel = [[ane_minicpm4v3b_vision_f16_b1 alloc] initWithMLModel:model];
+            ane_minicpmv4_vit_f16 *typedModel = [[ane_minicpmv4_vit_f16 alloc] initWithMLModel:model];
             handler(typedModel, nil);
         } else {
             handler(nil, error);
@@ -169,20 +169,20 @@ + (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration
     }];
 }
 
-- (nullable ane_minicpm4v3b_vision_f16_b1Output *)predictionFromFeatures:(ane_minicpm4v3b_vision_f16_b1Input *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+- (nullable ane_minicpmv4_vit_f16Output *)predictionFromFeatures:(ane_minicpmv4_vit_f16Input *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error {
     return [self predictionFromFeatures:input options:[[MLPredictionOptions alloc] init] error:error];
 }
 
-- (nullable ane_minicpm4v3b_vision_f16_b1Output *)predictionFromFeatures:(ane_minicpm4v3b_vision_f16_b1Input *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+- (nullable ane_minicpmv4_vit_f16Output *)predictionFromFeatures:(ane_minicpmv4_vit_f16Input *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
     id<MLFeatureProvider> outFeatures = [self.model predictionFromFeatures:input options:options error:error];
     if (!outFeatures) { return nil; }
-    return [[ane_minicpm4v3b_vision_f16_b1Output alloc] initWithOutput:(MLMultiArray *)[outFeatures featureValueForName:@"output"].multiArrayValue];
+    return [[ane_minicpmv4_vit_f16Output alloc] initWithOutput:(MLMultiArray *)[outFeatures featureValueForName:@"output"].multiArrayValue];
 }
 
-- (void)predictionFromFeatures:(ane_minicpm4v3b_vision_f16_b1Input *)input completionHandler:(void (^)(ane_minicpm4v3b_vision_f16_b1Output * _Nullable output, NSError * _Nullable error))completionHandler {
+- (void)predictionFromFeatures:(ane_minicpmv4_vit_f16Input *)input completionHandler:(void (^)(ane_minicpmv4_vit_f16Output * _Nullable output, NSError * _Nullable error))completionHandler {
     [self.model predictionFromFeatures:input completionHandler:^(id<MLFeatureProvider> prediction, NSError *predictionError) {
         if (prediction != nil) {
-            ane_minicpm4v3b_vision_f16_b1Output *output = [[ane_minicpm4v3b_vision_f16_b1Output alloc] initWithOutput:(MLMultiArray *)[prediction featureValueForName:@"output"].multiArrayValue];
+            ane_minicpmv4_vit_f16Output *output = [[ane_minicpmv4_vit_f16Output alloc] initWithOutput:(MLMultiArray *)[prediction featureValueForName:@"output"].multiArrayValue];
             completionHandler(output, predictionError);
         } else {
             completionHandler(nil, predictionError);
@@ -190,10 +190,10 @@ - (void)predictionFromFeatures:(ane_minicpm4v3b_vision_f16_b1Input *)input compl
     }];
 }
 
-- (void)predictionFromFeatures:(ane_minicpm4v3b_vision_f16_b1Input *)input options:(MLPredictionOptions *)options completionHandler:(void (^)(ane_minicpm4v3b_vision_f16_b1Output * _Nullable output, NSError * _Nullable error))completionHandler {
+- (void)predictionFromFeatures:(ane_minicpmv4_vit_f16Input *)input options:(MLPredictionOptions *)options completionHandler:(void (^)(ane_minicpmv4_vit_f16Output * _Nullable output, NSError * _Nullable error))completionHandler {
     [self.model predictionFromFeatures:input options:options completionHandler:^(id<MLFeatureProvider> prediction, NSError *predictionError) {
         if (prediction != nil) {
-            ane_minicpm4v3b_vision_f16_b1Output *output = [[ane_minicpm4v3b_vision_f16_b1Output alloc] initWithOutput:(MLMultiArray *)[prediction featureValueForName:@"output"].multiArrayValue];
+            ane_minicpmv4_vit_f16Output *output = [[ane_minicpmv4_vit_f16Output alloc] initWithOutput:(MLMultiArray *)[prediction featureValueForName:@"output"].multiArrayValue];
             completionHandler(output, predictionError);
         } else {
             completionHandler(nil, predictionError);
@@ -201,19 +201,19 @@ - (void)predictionFromFeatures:(ane_minicpm4v3b_vision_f16_b1Input *)input optio
     }];
 }
 
-- (nullable ane_minicpm4v3b_vision_f16_b1Output *)predictionFromInput:(MLMultiArray *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error {
-    ane_minicpm4v3b_vision_f16_b1Input *input_ = [[ane_minicpm4v3b_vision_f16_b1Input alloc] initWithInput:input];
+- (nullable ane_minicpmv4_vit_f16Output *)predictionFromInput:(MLMultiArray *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    ane_minicpmv4_vit_f16Input *input_ = [[ane_minicpmv4_vit_f16Input alloc] initWithInput:input];
     return [self predictionFromFeatures:input_ error:error];
 }
 
-- (nullable NSArray<ane_minicpm4v3b_vision_f16_b1Output *> *)predictionsFromInputs:(NSArray<ane_minicpm4v3b_vision_f16_b1Input*> *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+- (nullable NSArray<ane_minicpmv4_vit_f16Output *> *)predictionsFromInputs:(NSArray<ane_minicpmv4_vit_f16Input*> *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
     id<MLBatchProvider> inBatch = [[MLArrayBatchProvider alloc] initWithFeatureProviderArray:inputArray];
     id<MLBatchProvider> outBatch = [self.model predictionsFromBatch:inBatch options:options error:error];
     if (!outBatch) { return nil; }
-    NSMutableArray<ane_minicpm4v3b_vision_f16_b1Output*> *results = [NSMutableArray arrayWithCapacity:(NSUInteger)outBatch.count];
+    NSMutableArray<ane_minicpmv4_vit_f16Output*> *results = [NSMutableArray arrayWithCapacity:(NSUInteger)outBatch.count];
     for (NSInteger i = 0; i < outBatch.count; i++) {
         id<MLFeatureProvider> resultProvider = [outBatch featuresAtIndex:i];
-        ane_minicpm4v3b_vision_f16_b1Output * result = [[ane_minicpm4v3b_vision_f16_b1Output alloc] initWithOutput:(MLMultiArray *)[resultProvider featureValueForName:@"output"].multiArrayValue];
+        ane_minicpmv4_vit_f16Output * result = [[ane_minicpmv4_vit_f16Output alloc] initWithOutput:(MLMultiArray *)[resultProvider featureValueForName:@"output"].multiArrayValue];
         [results addObject:result];
     }
     return results;
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 5b8285190d7e7..07adb02543323 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -10,7 +10,9 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 #include "gguf.h"
-#include "ane.h"
+#ifdef __APPLE__
+#include "ane/ane.h"
+#endif
 
 #include <cassert>
 #include <cmath>
@@ -3186,7 +3188,7 @@ struct llava_uhd {
         bool has_slices    = original_size.width > slice_size || original_size.height > slice_size;
         const bool has_pinpoints = !ctx->model.hparams.image_res_candidates.empty();
 
-        has_slices = false;
+        // has_slices = false;
         if (!has_slices) {
             // skip slicing logic
             res.overview_size = clip_image_size{slice_size, slice_size};
@@ -3800,6 +3802,7 @@ static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, co
     return pos_embed_2d;
 }
 
+#ifdef __APPLE__
 static bool clip_image_encode_ane(float * data, float * vec) {
 
     static int flag = 0;
@@ -3810,6 +3813,7 @@ static bool clip_image_encode_ane(float * data, float * vec) {
     }
     predictWith(coremlEncoder, data, vec);
 }
+#endif
 
 bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
     clip_image_f32_batch imgs;
@@ -3817,6 +3821,7 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3
     *img_copy = *img;
     imgs.entries.push_back(std::move(img_copy));
 
+#ifdef __APPLE__
     bool ios_ctx = true;
     if (ios_ctx){
         printf("clip use ane\n");
@@ -3830,10 +3835,12 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3
         free(vit_embedding2);
         return true;
     }
+#endif
 
     return clip_image_batch_encode(ctx, n_threads, &imgs, vec);
 }
 
+#ifdef __APPLE__
 bool ane_embedding(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) {
     const clip_image_f32_batch & imgs = *imgs_c_ptr;
     int batch_size = imgs.entries.size();
@@ -4137,7 +4144,7 @@ bool ane_resampler(clip_ctx * ctx, const int n_threads, const clip_image_f32_bat
 
     return true;
 }
-
+#endif
 
 bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) {
     const clip_image_f32_batch & imgs = *imgs_c_ptr;
diff --git a/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py b/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py
index de08bb5000ceb..d5a83dcf5647e 100644
--- a/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py
+++ b/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py
@@ -767,13 +767,17 @@ def _replace_name(s, v):
     if re.match("vision_model.embeddings.position_embedding", s):
         v = v.unsqueeze(0)
         return {s: v}
-
-    return {s: v}
+    print(s)
+    if "emb" in s:
+        return {s: v}
+    return None
 
 state_dict = model.state_dict()
 new_state_dict = {}
 for k, v in state_dict.items():
     kvs = _replace_name(k, v)
+    if kvs is None:
+        continue
     for nk, nv in kvs.items():
         new_state_dict[nk] = nv
 state_dict = new_state_dict
diff --git a/tools/mtmd/mtmd-ios-example.cpp b/tools/mtmd/mtmd-ios-example.cpp
index f4ba74c0ff0e3..fd7503b5f0d7c 100644
--- a/tools/mtmd/mtmd-ios-example.cpp
+++ b/tools/mtmd/mtmd-ios-example.cpp
@@ -5,8 +5,8 @@
 
 void example_multiple_images_progressive() {
     mtmd_ios_params params = mtmd_ios_params_default();
-    params.model_path = "/Users/tianchi/code/project/4o/3b/MiniCPM-4v-3b/model/ggml-model-Q4_0.gguf";
-    params.mmproj_path = "/Users/tianchi/code/project/4o/3b/MiniCPM-4v-3b/mmproj-model-f16.gguf";
+    params.model_path = "/Users/tianchi/code/tc_mb/deployment/gguf/MiniCPM-V-4-gguf/ggml-model-Q4_0.gguf";
+    params.mmproj_path = "/Users/tianchi/code/tc_mb/deployment/gguf/MiniCPM-V-4-gguf/mmproj-model-f16.gguf";
     params.n_predict = 100;  // 增加生成长度
     params.temperature = 0.6f;
     

From 864d0130cce25782edb53fe3eedcf0e84bb7dfca Mon Sep 17 00:00:00 2001
From: tc-mb <caitianchi@modelbest.cn>
Date: Tue, 5 Aug 2025 20:28:44 +0800
Subject: [PATCH 11/15] update comments

---
 tools/mtmd/CMakeLists.txt | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt
index c2643875664e0..c9c916f61ef80 100644
--- a/tools/mtmd/CMakeLists.txt
+++ b/tools/mtmd/CMakeLists.txt
@@ -15,7 +15,7 @@ add_library(mtmd
             mtmd-ios.h
             )
 
-# 在Apple平台上添加ANE相关文件
+# Add ANE related files on Apple platforms
 if(APPLE)
     target_sources(mtmd PRIVATE
         ane/ane.h
@@ -24,7 +24,7 @@ if(APPLE)
         ane/ane_minicpmv4_vit_f16.m
     )
     
-    # 为Objective-C文件启用ARC
+    # Enable ARC for Objective-C files
     set_source_files_properties(ane/ane.mm PROPERTIES COMPILE_FLAGS "-fobjc-arc")
     set_source_files_properties(ane/ane_minicpmv4_vit_f16.m PROPERTIES COMPILE_FLAGS "-fobjc-arc")
 endif()
@@ -40,7 +40,7 @@ target_include_directories(mtmd PRIVATE ../../src)
 target_include_directories(mtmd PRIVATE ../../vendor)
 target_compile_features   (mtmd PRIVATE cxx_std_17)
 
-# 在Apple平台上链接CoreML和Accelerate框架
+# Link CoreML and Accelerate frameworks on Apple platforms
 if(APPLE)
     target_link_libraries(mtmd PRIVATE 
         "-framework Foundation" 
@@ -62,7 +62,7 @@ set(MTMD_PUBLIC_HEADERS
     ${CMAKE_CURRENT_SOURCE_DIR}/mtmd-ios.h
     )
 
-# 在Apple平台上添加ANE公共头文件
+# Add ANE public headers on Apple platforms
 if(APPLE)
     list(APPEND MTMD_PUBLIC_HEADERS
         ${CMAKE_CURRENT_SOURCE_DIR}/ane/ane.h
@@ -97,7 +97,7 @@ install                (TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .)
 target_link_libraries  (${TARGET} PRIVATE common mtmd Threads::Threads)
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
-# iOS接口测试
+# iOS interface testing
 set(TARGET_IOS llama-mtmd-ios-test)
 add_executable         (${TARGET_IOS} mtmd-ios-example.cpp)
 set_target_properties  (${TARGET_IOS} PROPERTIES OUTPUT_NAME llama-mtmd-ios-test)
@@ -111,13 +111,13 @@ target_include_directories(${TARGET_IOS} PRIVATE ../../vendor)
 target_link_libraries  (${TARGET_IOS} PRIVATE mtmd common llama ggml Threads::Threads)
 target_compile_features(${TARGET_IOS} PRIVATE cxx_std_17)
 
-# 确保依赖库先构建
+# Ensure dependent libraries are built first
 add_dependencies(${TARGET_IOS} mtmd)
 if (TARGET common)
     add_dependencies(${TARGET_IOS} common)
 endif()
 
-# 在Apple平台上为iOS测试添加额外的链接设置
+# Add additional linking settings for iOS testing on Apple platforms
 if(APPLE)
     target_link_libraries(${TARGET_IOS} PRIVATE 
         "-framework Foundation" 

From 54258e9cf66bbd429245492874f85f68888fd8d2 Mon Sep 17 00:00:00 2001
From: tc-mb <caitianchi@modelbest.cn>
Date: Tue, 12 Aug 2025 16:02:25 +0800
Subject: [PATCH 12/15] optimized interface

---
 common/arg.cpp            |  8 ++++++++
 common/common.h           |  3 +++
 tools/mtmd/CMakeLists.txt | 19 +++++++++++--------
 tools/mtmd/ane/ane.h      |  2 +-
 tools/mtmd/ane/ane.mm     | 36 ++++++++++++++++++++++++------------
 tools/mtmd/clip.cpp       | 29 +++++++++++++++++++++++++----
 tools/mtmd/clip.h         |  3 +++
 tools/mtmd/mtmd-cli.cpp   |  1 +
 tools/mtmd/mtmd-ios.cpp   |  5 +++++
 tools/mtmd/mtmd-ios.h     |  1 +
 tools/mtmd/mtmd.cpp       |  6 ++++++
 tools/mtmd/mtmd.h         |  1 +
 12 files changed, 89 insertions(+), 25 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 40af7e574830f..7d56c683d1b38 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -960,6 +960,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         for (auto & ex : mmproj_examples) {
             if (ctx_arg.ex == ex) {
                 common_params_handle_model(params.mmproj,    params.hf_token, "", params.offline);
+                common_params_handle_model(params.ane,       params.hf_token, "", params.offline);
                 break;
             }
         }
@@ -2243,6 +2244,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.mmproj_use_gpu = false;
         }
     ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD"));
+    add_opt(common_arg(
+        {"--ane"}, "FILE",
+        "path to Apple Neural Engine model file for iOS",
+        [](common_params & params, const std::string & value) {
+            params.ane.path = value;
+        }
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_ANE"));
     add_opt(common_arg(
         {"--image", "--audio"}, "FILE",
         "path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n",
diff --git a/common/common.h b/common/common.h
index 8922090e7b10d..473aa28be62ad 100644
--- a/common/common.h
+++ b/common/common.h
@@ -353,6 +353,9 @@ struct common_params {
     bool mmproj_use_gpu = true;     // use GPU for multimodal model
     bool no_mmproj = false;         // explicitly disable multimodal model
     std::vector<std::string> image; // path to image file(s)
+    
+    // Apple Neural Engine support
+    struct common_params_model ane;
 
     // embedding
     bool embedding         = false; // get only sentence embedding
diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt
index c9c916f61ef80..94b8fc009876a 100644
--- a/tools/mtmd/CMakeLists.txt
+++ b/tools/mtmd/CMakeLists.txt
@@ -2,6 +2,9 @@
 
 find_package(Threads REQUIRED)
 
+# ANE support option
+option(ENABLE_ANE "Enable Apple Neural Engine support" OFF)
+
 add_library(mtmd
             mtmd.cpp
             mtmd-audio.cpp
@@ -15,8 +18,8 @@ add_library(mtmd
             mtmd-ios.h
             )
 
-# Add ANE related files on Apple platforms
-if(APPLE)
+# Add ANE related files when enabled
+if(ENABLE_ANE)
     target_sources(mtmd PRIVATE
         ane/ane.h
         ane/ane.mm
@@ -40,8 +43,8 @@ target_include_directories(mtmd PRIVATE ../../src)
 target_include_directories(mtmd PRIVATE ../../vendor)
 target_compile_features   (mtmd PRIVATE cxx_std_17)
 
-# Link CoreML and Accelerate frameworks on Apple platforms
-if(APPLE)
+# Link CoreML and Accelerate frameworks when ANE is enabled
+if(ENABLE_ANE)
     target_link_libraries(mtmd PRIVATE 
         "-framework Foundation" 
         "-framework CoreML" 
@@ -62,8 +65,8 @@ set(MTMD_PUBLIC_HEADERS
     ${CMAKE_CURRENT_SOURCE_DIR}/mtmd-ios.h
     )
 
-# Add ANE public headers on Apple platforms
-if(APPLE)
+# Add ANE public headers when enabled
+if(ENABLE_ANE)
     list(APPEND MTMD_PUBLIC_HEADERS
         ${CMAKE_CURRENT_SOURCE_DIR}/ane/ane.h
     )
@@ -117,8 +120,8 @@ if (TARGET common)
     add_dependencies(${TARGET_IOS} common)
 endif()
 
-# Add additional linking settings for iOS testing on Apple platforms
-if(APPLE)
+# Add additional linking settings for iOS testing when ANE is enabled
+if(ENABLE_ANE)
     target_link_libraries(${TARGET_IOS} PRIVATE 
         "-framework Foundation" 
         "-framework CoreML" 
diff --git a/tools/mtmd/ane/ane.h b/tools/mtmd/ane/ane.h
index 7fe7ad5c9a347..e129122de2f35 100644
--- a/tools/mtmd/ane/ane.h
+++ b/tools/mtmd/ane/ane.h
@@ -2,7 +2,7 @@
 extern "C" {
 #endif
 
-const void* loadModel();
+const void* loadModel(const char* model_path);
 void closeModel(const void* model);
 void predictWith(const void* model, float* embed, float* encoderOutput);
 
diff --git a/tools/mtmd/ane/ane.mm b/tools/mtmd/ane/ane.mm
index 0063604341a0d..1c86f36df9dfd 100644
--- a/tools/mtmd/ane/ane.mm
+++ b/tools/mtmd/ane/ane.mm
@@ -8,18 +8,30 @@
 extern "C" {
 #endif
 
-const void* loadModel() {
-    // 新的，从 documents directionary 中加载 begin
-    // 获取文件管理器实例
-    NSFileManager *fileManager = [NSFileManager defaultManager];
-    // 获取应用的 Documents 目录的 URL
-    NSURL *documentsURL = [[fileManager URLsForDirectory:NSDocumentDirectory inDomains:NSUserDomainMask] firstObject];
-    NSString *pathString = [documentsURL.absoluteString stringByAppendingString:@"ane_minicpmv4_vit_f16.mlmodelc"];
-    NSURL *modelURL = [NSURL URLWithString:pathString];
-
-    NSLog(modelURL.absoluteString);
-
-    const void* model = CFBridgingRetain([[ane_minicpmv4_vit_f16 alloc] initWithContentsOfURL:modelURL error:nil]);
+const void* loadModel(const char* model_path) {
+    if (!model_path) {
+        NSLog(@"Error: model_path is null");
+        return nullptr;
+    }
+    
+    NSString *pathString = [NSString stringWithUTF8String:model_path];
+    NSURL *modelURL = [NSURL fileURLWithPath:pathString];
+    
+    NSLog(@"Loading ANE model from: %@", modelURL.absoluteString);
+    
+    NSError *error = nil;
+    const void* model = CFBridgingRetain([[ane_minicpmv4_vit_f16 alloc] initWithContentsOfURL:modelURL error:&error]);
+    
+    if (error) {
+        NSLog(@"Error loading ANE model: %@", error.localizedDescription);
+        return nullptr;
+    }
+    
+    if (!model) {
+        NSLog(@"Error: Failed to create ANE model instance");
+        return nullptr;
+    }
+    
     return model;
 }
 
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 07adb02543323..72f62387584cc 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -380,6 +380,9 @@ struct clip_ctx {
     // for debugging
     bool debug_graph = false;
     std::vector<ggml_tensor *> debug_print_tensors;
+    
+    // ANE model path for iOS
+    std::string ane_model_path;
 
     clip_ctx(clip_context_params & ctx_params) {
         debug_graph = std::getenv("MTMD_DEBUG_GRAPH") != nullptr;
@@ -3803,15 +3806,27 @@ static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, co
 }
 
 #ifdef __APPLE__
-static bool clip_image_encode_ane(float * data, float * vec) {
+static bool clip_image_encode_ane(float * data, float * vec, const char* ane_model_path) {
 
     static int flag = 0;
     static const void* coremlEncoder = NULL;
-    if (flag == 0) {
-        coremlEncoder = loadModel();
+    static std::string cached_model_path = "";
+    
+    // Check if we need to load a new model
+    if (flag == 0 || (ane_model_path && cached_model_path != ane_model_path)) {
+        if (coremlEncoder) {
+            closeModel(coremlEncoder);
+        }
+        coremlEncoder = loadModel(ane_model_path);
+        if (!coremlEncoder) {
+            printf("Failed to load ANE model from: %s\n", ane_model_path ? ane_model_path : "null");
+            return false;
+        }
+        cached_model_path = ane_model_path ? ane_model_path : "";
         flag = 1;
     }
     predictWith(coremlEncoder, data, vec);
+    return true;
 }
 #endif
 
@@ -3829,7 +3844,7 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3
         float * vit_embedding2 = (float *)malloc(1100*1152*sizeof(float));
 
         ane_embedding(ctx, n_threads, &imgs, vit_embedding1);
-        clip_image_encode_ane(vit_embedding1, vit_embedding2);
+        clip_image_encode_ane(vit_embedding1, vit_embedding2, ctx->ane_model_path.c_str());
         ane_resampler(ctx, n_threads, &imgs, vit_embedding2, vec);
         free(vit_embedding1);
         free(vit_embedding2);
@@ -4634,3 +4649,9 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel
     batch->entries.push_back(clip_image_f32_ptr(audio));
     batch->is_audio = true;
 }
+
+void clip_set_ane_model_path(struct clip_ctx * ctx, const char * ane_model_path) {
+    if (ctx && ane_model_path) {
+        ctx->ane_model_path = ane_model_path;
+    }
+}
diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h
index 25ad15fd53687..3416839d7e72f 100644
--- a/tools/mtmd/clip.h
+++ b/tools/mtmd/clip.h
@@ -112,3 +112,6 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel
 bool clip_has_vision_encoder(const struct clip_ctx * ctx);
 bool clip_has_audio_encoder(const struct clip_ctx * ctx);
 bool clip_has_whisper_encoder(const struct clip_ctx * ctx);
+
+// ANE support functions
+void clip_set_ane_model_path(struct clip_ctx * ctx, const char * ane_model_path);
diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp
index 599e682e0f894..b2bc16b450116 100644
--- a/tools/mtmd/mtmd-cli.cpp
+++ b/tools/mtmd/mtmd-cli.cpp
@@ -132,6 +132,7 @@ struct mtmd_cli_context {
         mparams.print_timings = true;
         mparams.n_threads = params.cpuparams.n_threads;
         mparams.verbosity = params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO;
+        mparams.ane_model_path = params.ane.path.empty() ? nullptr : params.ane.path.c_str();
         ctx_vision.reset(mtmd_init_from_file(clip_path, model, mparams));
         if (!ctx_vision.get()) {
             LOG_ERR("Failed to load vision model from %s\n", clip_path);
diff --git a/tools/mtmd/mtmd-ios.cpp b/tools/mtmd/mtmd-ios.cpp
index bf8de2ac1cb65..554235d02c9ca 100644
--- a/tools/mtmd/mtmd-ios.cpp
+++ b/tools/mtmd/mtmd-ios.cpp
@@ -61,11 +61,14 @@ mtmd_ios_params mtmd_ios_params_default(void) {
     mtmd_ios_params params = {};
     params.model_path = "";
     params.mmproj_path = "";
+    params.ane_path = "";
     params.n_predict = -1;
     params.n_ctx = 4096;
     params.n_threads = 4;
     params.temperature = 0.2f;
     params.use_gpu = true;
+    params.mmproj_use_gpu = true;
+    params.warmup = true;
     return params;
 }
 
@@ -86,6 +89,7 @@ mtmd_ios_context* mtmd_ios_init(const mtmd_ios_params* params) {
     common_params common_params;
     common_params.model.path = params->model_path;
     common_params.mmproj.path = params->mmproj_path;
+    common_params.ane.path = params->ane_path;
     common_params.n_ctx = params->n_ctx;
     common_params.n_batch = 2048;  // 增加batch大小，与标准mtmd保持一致
     common_params.cpuparams.n_threads = params->n_threads;
@@ -132,6 +136,7 @@ mtmd_ios_context* mtmd_ios_init(const mtmd_ios_params* params) {
     mparams.print_timings = false;
     mparams.n_threads = params->n_threads;
     mparams.verbosity = GGML_LOG_LEVEL_INFO;
+    mparams.ane_model_path = params->ane_path.empty() ? nullptr : params->ane_path.c_str();
     
     ctx->ctx_vision.reset(mtmd_init_from_file(params->mmproj_path.c_str(), ctx->model, mparams));
     if (!ctx->ctx_vision.get()) {
diff --git a/tools/mtmd/mtmd-ios.h b/tools/mtmd/mtmd-ios.h
index 92a1f0e47c627..e8ed92f3f59bf 100644
--- a/tools/mtmd/mtmd-ios.h
+++ b/tools/mtmd/mtmd-ios.h
@@ -15,6 +15,7 @@ typedef struct mtmd_ios_context mtmd_ios_context;
 typedef struct mtmd_ios_params {
     std::string model_path;
     std::string mmproj_path;
+    std::string ane_path;
     int         n_predict;
     int         n_ctx;
     int         n_threads;
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index 66553f838bd86..c800df7457e04 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -91,6 +91,7 @@ mtmd_context_params mtmd_context_params_default() {
     params.verbosity = GGML_LOG_LEVEL_INFO;
     params.image_marker = MTMD_DEFAULT_IMAGE_MARKER;
     params.media_marker = mtmd_default_marker();
+    params.ane_model_path = nullptr;
     return params;
 }
 
@@ -155,6 +156,11 @@ struct mtmd_context {
         auto res = clip_init(mmproj_fname, ctx_clip_params);
         ctx_v = res.ctx_v;
         ctx_a = res.ctx_a;
+        
+        // Set ANE model path for iOS
+        if (ctx_params.ane_model_path && ctx_v) {
+            clip_set_ane_model_path(ctx_v, ctx_params.ane_model_path);
+        }
         if (!ctx_v && !ctx_a) {
             throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));
         }
diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h
index f4ea07d3ad521..331992e76f43e 100644
--- a/tools/mtmd/mtmd.h
+++ b/tools/mtmd/mtmd.h
@@ -82,6 +82,7 @@ struct mtmd_context_params {
     enum ggml_log_level verbosity;
     const char * image_marker; // deprecated, use media_marker instead
     const char * media_marker;
+    const char * ane_model_path; // path to ANE model for iOS
 };
 
 MTMD_API const char * mtmd_default_marker(void);

From 629b625ae86a913feab1852a42a6d8c492157d4d Mon Sep 17 00:00:00 2001
From: tc-mb <caitianchi@modelbest.cn>
Date: Tue, 12 Aug 2025 16:13:19 +0800
Subject: [PATCH 13/15] merge ANE support first, temporarily remove iOS app support

---
 tools/mtmd/CMakeLists.txt       |  33 ----
 tools/mtmd/mtmd-ios-example.cpp | 121 ------------
 tools/mtmd/mtmd-ios.cpp         | 315 --------------------------------
 tools/mtmd/mtmd-ios.h           |  88 ---------
 4 files changed, 557 deletions(-)
 delete mode 100644 tools/mtmd/mtmd-ios-example.cpp
 delete mode 100644 tools/mtmd/mtmd-ios.cpp
 delete mode 100644 tools/mtmd/mtmd-ios.h

diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt
index 94b8fc009876a..7a454ad1ad4df 100644
--- a/tools/mtmd/CMakeLists.txt
+++ b/tools/mtmd/CMakeLists.txt
@@ -14,8 +14,6 @@ add_library(mtmd
             clip-impl.h
             mtmd-helper.cpp
             mtmd-helper.h
-            mtmd-ios.cpp
-            mtmd-ios.h
             )
 
 # Add ANE related files when enabled
@@ -62,7 +60,6 @@ endif()
 set(MTMD_PUBLIC_HEADERS
     ${CMAKE_CURRENT_SOURCE_DIR}/mtmd.h
     ${CMAKE_CURRENT_SOURCE_DIR}/mtmd-helper.h
-    ${CMAKE_CURRENT_SOURCE_DIR}/mtmd-ios.h
     )
 
 # Add ANE public headers when enabled
@@ -99,33 +96,3 @@ set_target_properties  (${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli)
 install                (TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .)
 target_link_libraries  (${TARGET} PRIVATE common mtmd Threads::Threads)
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
-
-# iOS interface testing
-set(TARGET_IOS llama-mtmd-ios-test)
-add_executable         (${TARGET_IOS} mtmd-ios-example.cpp)
-set_target_properties  (${TARGET_IOS} PROPERTIES OUTPUT_NAME llama-mtmd-ios-test)
-target_include_directories(${TARGET_IOS} PRIVATE .)
-target_include_directories(${TARGET_IOS} PRIVATE ../..)
-target_include_directories(${TARGET_IOS} PRIVATE ../../common)
-target_include_directories(${TARGET_IOS} PRIVATE ../../include)
-target_include_directories(${TARGET_IOS} PRIVATE ../../ggml/include)
-target_include_directories(${TARGET_IOS} PRIVATE ../../src)
-target_include_directories(${TARGET_IOS} PRIVATE ../../vendor)
-target_link_libraries  (${TARGET_IOS} PRIVATE mtmd common llama ggml Threads::Threads)
-target_compile_features(${TARGET_IOS} PRIVATE cxx_std_17)
-
-# Ensure dependent libraries are built first
-add_dependencies(${TARGET_IOS} mtmd)
-if (TARGET common)
-    add_dependencies(${TARGET_IOS} common)
-endif()
-
-# Add additional linking settings for iOS testing when ANE is enabled
-if(ENABLE_ANE)
-    target_link_libraries(${TARGET_IOS} PRIVATE 
-        "-framework Foundation" 
-        "-framework CoreML" 
-        "-framework Accelerate"
-        "-ObjC"
-    )
-endif()
diff --git a/tools/mtmd/mtmd-ios-example.cpp b/tools/mtmd/mtmd-ios-example.cpp
deleted file mode 100644
index fd7503b5f0d7c..0000000000000
--- a/tools/mtmd/mtmd-ios-example.cpp
+++ /dev/null
@@ -1,121 +0,0 @@
-#include "mtmd-ios.h"
-#include <iostream>
-#include <string>
-#include <cstdlib>
-
-void example_multiple_images_progressive() {
-    mtmd_ios_params params = mtmd_ios_params_default();
-    params.model_path = "/Users/tianchi/code/tc_mb/deployment/gguf/MiniCPM-V-4-gguf/ggml-model-Q4_0.gguf";
-    params.mmproj_path = "/Users/tianchi/code/tc_mb/deployment/gguf/MiniCPM-V-4-gguf/mmproj-model-f16.gguf";
-    params.n_predict = 100;  // 增加生成长度
-    params.temperature = 0.6f;
-    
-    mtmd_ios_context* ctx = mtmd_ios_init(&params);
-    if (!ctx) {
-        std::cerr << "Failed to initialize context\n";
-        return;
-    }
-    
-    std::cout << "=== 多轮多模态对话示例 ===\n";
-    std::cout << "命令说明：\n";
-    std::cout << "  /image <路径>  - 添加图片\n";
-    std::cout << "  /text <内容>   - 添加文本\n";
-    std::cout << "  /generate     - 生成响应\n";
-    std::cout << "  /quit         - 退出\n";
-    std::cout << "=============================\n\n";
-    
-    std::string input;
-    bool has_content = false;  // 跟踪是否有内容可以生成
-    
-    while (true) {
-        std::cout << "> ";
-        std::getline(std::cin, input);
-        
-        if (input.empty()) {
-            continue;
-        }
-        
-        if (input == "/quit") {
-            break;
-        }
-        
-        if (input == "/generate") {
-            if (!has_content) {
-                std::cout << "请先添加图片或文本内容\n";
-                continue;
-            }
-            
-            std::cout << "Assistant: ";
-            int token_count = 0;
-            while (true) {
-                mtmd_ios_token result = mtmd_ios_loop(ctx);
-                
-                if (result.is_end) {
-                    std::cout << "\n[生成完成 - " << token_count << " tokens]\n\n";
-                    break;
-                }
-                
-                if (result.token) {
-                    std::cout << result.token;
-                    std::cout.flush();
-                    mtmd_ios_string_free(result.token);
-                    token_count++;
-                }
-            }  
-            
-            has_content = false;  // 重置内容标志
-            continue;
-        }
-        
-        if (input.find("/image ") == 0) {
-            std::string image_path = input.substr(7);
-            if (image_path.empty()) {
-                std::cout << "请提供图片路径\n";
-                continue;
-            }
-            
-            std::cout << "正在加载图片: " << image_path << "\n";
-            if (mtmd_ios_prefill_image(ctx, image_path.c_str()) != 0) {
-                std::cerr << "Failed to load image: " << mtmd_ios_get_last_error(ctx) << "\n";
-            } else {
-                std::cout << "图片加载成功\n";
-                has_content = true;
-            }
-            continue;
-        }
-        
-        if (input.find("/text ") == 0) {
-            std::string text = input.substr(6);
-            if (text.empty()) {
-                std::cout << "请提供文本内容\n";
-                continue;
-            }
-            
-            std::cout << "正在添加文本: " << text << "\n";
-            if (mtmd_ios_prefill_text(ctx, text.c_str(), "user") != 0) {
-                std::cerr << "Failed to add text: " << mtmd_ios_get_last_error(ctx) << "\n";
-            } else {
-                std::cout << "文本添加成功\n";
-                has_content = true;
-            }
-            continue;
-        }
-        
-        // 如果不是命令，当作文本处理
-        std::cout << "正在添加文本: " << input << "\n";
-        if (mtmd_ios_prefill_text(ctx, input.c_str(), "user") != 0) {
-            std::cerr << "Failed to add text: " << mtmd_ios_get_last_error(ctx) << "\n";
-        } else {
-            std::cout << "文本添加成功\n";
-            has_content = true;
-        }
-    }
-    
-    std::cout << "对话结束\n";
-    mtmd_ios_free(ctx);
-}
-
-int main() {
-    example_multiple_images_progressive(); 
-    return 0;
-} 
\ No newline at end of file
diff --git a/tools/mtmd/mtmd-ios.cpp b/tools/mtmd/mtmd-ios.cpp
deleted file mode 100644
index 554235d02c9ca..0000000000000
--- a/tools/mtmd/mtmd-ios.cpp
+++ /dev/null
@@ -1,315 +0,0 @@
-#include "mtmd-ios.h"
-#include "arg.h"
-#include "log.h"
-#include "common.h"
-#include "sampling.h"
-#include "llama.h"
-#include "ggml.h"
-#include "chat.h"
-#include "mtmd.h"
-#include "mtmd-helper.h"
-
-#include <vector>
-#include <string>
-#include <limits.h>
-#include <cinttypes>
-#include <memory>
-#include <cstring>
-#include <cstdlib>
-#include <iostream>
-
-struct mtmd_ios_context {
-    mtmd::context_ptr ctx_vision;
-    common_init_result llama_init;
-    
-    llama_model* model;
-    llama_context* lctx;
-    const llama_vocab* vocab;
-    common_sampler* smpl;
-    llama_batch batch;
-    
-    mtmd::bitmaps bitmaps;
-    common_chat_templates_ptr tmpls;
-    
-    int n_threads;
-    llama_pos n_past;
-    int n_predict;
-    
-    std::string last_error;
-    
-    ~mtmd_ios_context() {
-        if (batch.token) {
-            llama_batch_free(batch);
-        }
-        if (smpl) {
-            common_sampler_free(smpl);
-        }
-    }
-};
-
-void mtmd_ios_string_free(char* str) {
-    if (str) {
-        free(str);
-    }
-}
-
-static void set_error(mtmd_ios_context* ctx, const std::string& error) {
-    ctx->last_error = error;
-}
-
-mtmd_ios_params mtmd_ios_params_default(void) {
-    mtmd_ios_params params = {};
-    params.model_path = "";
-    params.mmproj_path = "";
-    params.ane_path = "";
-    params.n_predict = -1;
-    params.n_ctx = 4096;
-    params.n_threads = 4;
-    params.temperature = 0.2f;
-    params.use_gpu = true;
-    params.mmproj_use_gpu = true;
-    params.warmup = true;
-    return params;
-}
-
-mtmd_ios_context* mtmd_ios_init(const mtmd_ios_params* params) {
-    if (!params || params->model_path.empty() || params->mmproj_path.empty()) {
-        return nullptr;
-    }
-    
-    ggml_time_init();
-    common_init();
-    
-    auto ctx = std::make_unique<mtmd_ios_context>();
-    
-    ctx->n_predict = params->n_predict;
-    ctx->n_threads = params->n_threads;
-    ctx->n_past = 0;
-    
-    common_params common_params;
-    common_params.model.path = params->model_path;
-    common_params.mmproj.path = params->mmproj_path;
-    common_params.ane.path = params->ane_path;
-    common_params.n_ctx = params->n_ctx;
-    common_params.n_batch = 2048;  // 增加batch大小，与标准mtmd保持一致
-    common_params.cpuparams.n_threads = params->n_threads;
-    common_params.sampling.temp = params->temperature;
-    common_params.mmproj_use_gpu = params->mmproj_use_gpu;
-
-    ctx->llama_init = common_init_from_params(common_params);
-    
-    ctx->model = ctx->llama_init.model.get();
-    ctx->lctx = ctx->llama_init.context.get();
-    
-    if (!ctx->model || !ctx->lctx) {
-        set_error(ctx.get(), "Failed to load model or create context");
-        return nullptr;
-    }
-    
-    ctx->vocab = llama_model_get_vocab(ctx->model);
-    
-    ctx->smpl = common_sampler_init(ctx->model, common_params.sampling);
-    if (!ctx->smpl) {
-        set_error(ctx.get(), "Failed to initialize sampler");
-        return nullptr;
-    }
-    
-    ctx->batch = llama_batch_init(2048, 0, 1);
-    if (!ctx->batch.token) {
-        set_error(ctx.get(), "Failed to initialize batch");
-        return nullptr;
-    }
-    
-    std::string chat_template = "";
-    if (!llama_model_chat_template(ctx->model, nullptr)) {
-        chat_template = "chatml";
-    }
-    
-    ctx->tmpls = common_chat_templates_init(ctx->model, chat_template);
-    if (!ctx->tmpls) {
-        set_error(ctx.get(), "Failed to initialize chat templates");
-        return nullptr;
-    }
-    
-    mtmd_context_params mparams = mtmd_context_params_default();
-    mparams.use_gpu = params->mmproj_use_gpu;
-    mparams.print_timings = false;
-    mparams.n_threads = params->n_threads;
-    mparams.verbosity = GGML_LOG_LEVEL_INFO;
-    mparams.ane_model_path = params->ane_path.empty() ? nullptr : params->ane_path.c_str();
-    
-    ctx->ctx_vision.reset(mtmd_init_from_file(params->mmproj_path.c_str(), ctx->model, mparams));
-    if (!ctx->ctx_vision.get()) {
-        set_error(ctx.get(), "Failed to load vision model from " + std::string(params->mmproj_path));
-        return nullptr;
-    }
-    
-    return ctx.release();
-}
-
-void mtmd_ios_free(mtmd_ios_context* ctx) {
-    if (ctx) {
-        delete ctx;
-    }
-}
-
-int mtmd_ios_prefill_image(mtmd_ios_context* ctx, const std::string& image_path) {
-
-    if (!ctx || image_path.empty()) {
-        return -1;
-    }
-    
-    mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx->ctx_vision.get(), image_path.c_str()));
-    if (!bmp.ptr) {
-        set_error(ctx, "Failed to load image from file: " + image_path);
-        return -1;
-    }
-    ctx->bitmaps.entries.push_back(std::move(bmp));
-    
-    mtmd_input_text text;
-    text.text = mtmd_default_marker();
-    text.add_special = ctx->n_past == 0;
-    text.parse_special = true;
-    
-    mtmd::input_chunks chunks(mtmd_input_chunks_init());
-    auto bitmaps_c_ptr = ctx->bitmaps.c_ptr();
-    int32_t res = mtmd_tokenize(ctx->ctx_vision.get(),
-                        chunks.ptr.get(),
-                        &text,
-                        bitmaps_c_ptr.data(),
-                        bitmaps_c_ptr.size());
-    if (res != 0) {
-        set_error(ctx, "Failed to tokenize image");
-        return -1;
-    }
-    
-    ctx->bitmaps.entries.clear();
-    
-    llama_pos new_n_past;
-    if (mtmd_helper_eval_chunks(ctx->ctx_vision.get(),
-                ctx->lctx,
-                chunks.ptr.get(),
-                ctx->n_past,
-                0,
-                1024,
-                false,
-                &new_n_past)) {
-        set_error(ctx, "Failed to eval image");
-        return -1;
-    }
-    
-    ctx->n_past = new_n_past;
-    
-    return 0;
-}
-
-
-
-int mtmd_ios_prefill_text(mtmd_ios_context* ctx, const std::string& text, const std::string& role) {
-    if (!ctx || text.empty() || role.empty()) {
-        return -1;
-    }
-    
-    common_chat_msg msg;
-    msg.role = role.c_str();
-    msg.content = text.c_str();
-    
-    common_chat_templates_inputs tmpl_inputs;
-    tmpl_inputs.messages = {msg};
-    tmpl_inputs.add_generation_prompt = false;
-    tmpl_inputs.use_jinja = false;
-    auto formatted_chat = common_chat_templates_apply(ctx->tmpls.get(), tmpl_inputs);
-    
-    mtmd_input_text input_text;
-    input_text.text = formatted_chat.prompt.c_str();
-    input_text.add_special = ctx->n_past == 0;
-    input_text.parse_special = true;
-    
-    mtmd::input_chunks chunks(mtmd_input_chunks_init());
-    int32_t res = mtmd_tokenize(ctx->ctx_vision.get(),
-                        chunks.ptr.get(),
-                        &input_text,
-                        nullptr,
-                        0);
-    if (res != 0) {
-        set_error(ctx, "Failed to tokenize text");
-        return -1;
-    }
-    
-    llama_pos new_n_past;
-    if (mtmd_helper_eval_chunks(ctx->ctx_vision.get(),
-                ctx->lctx,
-                chunks.ptr.get(),
-                ctx->n_past,
-                0,
-                1024,
-                true,
-                &new_n_past)) {
-        set_error(ctx, "Failed to eval text");
-        return -1;
-    }
-    
-    ctx->n_past = new_n_past;
-    return 0;
-}
-
-
-
-mtmd_ios_token mtmd_ios_loop(mtmd_ios_context* ctx) {
-    mtmd_ios_token result = {nullptr, true};
-    
-    if (!ctx) {
-        return result;
-    }
-    
-    llama_token token_id = common_sampler_sample(ctx->smpl, ctx->lctx, -1);
-    common_sampler_accept(ctx->smpl, token_id, true);
-    
-    if (llama_vocab_is_eog(ctx->vocab, token_id)) {
-        result.is_end = true;
-        return result;
-    }
-    
-    std::string token_str = common_token_to_piece(ctx->lctx, token_id);
-    
-    common_batch_clear(ctx->batch);
-    common_batch_add(ctx->batch, token_id, ctx->n_past, {0}, true);
-    
-    if (ctx->batch.n_tokens > 0) {
-        ctx->batch.logits[ctx->batch.n_tokens - 1] = true;
-    }
-    
-    ctx->n_past++;
-    if (llama_decode(ctx->lctx, ctx->batch)) {
-        set_error(ctx, "failed to decode token");
-        result.is_end = true;
-        return result;
-    }
-    
-    result.token = (char*)malloc(token_str.length() + 1);
-    if (result.token) {
-        strcpy(result.token, token_str.c_str());
-    }
-    result.is_end = false;
-    
-    return result;
-}
-
-const char* mtmd_ios_get_last_error(mtmd_ios_context* ctx) {
-    return ctx ? ctx->last_error.c_str() : nullptr;
-}
-
-bool mtmd_ios_clean_kv_cache(mtmd_ios_context* ctx) {
-    if (!ctx) {
-        return false;
-    }
- 
-    // 清理 kv-cache 并重置序列位置
-    ctx->n_past = 0;
-    llama_kv_self_seq_rm(ctx->lctx, 0, 0, -1);
-
-    // 清理batch状态
-    common_batch_clear(ctx->batch);
-    
-    return true;
-}
\ No newline at end of file
diff --git a/tools/mtmd/mtmd-ios.h b/tools/mtmd/mtmd-ios.h
deleted file mode 100644
index e8ed92f3f59bf..0000000000000
--- a/tools/mtmd/mtmd-ios.h
+++ /dev/null
@@ -1,88 +0,0 @@
-#ifndef MTMD_IOS_H
-#define MTMD_IOS_H
-
-#include <string>
-
-#include "ggml.h"
-#include "llama.h"
-#include "mtmd-helper.h"
-#include "mtmd.h"
-
-// Context structure
-typedef struct mtmd_ios_context mtmd_ios_context;
-
-// Parameters structure (C++ only)
-typedef struct mtmd_ios_params {
-    std::string model_path;
-    std::string mmproj_path;
-    std::string ane_path;
-    int         n_predict;
-    int         n_ctx;
-    int         n_threads;
-    float       temperature;
-    bool        use_gpu;
-    bool        mmproj_use_gpu;
-    bool        warmup;
-} mtmd_ios_params;
-
-// Loop return value structure (C++ only)
-typedef struct {
-    char * token;
-    bool   is_end;
-} mtmd_ios_token;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// Initialize, returns 0 on success, -1 on failure
-// Parameters:
-// params: parameters
-mtmd_ios_context * mtmd_ios_init(const mtmd_ios_params * params);
-
-// Free resources
-// Parameters:
-// ctx: context
-void mtmd_ios_free(mtmd_ios_context * ctx);
-
-// Get default parameters
-mtmd_ios_params mtmd_ios_params_default(void);
-
-// Prefill image, returns 0 on success, -1 on failure
-// Parameters:
-// ctx: context
-// image_path: image path
-int mtmd_ios_prefill_image(mtmd_ios_context * ctx, const std::string & image_path);
-
-// Prefill text, returns 0 on success, -1 on failure
-// Parameters:
-// ctx: context
-// text: text
-// role: role
-int mtmd_ios_prefill_text(mtmd_ios_context * ctx, const std::string & text, const std::string & role);
-
-// Loop, returns 0 on success, -1 on failure
-// Parameters:
-// ctx: context
-mtmd_ios_token mtmd_ios_loop(mtmd_ios_context * ctx);
-
-// Get last error message
-// Parameters:
-// ctx: context
-const char * mtmd_ios_get_last_error(mtmd_ios_context * ctx);
-
-// Free string
-// Parameters:
-// str: string
-void mtmd_ios_string_free(char * str);
-
-// Clean kv-cache
-// Parameters:
-// ctx: context
-bool mtmd_ios_clean_kv_cache(mtmd_ios_context * ctx);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif

From 701aff9a819273c548b71b1430b9da16ec4010a4 Mon Sep 17 00:00:00 2001
From: tc-mb <caitianchi@modelbest.cn>
Date: Tue, 12 Aug 2025 16:30:39 +0800
Subject: [PATCH 14/15] add file existence check

---
 tools/mtmd/ane/ane.mm | 17 +++++++++++++++++
 tools/mtmd/mtmd.cpp   |  9 +++++++++
 2 files changed, 26 insertions(+)

diff --git a/tools/mtmd/ane/ane.mm b/tools/mtmd/ane/ane.mm
index 1c86f36df9dfd..6b3c538de1d49 100644
--- a/tools/mtmd/ane/ane.mm
+++ b/tools/mtmd/ane/ane.mm
@@ -15,6 +15,22 @@
     }
     
     NSString *pathString = [NSString stringWithUTF8String:model_path];
+
+    // Look up the path once: this call reports both existence and whether
+    // the path is a directory.
+    BOOL isDirectory = NO;
+    if (![[NSFileManager defaultManager] fileExistsAtPath:pathString isDirectory:&isDirectory]) {
+        NSLog(@"Error: ANE model file does not exist at path: %@", pathString);
+        return nullptr;
+    }
+
+    // The model is expected to be a .mlmodelc directory: warn (but still try
+    // to load) when the path is not a directory or has a different extension.
+    BOOL hasModelExtension = [[pathString pathExtension] isEqualToString:@"mlmodelc"];
+    if (!isDirectory || !hasModelExtension) {
+        NSLog(@"Warning: ANE model path should typically be a .mlmodelc directory: %@", pathString);
+    }
+
     NSURL *modelURL = [NSURL fileURLWithPath:pathString];
     
     NSLog(@"Loading ANE model from: %@", modelURL.absoluteString);
@@ -32,6 +48,7 @@
         return nullptr;
     }
     
+    NSLog(@"Successfully loaded ANE model from: %@", pathString);
     return model;
 }
 
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index c800df7457e04..cc90ac560bdb9 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -10,6 +10,7 @@
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
+#include <fstream>
 #include <limits>
 #include <vector>
 
@@ -159,7 +160,15 @@ struct mtmd_context {
         
         // Set ANE model path for iOS
         if (ctx_params.ane_model_path && ctx_v) {
+            // Fail early if the ANE model path cannot be opened. NOTE(review): ane.mm expects a .mlmodelc directory; confirm std::ifstream reports good() for a directory on every target platform.
+            std::ifstream ane_file(ctx_params.ane_model_path);
+            if (!ane_file.good()) {
+                throw std::runtime_error(string_format("ANE model file does not exist: %s", ctx_params.ane_model_path));
+            }
+            ane_file.close();
+            
             clip_set_ane_model_path(ctx_v, ctx_params.ane_model_path);
+            LOG_INF("ANE model path set to: %s\n", ctx_params.ane_model_path);
         }
         if (!ctx_v && !ctx_a) {
             throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));

From 9eee52c801af71990e48ece2d9d390016203b6ea Mon Sep 17 00:00:00 2001
From: tc-mb <caitianchi@modelbest.cn>
Date: Wed, 13 Aug 2025 16:19:45 +0800
Subject: [PATCH 15/15] fix for commit step1

---
 tools/mtmd/CMakeLists.txt |  2 ++
 tools/mtmd/clip.cpp       | 12 ++++++------
 tools/mtmd/clip.h         |  3 ---
 3 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt
index 7a454ad1ad4df..4e46bfd3f55ba 100644
--- a/tools/mtmd/CMakeLists.txt
+++ b/tools/mtmd/CMakeLists.txt
@@ -24,6 +24,8 @@ if(ENABLE_ANE)
         ane/ane_minicpmv4_vit_f16.h
         ane/ane_minicpmv4_vit_f16.m
     )
+    # Define compile-time macro for code guards
+    target_compile_definitions(mtmd PRIVATE ENABLE_ANE)
     
     # Enable ARC for Objective-C files
     set_source_files_properties(ane/ane.mm PROPERTIES COMPILE_FLAGS "-fobjc-arc")
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 38bc98f103ff7..8091f53bdcbfe 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -10,7 +10,7 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 #include "gguf.h"
-#ifdef __APPLE__
+#if defined(ENABLE_ANE)
 #include "ane/ane.h"
 #endif
 
@@ -3840,7 +3840,7 @@ static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, co
     return pos_embed_2d;
 }
 
-#ifdef __APPLE__
+#if defined(ENABLE_ANE)
 static bool clip_image_encode_ane(float * data, float * vec, const char* ane_model_path) {
 
     static int flag = 0;
@@ -3871,7 +3871,7 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3
     *img_copy = *img;
     imgs.entries.push_back(std::move(img_copy));
 
-#ifdef __APPLE__
+#if defined(ENABLE_ANE)
     bool ios_ctx = true;
     if (ios_ctx){
         printf("clip use ane\n");
@@ -3890,8 +3890,8 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3
     return clip_image_batch_encode(ctx, n_threads, &imgs, vec);
 }
 
-#ifdef __APPLE__
-bool ane_embedding(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) {
+#if defined(ENABLE_ANE)
+static bool ane_embedding(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) {
     const clip_image_f32_batch & imgs = *imgs_c_ptr;
     int batch_size = imgs.entries.size();
 
@@ -4063,7 +4063,7 @@ bool ane_embedding(clip_ctx * ctx, const int n_threads, const clip_image_f32_bat
     return true;
 }
 
-bool ane_resampler(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, const float * vit_embedding, float * vec) {
+static bool ane_resampler(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, const float * vit_embedding, float * vec) {
     const clip_image_f32_batch & imgs = *imgs_c_ptr;
     int batch_size = imgs.entries.size();
 
diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h
index 3416839d7e72f..f5524f2e33648 100644
--- a/tools/mtmd/clip.h
+++ b/tools/mtmd/clip.h
@@ -95,9 +95,6 @@ struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
 bool clip_image_encode      (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
 bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
 
-bool ane_embedding(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
-bool ane_resampler(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, const float * vit_embedding, float * vec);
-
 int clip_is_minicpmv(const struct clip_ctx * ctx);
 bool clip_is_glm(const struct clip_ctx * ctx);
 bool clip_is_qwen2vl(const struct clip_ctx * ctx);