diff --git a/build-xcframework.sh b/build-xcframework.sh
index f813984db9dbd..8a8f2af41df56 100755
--- a/build-xcframework.sh
+++ b/build-xcframework.sh
@@ -8,7 +8,7 @@ TVOS_MIN_OS_VERSION=16.4
 BUILD_SHARED_LIBS=OFF
 LLAMA_BUILD_EXAMPLES=OFF
-LLAMA_BUILD_TOOLS=OFF
+LLAMA_BUILD_TOOLS=ON
 LLAMA_BUILD_TESTS=OFF
 LLAMA_BUILD_SERVER=OFF
 GGML_METAL=ON
@@ -124,6 +124,10 @@ setup_framework_structure() {
     cp ggml/include/ggml-cpu.h ${header_path}
     cp ggml/include/ggml-blas.h ${header_path}
     cp ggml/include/gguf.h ${header_path}
+    # Copy mtmd-ios headers and dependencies
+    cp tools/mtmd/mtmd-ios.h ${header_path}
+    cp tools/mtmd/mtmd.h ${header_path}
+    cp tools/mtmd/mtmd-helper.h ${header_path}

     # Create module map (common for all platforms)
     cat > ${module_path}module.modulemap << EOF
@@ -136,6 +140,9 @@ framework module llama {
     header "ggml-cpu.h"
     header "ggml-blas.h"
     header "gguf.h"
+    header "mtmd-ios.h"
+    header "mtmd.h"
+    header "mtmd-helper.h"

     link "c++"
     link framework "Accelerate"
@@ -252,6 +259,8 @@ combine_static_libraries() {
         "${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml-cpu.a"
         "${base_dir}/${build_dir}/ggml/src/ggml-metal/${release_dir}/libggml-metal.a"
         "${base_dir}/${build_dir}/ggml/src/ggml-blas/${release_dir}/libggml-blas.a"
+        "${base_dir}/${build_dir}/common/${release_dir}/libcommon.a"
+        "${base_dir}/${build_dir}/tools/mtmd/${release_dir}/libmtmd.a"
     )

     # Create temporary directory for processing
@@ -327,7 +336,7 @@ combine_static_libraries() {
         $arch_flags \
         $min_version_flag \
         -Wl,-force_load,"${temp_dir}/combined.a" \
-        -framework Foundation -framework Metal -framework Accelerate \
+        -framework Foundation -framework Metal -framework Accelerate -framework CoreML \
         -install_name "$install_name" \
         -o "${base_dir}/${output_lib}"
diff --git a/common/arg.cpp b/common/arg.cpp
index 3d18aaa171ce4..9725df5021f9b 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -961,6 +961,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
     for (auto & ex : mmproj_examples) {
         if (ctx_arg.ex == ex) {
             common_params_handle_model(params.mmproj, params.hf_token, "", params.offline);
+            common_params_handle_model(params.ane, params.hf_token, "", params.offline);
             break;
         }
     }
@@ -2263,6 +2264,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.mmproj_use_gpu = false;
         }
     ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD"));
+    add_opt(common_arg(
+        {"--ane"}, "FILE",
+        "path to Apple Neural Engine model file for iOS",
+        [](common_params & params, const std::string & value) {
+            params.ane.path = value;
+        }
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_ANE"));
    add_opt(common_arg(
        {"--image", "--audio"}, "FILE",
        "path to an image or audio file.
use with multimodal models, can be repeated if you have multiple files\n", diff --git a/common/common.h b/common/common.h index 5eab199af559e..1167c097d2a6e 100644 --- a/common/common.h +++ b/common/common.h @@ -375,6 +375,9 @@ struct common_params { bool mmproj_use_gpu = true; // use GPU for multimodal model bool no_mmproj = false; // explicitly disable multimodal model std::vector image; // path to image file(s) + + // Apple Neural Engine support + struct common_params_model ane; // embedding bool embedding = false; // get only sentence embedding diff --git a/tools/batched-bench/CMakeLists.txt b/tools/batched-bench/CMakeLists.txt index 68ad707f32c98..b8e652c979f13 100644 --- a/tools/batched-bench/CMakeLists.txt +++ b/tools/batched-bench/CMakeLists.txt @@ -1,5 +1,5 @@ set(TARGET llama-batched-bench) add_executable(${TARGET} batched-bench.cpp) -install(TARGETS ${TARGET} RUNTIME) +install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/tools/cvector-generator/CMakeLists.txt b/tools/cvector-generator/CMakeLists.txt index 49ad9561c82ea..e70a76523d8c4 100644 --- a/tools/cvector-generator/CMakeLists.txt +++ b/tools/cvector-generator/CMakeLists.txt @@ -1,5 +1,5 @@ set(TARGET llama-cvector-generator) add_executable(${TARGET} cvector-generator.cpp pca.hpp) -install(TARGETS ${TARGET} RUNTIME) +install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/tools/export-lora/CMakeLists.txt b/tools/export-lora/CMakeLists.txt index 310455787a7ef..69330896fd940 100644 --- a/tools/export-lora/CMakeLists.txt +++ b/tools/export-lora/CMakeLists.txt @@ -1,5 +1,5 @@ set(TARGET llama-export-lora) add_executable(${TARGET} export-lora.cpp) -install(TARGETS ${TARGET} RUNTIME) +install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/tools/gguf-split/CMakeLists.txt b/tools/gguf-split/CMakeLists.txt index c407e2f0af44a..bef3fb86f00fd 100644 --- a/tools/gguf-split/CMakeLists.txt +++ b/tools/gguf-split/CMakeLists.txt @@ -1,5 +1,5 @@ set(TARGET llama-gguf-split) add_executable(${TARGET} gguf-split.cpp) -install(TARGETS ${TARGET} RUNTIME) +install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/tools/imatrix/CMakeLists.txt b/tools/imatrix/CMakeLists.txt index 412696c47c31c..73d7696dce6bb 100644 --- a/tools/imatrix/CMakeLists.txt +++ b/tools/imatrix/CMakeLists.txt @@ -1,5 +1,5 @@ set(TARGET llama-imatrix) add_executable(${TARGET} imatrix.cpp) -install(TARGETS ${TARGET} RUNTIME) +install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/tools/llama-bench/CMakeLists.txt b/tools/llama-bench/CMakeLists.txt index 17e3b9b87bae4..de81c0bc5d460 100644 --- a/tools/llama-bench/CMakeLists.txt +++ b/tools/llama-bench/CMakeLists.txt @@ -1,5 +1,5 @@ set(TARGET llama-bench) add_executable(${TARGET} llama-bench.cpp) -install(TARGETS ${TARGET} RUNTIME) +install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .) 
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/tools/main/CMakeLists.txt b/tools/main/CMakeLists.txt index af3d9150f8640..f380fcae3c2c8 100644 --- a/tools/main/CMakeLists.txt +++ b/tools/main/CMakeLists.txt @@ -1,5 +1,5 @@ set(TARGET llama-cli) add_executable(${TARGET} main.cpp) -install(TARGETS ${TARGET} RUNTIME) +install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt index 4baa15b9609fc..4e46bfd3f55ba 100644 --- a/tools/mtmd/CMakeLists.txt +++ b/tools/mtmd/CMakeLists.txt @@ -2,6 +2,9 @@ find_package(Threads REQUIRED) +# ANE support option +option(ENABLE_ANE "Enable Apple Neural Engine support" OFF) + add_library(mtmd mtmd.cpp mtmd-audio.cpp @@ -13,13 +16,43 @@ add_library(mtmd mtmd-helper.h ) -target_link_libraries (mtmd PUBLIC ggml llama) +# Add ANE related files when enabled +if(ENABLE_ANE) + target_sources(mtmd PRIVATE + ane/ane.h + ane/ane.mm + ane/ane_minicpmv4_vit_f16.h + ane/ane_minicpmv4_vit_f16.m + ) + # Define compile-time macro for code guards + target_compile_definitions(mtmd PRIVATE ENABLE_ANE) + + # Enable ARC for Objective-C files + set_source_files_properties(ane/ane.mm PROPERTIES COMPILE_FLAGS "-fobjc-arc") + set_source_files_properties(ane/ane_minicpmv4_vit_f16.m PROPERTIES COMPILE_FLAGS "-fobjc-arc") +endif() + +target_link_libraries (mtmd PUBLIC ggml llama common) target_link_libraries (mtmd PRIVATE Threads::Threads) target_include_directories(mtmd PUBLIC .) target_include_directories(mtmd PRIVATE ../..) +target_include_directories(mtmd PRIVATE ../../common) +target_include_directories(mtmd PRIVATE ../../include) +target_include_directories(mtmd PRIVATE ../../ggml/include) +target_include_directories(mtmd PRIVATE ../../src) target_include_directories(mtmd PRIVATE ../../vendor) target_compile_features (mtmd PRIVATE cxx_std_17) +# Link CoreML and Accelerate frameworks when ANE is enabled +if(ENABLE_ANE) + target_link_libraries(mtmd PRIVATE + "-framework Foundation" + "-framework CoreML" + "-framework Accelerate" + "-ObjC" + ) +endif() + if (BUILD_SHARED_LIBS) set_target_properties (mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON) target_compile_definitions(mtmd PRIVATE LLAMA_BUILD) @@ -31,6 +64,13 @@ set(MTMD_PUBLIC_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/mtmd-helper.h ) +# Add ANE public headers when enabled +if(ENABLE_ANE) + list(APPEND MTMD_PUBLIC_HEADERS + ${CMAKE_CURRENT_SOURCE_DIR}/ane/ane.h + ) +endif() + set_target_properties(mtmd PROPERTIES PUBLIC_HEADER "${MTMD_PUBLIC_HEADERS}") @@ -55,6 +95,6 @@ add_executable(llama-qwen2vl-cli deprecation-warning.cpp) set(TARGET llama-mtmd-cli) add_executable (${TARGET} mtmd-cli.cpp) set_target_properties (${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli) -install (TARGETS ${TARGET} RUNTIME) +install (TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .) 
 target_link_libraries (${TARGET} PRIVATE common mtmd Threads::Threads)
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/tools/mtmd/ane/ane.h b/tools/mtmd/ane/ane.h
new file mode 100644
index 0000000000000..e129122de2f35
--- /dev/null
+++ b/tools/mtmd/ane/ane.h
@@ -0,0 +1,11 @@
+#if __cplusplus
+extern "C" {
+#endif
+
+const void* loadModel(const char* model_path);
+void closeModel(const void* model);
+void predictWith(const void* model, float* embed, float* encoderOutput);
+
+#if __cplusplus
+} // Extern C
+#endif
diff --git a/tools/mtmd/ane/ane.mm b/tools/mtmd/ane/ane.mm
new file mode 100644
index 0000000000000..6b3c538de1d49
--- /dev/null
+++ b/tools/mtmd/ane/ane.mm
@@ -0,0 +1,78 @@
+#import <Foundation/Foundation.h>
+#import <CoreML/CoreML.h>
+#import "ane.h"
+#import "ane_minicpmv4_vit_f16.h"
+#include <Accelerate/Accelerate.h>
+
+#if __cplusplus
+extern "C" {
+#endif
+
+const void* loadModel(const char* model_path) {
+    if (!model_path) {
+        NSLog(@"Error: model_path is null");
+        return nullptr;
+    }
+
+    NSString *pathString = [NSString stringWithUTF8String:model_path];
+
+    // Check if file exists
+    NSFileManager *fileManager = [NSFileManager defaultManager];
+    if (![fileManager fileExistsAtPath:pathString]) {
+        NSLog(@"Error: ANE model file does not exist at path: %@", pathString);
+        return nullptr;
+    }
+
+    // Check if it's a directory (for .mlmodelc packages)
+    BOOL isDirectory;
+    if ([fileManager fileExistsAtPath:pathString isDirectory:&isDirectory]) {
+        if (!isDirectory && ![pathString hasSuffix:@".mlmodelc"]) {
+            NSLog(@"Warning: ANE model path should typically be a .mlmodelc directory: %@", pathString);
+        }
+    }
+
+    NSURL *modelURL = [NSURL fileURLWithPath:pathString];
+
+    NSLog(@"Loading ANE model from: %@", modelURL.absoluteString);
+
+    NSError *error = nil;
+    const void* model = CFBridgingRetain([[ane_minicpmv4_vit_f16 alloc] initWithContentsOfURL:modelURL error:&error]);
+
+    if (error) {
+        NSLog(@"Error loading ANE model: %@", error.localizedDescription);
+        return nullptr;
+    }
+
+    if (!model) {
+        NSLog(@"Error: Failed to create ANE model instance");
+        return nullptr;
+    }
+
+    NSLog(@"Successfully loaded ANE model from: %@", pathString);
+    return model;
+}
+
+void predictWith(const void* model, float* embed, float* encoderOutput) {
+    MLMultiArray *inMultiArray = [[MLMultiArray alloc] initWithDataPointer: embed
+                                                                     shape: @[@1, @1024, @1152]
+                                                                  dataType: MLMultiArrayDataTypeFloat32
+                                                                   strides: @[@(1179648), @(1152), @1]
+                                                               deallocator: nil
+                                                                     error: nil];
+
+    ane_minicpmv4_vit_f16Output *modelOutput = [(__bridge id)model predictionFromInput:inMultiArray error:nil];
+
+    MLMultiArray *outMA = modelOutput.output;
+
+    cblas_scopy((int)outMA.count,
+                (float*)outMA.dataPointer, 1,
+                encoderOutput, 1);
+}
+
+void closeModel(const void* model) {
+    CFRelease(model);
+}
+
+#if __cplusplus
+} //Extern C
+#endif
diff --git a/tools/mtmd/ane/ane_minicpmv4_vit_f16.h b/tools/mtmd/ane/ane_minicpmv4_vit_f16.h
new file mode 100644
index 0000000000000..d6b9a29e857ab
--- /dev/null
+++ b/tools/mtmd/ane/ane_minicpmv4_vit_f16.h
@@ -0,0 +1,154 @@
+//
+// ane_minicpmv4_vit_f16.h
+//
+// This file was automatically generated and should not be edited.
+// + +#import +#import +#include +#include + +NS_ASSUME_NONNULL_BEGIN + +/// Model Prediction Input Type +API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden"))) +@interface ane_minicpmv4_vit_f16Input : NSObject + +/// input as 1 × 1024 × 1152 3-dimensional array of floats +@property (readwrite, nonatomic, strong) MLMultiArray * input; +- (instancetype)init NS_UNAVAILABLE; +- (instancetype)initWithInput:(MLMultiArray *)input NS_DESIGNATED_INITIALIZER; + +@end + +/// Model Prediction Output Type +API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden"))) +@interface ane_minicpmv4_vit_f16Output : NSObject + +/// output as 1 × 1024 × 1152 3-dimensional array of floats +@property (readwrite, nonatomic, strong) MLMultiArray * output; +- (instancetype)init NS_UNAVAILABLE; +- (instancetype)initWithOutput:(MLMultiArray *)output NS_DESIGNATED_INITIALIZER; + +@end + +/// Class for model loading and prediction +API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden"))) +@interface ane_minicpmv4_vit_f16 : NSObject +@property (readonly, nonatomic, nullable) MLModel * model; + +/** + URL of the underlying .mlmodelc directory. +*/ ++ (nullable NSURL *)URLOfModelInThisBundle; + +/** + Initialize ane_minicpmv4_vit_f16 instance from an existing MLModel object. + + Usually the application does not use this initializer unless it makes a subclass of ane_minicpmv4_vit_f16. + Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in. +*/ +- (instancetype)initWithMLModel:(MLModel *)model NS_DESIGNATED_INITIALIZER; + +/** + Initialize ane_minicpmv4_vit_f16 instance with the model in this bundle. +*/ +- (nullable instancetype)init; + +/** + Initialize ane_minicpmv4_vit_f16 instance with the model in this bundle. + + @param configuration The model configuration object + @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL. +*/ +- (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error; + +/** + Initialize ane_minicpmv4_vit_f16 instance from the model URL. + + @param modelURL URL to the .mlmodelc directory for ane_minicpmv4_vit_f16. + @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL. +*/ +- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error; + +/** + Initialize ane_minicpmv4_vit_f16 instance from the model URL. + + @param modelURL URL to the .mlmodelc directory for ane_minicpmv4_vit_f16. + @param configuration The model configuration object + @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL. +*/ +- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error; + +/** + Construct ane_minicpmv4_vit_f16 instance asynchronously with configuration. + Model loading may take time when the model content is not immediately available (e.g. encrypted model). 
Use this factory method especially when the caller is on the main thread. + + @param configuration The model configuration + @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid ane_minicpmv4_vit_f16 instance or NSError object. +*/ ++ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(ane_minicpmv4_vit_f16 * _Nullable model, NSError * _Nullable error))handler; + +/** + Construct ane_minicpmv4_vit_f16 instance asynchronously with URL of .mlmodelc directory and optional configuration. + + Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread. + + @param modelURL The model URL. + @param configuration The model configuration + @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid ane_minicpmv4_vit_f16 instance or NSError object. +*/ ++ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(ane_minicpmv4_vit_f16 * _Nullable model, NSError * _Nullable error))handler; + +/** + Make a prediction using the standard interface + @param input an instance of ane_minicpmv4_vit_f16Input to predict from + @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL. + @return the prediction as ane_minicpmv4_vit_f16Output +*/ +- (nullable ane_minicpmv4_vit_f16Output *)predictionFromFeatures:(ane_minicpmv4_vit_f16Input *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error; + +/** + Make a prediction using the standard interface + @param input an instance of ane_minicpmv4_vit_f16Input to predict from + @param options prediction options + @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL. + @return the prediction as ane_minicpmv4_vit_f16Output +*/ +- (nullable ane_minicpmv4_vit_f16Output *)predictionFromFeatures:(ane_minicpmv4_vit_f16Input *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error; + +/** + Make an asynchronous prediction using the standard interface + @param input an instance of ane_minicpmv4_vit_f16Input to predict from + @param completionHandler a block that will be called upon completion of the prediction. error will be nil if no error occurred. +*/ +- (void)predictionFromFeatures:(ane_minicpmv4_vit_f16Input *)input completionHandler:(void (^)(ane_minicpmv4_vit_f16Output * _Nullable output, NSError * _Nullable error))completionHandler API_AVAILABLE(macos(14.0), ios(17.0), watchos(10.0), tvos(17.0)) __attribute__((visibility("hidden"))); + +/** + Make an asynchronous prediction using the standard interface + @param input an instance of ane_minicpmv4_vit_f16Input to predict from + @param options prediction options + @param completionHandler a block that will be called upon completion of the prediction. error will be nil if no error occurred. 
+*/ +- (void)predictionFromFeatures:(ane_minicpmv4_vit_f16Input *)input options:(MLPredictionOptions *)options completionHandler:(void (^)(ane_minicpmv4_vit_f16Output * _Nullable output, NSError * _Nullable error))completionHandler API_AVAILABLE(macos(14.0), ios(17.0), watchos(10.0), tvos(17.0)) __attribute__((visibility("hidden"))); + +/** + Make a prediction using the convenience interface + @param input 1 × 1024 × 1152 3-dimensional array of floats + @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL. + @return the prediction as ane_minicpmv4_vit_f16Output +*/ +- (nullable ane_minicpmv4_vit_f16Output *)predictionFromInput:(MLMultiArray *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error; + +/** + Batch prediction + @param inputArray array of ane_minicpmv4_vit_f16Input instances to obtain predictions from + @param options prediction options + @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL. + @return the predictions as NSArray +*/ +- (nullable NSArray *)predictionsFromInputs:(NSArray *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error; +@end + +NS_ASSUME_NONNULL_END diff --git a/tools/mtmd/ane/ane_minicpmv4_vit_f16.m b/tools/mtmd/ane/ane_minicpmv4_vit_f16.m new file mode 100644 index 0000000000000..75337a7b5b150 --- /dev/null +++ b/tools/mtmd/ane/ane_minicpmv4_vit_f16.m @@ -0,0 +1,222 @@ +// +// ane_minicpmv4_vit_f16.m +// +// This file was automatically generated and should not be edited. +// + +#if !__has_feature(objc_arc) +#error This file must be compiled with automatic reference counting enabled (-fobjc-arc) +#endif + +#import "ane_minicpmv4_vit_f16.h" + +@implementation ane_minicpmv4_vit_f16Input + +- (instancetype)initWithInput:(MLMultiArray *)input { + self = [super init]; + if (self) { + _input = input; + } + return self; +} + +- (NSSet *)featureNames { + return [NSSet setWithArray:@[@"input"]]; +} + +- (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName { + if ([featureName isEqualToString:@"input"]) { + return [MLFeatureValue featureValueWithMultiArray:self.input]; + } + return nil; +} + +@end + +@implementation ane_minicpmv4_vit_f16Output + +- (instancetype)initWithOutput:(MLMultiArray *)output { + self = [super init]; + if (self) { + _output = output; + } + return self; +} + +- (NSSet *)featureNames { + return [NSSet setWithArray:@[@"output"]]; +} + +- (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName { + if ([featureName isEqualToString:@"output"]) { + return [MLFeatureValue featureValueWithMultiArray:self.output]; + } + return nil; +} + +@end + +@implementation ane_minicpmv4_vit_f16 + + +/** + URL of the underlying .mlmodelc directory. +*/ ++ (nullable NSURL *)URLOfModelInThisBundle { + NSString *assetPath = [[NSBundle bundleForClass:[self class]] pathForResource:@"ane_minicpmv4_vit_f16" ofType:@"mlmodelc"]; + if (nil == assetPath) { os_log_error(OS_LOG_DEFAULT, "Could not load ane_minicpmv4_vit_f16.mlmodelc in the bundle resource"); return nil; } + return [NSURL fileURLWithPath:assetPath]; +} + + +/** + Initialize ane_minicpmv4_vit_f16 instance from an existing MLModel object. + + Usually the application does not use this initializer unless it makes a subclass of ane_minicpmv4_vit_f16. 
+ Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in. +*/ +- (instancetype)initWithMLModel:(MLModel *)model { + if (model == nil) { + return nil; + } + self = [super init]; + if (self != nil) { + _model = model; + } + return self; +} + + +/** + Initialize ane_minicpmv4_vit_f16 instance with the model in this bundle. +*/ +- (nullable instancetype)init { + return [self initWithContentsOfURL:(NSURL * _Nonnull)self.class.URLOfModelInThisBundle error:nil]; +} + + +/** + Initialize ane_minicpmv4_vit_f16 instance with the model in this bundle. + + @param configuration The model configuration object + @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL. +*/ +- (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error { + return [self initWithContentsOfURL:(NSURL * _Nonnull)self.class.URLOfModelInThisBundle configuration:configuration error:error]; +} + + +/** + Initialize ane_minicpmv4_vit_f16 instance from the model URL. + + @param modelURL URL to the .mlmodelc directory for ane_minicpmv4_vit_f16. + @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL. +*/ +- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error { + MLModel *model = [MLModel modelWithContentsOfURL:modelURL error:error]; + if (model == nil) { return nil; } + return [self initWithMLModel:model]; +} + + +/** + Initialize ane_minicpmv4_vit_f16 instance from the model URL. + + @param modelURL URL to the .mlmodelc directory for ane_minicpmv4_vit_f16. + @param configuration The model configuration object + @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL. +*/ +- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error { + MLModel *model = [MLModel modelWithContentsOfURL:modelURL configuration:configuration error:error]; + if (model == nil) { return nil; } + return [self initWithMLModel:model]; +} + + +/** + Construct ane_minicpmv4_vit_f16 instance asynchronously with configuration. + Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread. + + @param configuration The model configuration + @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid ane_minicpmv4_vit_f16 instance or NSError object. +*/ ++ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(ane_minicpmv4_vit_f16 * _Nullable model, NSError * _Nullable error))handler { + [self loadContentsOfURL:(NSURL * _Nonnull)[self URLOfModelInThisBundle] + configuration:configuration + completionHandler:handler]; +} + + +/** + Construct ane_minicpmv4_vit_f16 instance asynchronously with URL of .mlmodelc directory and optional configuration. + + Model loading may take time when the model content is not immediately available (e.g. encrypted model). 
Use this factory method especially when the caller is on the main thread. + + @param modelURL The model URL. + @param configuration The model configuration + @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid ane_minicpmv4_vit_f16 instance or NSError object. +*/ ++ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(ane_minicpmv4_vit_f16 * _Nullable model, NSError * _Nullable error))handler { + [MLModel loadContentsOfURL:modelURL + configuration:configuration + completionHandler:^(MLModel *model, NSError *error) { + if (model != nil) { + ane_minicpmv4_vit_f16 *typedModel = [[ane_minicpmv4_vit_f16 alloc] initWithMLModel:model]; + handler(typedModel, nil); + } else { + handler(nil, error); + } + }]; +} + +- (nullable ane_minicpmv4_vit_f16Output *)predictionFromFeatures:(ane_minicpmv4_vit_f16Input *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error { + return [self predictionFromFeatures:input options:[[MLPredictionOptions alloc] init] error:error]; +} + +- (nullable ane_minicpmv4_vit_f16Output *)predictionFromFeatures:(ane_minicpmv4_vit_f16Input *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error { + id outFeatures = [self.model predictionFromFeatures:input options:options error:error]; + if (!outFeatures) { return nil; } + return [[ane_minicpmv4_vit_f16Output alloc] initWithOutput:(MLMultiArray *)[outFeatures featureValueForName:@"output"].multiArrayValue]; +} + +- (void)predictionFromFeatures:(ane_minicpmv4_vit_f16Input *)input completionHandler:(void (^)(ane_minicpmv4_vit_f16Output * _Nullable output, NSError * _Nullable error))completionHandler { + [self.model predictionFromFeatures:input completionHandler:^(id prediction, NSError *predictionError) { + if (prediction != nil) { + ane_minicpmv4_vit_f16Output *output = [[ane_minicpmv4_vit_f16Output alloc] initWithOutput:(MLMultiArray *)[prediction featureValueForName:@"output"].multiArrayValue]; + completionHandler(output, predictionError); + } else { + completionHandler(nil, predictionError); + } + }]; +} + +- (void)predictionFromFeatures:(ane_minicpmv4_vit_f16Input *)input options:(MLPredictionOptions *)options completionHandler:(void (^)(ane_minicpmv4_vit_f16Output * _Nullable output, NSError * _Nullable error))completionHandler { + [self.model predictionFromFeatures:input options:options completionHandler:^(id prediction, NSError *predictionError) { + if (prediction != nil) { + ane_minicpmv4_vit_f16Output *output = [[ane_minicpmv4_vit_f16Output alloc] initWithOutput:(MLMultiArray *)[prediction featureValueForName:@"output"].multiArrayValue]; + completionHandler(output, predictionError); + } else { + completionHandler(nil, predictionError); + } + }]; +} + +- (nullable ane_minicpmv4_vit_f16Output *)predictionFromInput:(MLMultiArray *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error { + ane_minicpmv4_vit_f16Input *input_ = [[ane_minicpmv4_vit_f16Input alloc] initWithInput:input]; + return [self predictionFromFeatures:input_ error:error]; +} + +- (nullable NSArray *)predictionsFromInputs:(NSArray *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error { + id inBatch = [[MLArrayBatchProvider alloc] initWithFeatureProviderArray:inputArray]; + id outBatch = [self.model predictionsFromBatch:inBatch options:options error:error]; + if (!outBatch) { return nil; 
} + NSMutableArray *results = [NSMutableArray arrayWithCapacity:(NSUInteger)outBatch.count]; + for (NSInteger i = 0; i < outBatch.count; i++) { + id resultProvider = [outBatch featuresAtIndex:i]; + ane_minicpmv4_vit_f16Output * result = [[ane_minicpmv4_vit_f16Output alloc] initWithOutput:(MLMultiArray *)[resultProvider featureValueForName:@"output"].multiArrayValue]; + [results addObject:result]; + } + return results; +} + +@end diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index fdaf9738e88cb..8091f53bdcbfe 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -10,6 +10,9 @@ #include "ggml-alloc.h" #include "ggml-backend.h" #include "gguf.h" +#if defined(ENABLE_ANE) +#include "ane/ane.h" +#endif #include #include @@ -388,6 +391,9 @@ struct clip_ctx { // for debugging bool debug_graph = false; std::vector debug_print_tensors; + + // ANE model path for iOS + std::string ane_model_path; clip_ctx(clip_context_params & ctx_params) { debug_graph = std::getenv("MTMD_DEBUG_GRAPH") != nullptr; @@ -867,8 +873,118 @@ struct clip_graph { int n_embd = clip_n_mmproj_embd(ctx); const int d_head = 128; int n_head = n_embd/d_head; + // Use actual config value if available, otherwise fall back to hardcoded values int num_query = ctx->model.hparams.minicpmv_query_num; + + ggml_tensor * Q = ggml_add(ctx0, + ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), + model.mm_model_attn_q_b); + ggml_tensor * K = ggml_add(ctx0, + ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), + model.mm_model_attn_k_b); + ggml_tensor * V = ggml_add(ctx0, + ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), + model.mm_model_attn_v_b); + + Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_query); + K = ggml_reshape_3d(ctx0, K, d_head, n_head, n_pos); + V = ggml_reshape_3d(ctx0, V, d_head, n_head, n_pos); + + cb(Q, "resampler_Q", -1); + cb(K, "resampler_K", -1); + cb(V, "resampler_V", -1); + + embeddings = build_attn( + model.mm_model_attn_o_w, + model.mm_model_attn_o_b, + Q, K, V, nullptr, kq_scale, -1); + cb(embeddings, "resampler_attn_out", -1); + } + // layernorm + embeddings = build_norm(embeddings, model.mm_model_ln_post_w, model.mm_model_ln_post_b, NORM_TYPE_NORMAL, eps, -1); + + // projection + embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings); + + // build the graph + ggml_build_forward_expand(gf, embeddings); + + return gf; + } + + ggml_cgraph * build_minicpmv_embedding() { + const int batch_size = 1; + + GGML_ASSERT(model.class_embedding == nullptr); + const int n_pos = n_patches; + + // for selecting learned pos embd, used by ViT + struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos); + ggml_set_name(positions, "positions"); + ggml_set_input(positions); + + ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions); + + ggml_tensor * inp = build_inp(); + if (learned_pos_embd) { + inp = ggml_add(ctx0, inp, learned_pos_embd); + cb(inp, "pos_embed", -1); + } + ggml_tensor * embeddings = inp; + + // pre-layernorm + if (model.pre_ln_w) { + embeddings = ggml_norm(ctx0, embeddings, eps); + ggml_set_name(embeddings, "pre_ln"); + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b); + } + + ggml_build_forward_expand(gf, embeddings); + return gf; + } + + ggml_cgraph * build_minicpmv_resampler() { + const int batch_size = 1; + + GGML_ASSERT(model.class_embedding == nullptr); + const int n_pos = n_patches; + + const int image_size_width = img.nx; + const int image_size_height = img.ny; + const int patch_size = 
hparams.patch_size; + const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); + + // position embeddings for the projector (not for ViT) + int n_output_dim = clip_n_mmproj_embd(ctx); + ggml_tensor * pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_output_dim, n_pos, batch_size); + ggml_set_name(pos_embed, "pos_embed"); + ggml_set_input(pos_embed); + + struct ggml_tensor * embeddings = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1152, num_patches); + ggml_set_name(embeddings, "embeddings"); + ggml_set_input(embeddings); + + // resampler projector (it is just another transformer) + + ggml_tensor * q = model.mm_model_query; + ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings); + + // norm + q = build_norm(q, model.mm_model_ln_q_w, model.mm_model_ln_q_b, NORM_TYPE_NORMAL, eps, -1); + v = build_norm(v, model.mm_model_ln_kv_w, model.mm_model_ln_kv_b, NORM_TYPE_NORMAL, eps, -1); + + // k = v + pos_embed + ggml_tensor * k = ggml_add(ctx0, v, pos_embed); + + // attention + { + int n_embd = clip_n_mmproj_embd(ctx); + const int d_head = 128; + int n_head = n_embd/d_head; + // Use actual config value if available, otherwise fall back to hardcoded values + int num_query = ctx->model.hparams.minicpmv_query_num; + ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b); @@ -3095,9 +3211,10 @@ struct llava_uhd { const int original_width = original_size.width; const int original_height = original_size.height; - const bool has_slices = original_size.width > slice_size || original_size.height > slice_size; + bool has_slices = original_size.width > slice_size || original_size.height > slice_size; const bool has_pinpoints = !ctx->model.hparams.image_res_candidates.empty(); + // has_slices = false; if (!has_slices) { // skip slicing logic res.overview_size = clip_image_size{slice_size, slice_size}; @@ -3723,15 +3840,362 @@ static std::vector> get_2d_sincos_pos_embed(int embed_dim, co return pos_embed_2d; } +#if defined(ENABLE_ANE) +static bool clip_image_encode_ane(float * data, float * vec, const char* ane_model_path) { + + static int flag = 0; + static const void* coremlEncoder = NULL; + static std::string cached_model_path = ""; + + // Check if we need to load a new model + if (flag == 0 || (ane_model_path && cached_model_path != ane_model_path)) { + if (coremlEncoder) { + closeModel(coremlEncoder); + } + coremlEncoder = loadModel(ane_model_path); + if (!coremlEncoder) { + printf("Failed to load ANE model from: %s\n", ane_model_path ? ane_model_path : "null"); + return false; + } + cached_model_path = ane_model_path ? 
ane_model_path : ""; + flag = 1; + } + predictWith(coremlEncoder, data, vec); + return true; +} +#endif + bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) { clip_image_f32_batch imgs; clip_image_f32_ptr img_copy(clip_image_f32_init()); *img_copy = *img; imgs.entries.push_back(std::move(img_copy)); +#if defined(ENABLE_ANE) + bool ios_ctx = true; + if (ios_ctx){ + printf("clip use ane\n"); + float * vit_embedding1 = (float *)malloc(1100*1152*sizeof(float)); + float * vit_embedding2 = (float *)malloc(1100*1152*sizeof(float)); + + ane_embedding(ctx, n_threads, &imgs, vit_embedding1); + clip_image_encode_ane(vit_embedding1, vit_embedding2, ctx->ane_model_path.c_str()); + ane_resampler(ctx, n_threads, &imgs, vit_embedding2, vec); + free(vit_embedding1); + free(vit_embedding2); + return true; + } +#endif + return clip_image_batch_encode(ctx, n_threads, &imgs, vec); } +#if defined(ENABLE_ANE) +static bool ane_embedding(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) { + const clip_image_f32_batch & imgs = *imgs_c_ptr; + int batch_size = imgs.entries.size(); + + // TODO @ngxson : implement batch size > 1 as a loop + // we don't need true batching support because the cgraph will gonna be big anyway + if (batch_size != 1) { + return false; // only support batch size of 1 + } + + // build the inference graph + ctx->debug_print_tensors.clear(); + ggml_backend_sched_reset(ctx->sched.get()); + GGML_ASSERT(imgs.entries.size() == 1 && "n_batch > 1 is not supported"); + clip_graph graph(ctx, *imgs.entries[0]); + ggml_cgraph * gf; + gf = graph.build_minicpmv_embedding(); + ggml_backend_sched_alloc_graph(ctx->sched.get(), gf); + + // set inputs + const auto & model = ctx->model; + const auto & hparams = model.hparams; + + const int image_size_width = imgs.entries[0]->nx; + const int image_size_height = imgs.entries[0]->ny; + + const int patch_size = hparams.patch_size; + const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); + const int n_pos = num_patches + (model.class_embedding ? 
1 : 0); + const int pos_w = image_size_width / patch_size; + const int pos_h = image_size_height / patch_size; + + auto get_inp_tensor = [&gf](const char * name) { + ggml_tensor * inp = ggml_graph_get_tensor(gf, name); + if (inp == nullptr) { + GGML_ABORT("Failed to get tensor %s", name); + } + if (!(inp->flags & GGML_TENSOR_FLAG_INPUT)) { + GGML_ABORT("Tensor %s is not an input tensor", name); + } + return inp; + }; + + auto set_input_f32 = [&get_inp_tensor](const char * name, std::vector & values) { + ggml_tensor * cur = get_inp_tensor(name); + GGML_ASSERT(cur->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size()); + ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur)); + }; + + auto set_input_i32 = [&get_inp_tensor](const char * name, std::vector & values) { + ggml_tensor * cur = get_inp_tensor(name); + GGML_ASSERT(cur->type == GGML_TYPE_I32); + GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size()); + ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur)); + }; + // set input pixel values + if (!imgs.is_audio) { + size_t nelem = 0; + for (const auto & img : imgs.entries) { + nelem += img->nx * img->ny * 3; + } + std::vector inp_raw(nelem); + + // layout of data (note: the channel dim is unrolled to better visualize the layout): + // + // ┌──W──┐ + // │ H │ channel = R + // ├─────┤ │ + // │ H │ channel = G + // ├─────┤ │ + // │ H │ channel = B + // └─────┘ │ + // ──────┘ x B + + for (size_t i = 0; i < imgs.entries.size(); i++) { + const int nx = imgs.entries[i]->nx; + const int ny = imgs.entries[i]->ny; + const int n = nx * ny; + + for (int b = 0; b < batch_size; b++) { + float * batch_entry = inp_raw.data() + b * (3*n); + for (int y = 0; y < ny; y++) { + for (int x = 0; x < nx; x++) { + size_t base_src = 3*(y * nx + x); // idx of the first channel + size_t base_dst = y * nx + x; // idx of the first channel + batch_entry[ base_dst] = imgs.entries[b]->buf[base_src ]; + batch_entry[1*n + base_dst] = imgs.entries[b]->buf[base_src + 1]; + batch_entry[2*n + base_dst] = imgs.entries[b]->buf[base_src + 2]; + } + } + } + } + set_input_f32("inp_raw", inp_raw); + + } else { + // audio input + GGML_ASSERT(imgs.entries.size() == 1); + const auto & mel_inp = imgs.entries[0]; + const int n_step = mel_inp->nx; + const int n_mel = mel_inp->ny; + std::vector inp_raw(n_step * n_mel); + std::memcpy(inp_raw.data(), mel_inp->buf.data(), n_step * n_mel * sizeof(float)); + set_input_f32("inp_raw", inp_raw); + } + + switch (ctx->model.proj_type) { + case PROJECTOR_TYPE_MINICPMV: + { + // inspired from siglip: + // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit + // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316 + std::vector positions(pos_h * pos_w); + int bucket_coords_h[1024]; + int bucket_coords_w[1024]; + for (int i = 0; i < pos_h; i++){ + bucket_coords_h[i] = std::floor(70.0*i/pos_h); + } + for (int i = 0; i < pos_w; i++){ + bucket_coords_w[i] = std::floor(70.0*i/pos_w); + } + for (int i = 0, id = 0; i < pos_h; i++){ + for (int j = 0; j < pos_w; j++){ + positions[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j]; + } + } + set_input_i32("positions", positions); + } break; + default: + GGML_ABORT("Unknown projector type"); + } + + // ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads); + ggml_backend_dev_t dev = ggml_backend_get_device(ctx->backend_cpu); + ggml_backend_reg_t reg = dev ? 
ggml_backend_dev_backend_reg(dev) : nullptr; + if (reg) { + auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"); + if (ggml_backend_set_n_threads_fn) { + ggml_backend_set_n_threads_fn(ctx->backend_cpu, n_threads); + } + } + + auto status = ggml_backend_sched_graph_compute(ctx->sched.get(), gf); + if (status != GGML_STATUS_SUCCESS) { + LOG_ERR("%s: ggml_backend_sched_graph_compute failed with error %d\n", __func__, status); + return false; + } + + // print debug nodes + if (ctx->debug_graph) { + LOG_INF("\n\n---\n\n"); + LOG_INF("\n\nDebug graph:\n\n"); + for (ggml_tensor * t : ctx->debug_print_tensors) { + std::vector data(ggml_nbytes(t)); + ggml_backend_tensor_get(t, data.data(), 0, ggml_nbytes(t)); + print_tensor_shape(t); + print_tensor_data(t, data.data(), 3); + } + } + + // the last node is the embedding tensor + ggml_tensor * embeddings = ggml_graph_node(gf, -1); + + // sanity check (only support batch size of 1 for now) + const int n_tokens_out = embeddings->ne[1]; + + // copy the embeddings to the location passed by the user + ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings)); + + return true; +} + +static bool ane_resampler(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, const float * vit_embedding, float * vec) { + const clip_image_f32_batch & imgs = *imgs_c_ptr; + int batch_size = imgs.entries.size(); + + // TODO @ngxson : implement batch size > 1 as a loop + // we don't need true batching support because the cgraph will gonna be big anyway + if (batch_size != 1) { + return false; // only support batch size of 1 + } + + // build the inference graph + ctx->debug_print_tensors.clear(); + ggml_backend_sched_reset(ctx->sched.get()); + GGML_ASSERT(imgs.entries.size() == 1 && "n_batch > 1 is not supported"); + clip_graph graph(ctx, *imgs.entries[0]); + ggml_cgraph * gf; + gf = graph.build_minicpmv_resampler(); + ggml_backend_sched_alloc_graph(ctx->sched.get(), gf); + + // set inputs + const auto & model = ctx->model; + const auto & hparams = model.hparams; + + const int image_size_width = imgs.entries[0]->nx; + const int image_size_height = imgs.entries[0]->ny; + + const int patch_size = hparams.patch_size; + const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); + const int n_pos = num_patches + (model.class_embedding ? 
1 : 0); + const int pos_w = image_size_width / patch_size; + const int pos_h = image_size_height / patch_size; + + auto get_inp_tensor = [&gf](const char * name) { + ggml_tensor * inp = ggml_graph_get_tensor(gf, name); + if (inp == nullptr) { + GGML_ABORT("Failed to get tensor %s", name); + } + if (!(inp->flags & GGML_TENSOR_FLAG_INPUT)) { + GGML_ABORT("Tensor %s is not an input tensor", name); + } + return inp; + }; + + auto set_input_f32 = [&get_inp_tensor](const char * name, std::vector & values) { + ggml_tensor * cur = get_inp_tensor(name); + GGML_ASSERT(cur->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size()); + ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur)); + }; + + auto set_input_i32 = [&get_inp_tensor](const char * name, std::vector & values) { + ggml_tensor * cur = get_inp_tensor(name); + GGML_ASSERT(cur->type == GGML_TYPE_I32); + GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size()); + ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur)); + }; + + { + struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings"); + ggml_backend_tensor_set(embeddings, vit_embedding, 0, ggml_nbytes(embeddings)); + + } + + switch (ctx->model.proj_type) { + case PROJECTOR_TYPE_MINICPMV: + { + // inspired from resampler of Qwen-VL: + // -> https://huggingface.co/Qwen/Qwen-VL/tree/main + // -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23 + int embed_dim = clip_n_mmproj_embd(ctx); + + // TODO @ngxson : this is very inefficient, can we do this using ggml_sin and ggml_cos? + auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h)); + + std::vector pos_embed(embed_dim * pos_w * pos_h); + for(int i = 0; i < pos_w * pos_h; ++i){ + for(int j = 0; j < embed_dim; ++j){ + pos_embed[i * embed_dim + j] = pos_embed_t[i][j]; + } + } + + set_input_f32("pos_embed", pos_embed); + } break; + default: + GGML_ABORT("Unknown projector type"); + } + + // ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads); + ggml_backend_dev_t dev = ggml_backend_get_device(ctx->backend_cpu); + ggml_backend_reg_t reg = dev ? 
ggml_backend_dev_backend_reg(dev) : nullptr; + if (reg) { + auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"); + if (ggml_backend_set_n_threads_fn) { + ggml_backend_set_n_threads_fn(ctx->backend_cpu, n_threads); + } + } + + auto status = ggml_backend_sched_graph_compute(ctx->sched.get(), gf); + if (status != GGML_STATUS_SUCCESS) { + LOG_ERR("%s: ggml_backend_sched_graph_compute failed with error %d\n", __func__, status); + return false; + } + + // print debug nodes + if (ctx->debug_graph) { + LOG_INF("\n\n---\n\n"); + LOG_INF("\n\nDebug graph:\n\n"); + for (ggml_tensor * t : ctx->debug_print_tensors) { + std::vector data(ggml_nbytes(t)); + ggml_backend_tensor_get(t, data.data(), 0, ggml_nbytes(t)); + print_tensor_shape(t); + print_tensor_data(t, data.data(), 3); + } + } + + // the last node is the embedding tensor + ggml_tensor * embeddings = ggml_graph_node(gf, -1); + + // sanity check (only support batch size of 1 for now) + const int n_tokens_out = embeddings->ne[1]; + const int expected_n_tokens_out = clip_n_output_tokens(ctx, imgs.entries[0].get()); + if (n_tokens_out != expected_n_tokens_out) { + LOG_ERR("%s: expected output %d tokens, got %d\n", __func__, expected_n_tokens_out, n_tokens_out); + GGML_ABORT("Invalid number of output tokens"); + } + + // copy the embeddings to the location passed by the user + ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings)); + + return true; +} +#endif + bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) { const clip_image_f32_batch & imgs = *imgs_c_ptr; int batch_size = imgs.entries.size(); @@ -4209,3 +4673,9 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel batch->entries.push_back(clip_image_f32_ptr(audio)); batch->is_audio = true; } + +void clip_set_ane_model_path(struct clip_ctx * ctx, const char * ane_model_path) { + if (ctx && ane_model_path) { + ctx->ane_model_path = ane_model_path; + } +} diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h index 08f3efb7b1daf..f5524f2e33648 100644 --- a/tools/mtmd/clip.h +++ b/tools/mtmd/clip.h @@ -109,3 +109,6 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel bool clip_has_vision_encoder(const struct clip_ctx * ctx); bool clip_has_audio_encoder(const struct clip_ctx * ctx); bool clip_has_whisper_encoder(const struct clip_ctx * ctx); + +// ANE support functions +void clip_set_ane_model_path(struct clip_ctx * ctx, const char * ane_model_path); diff --git a/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py b/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py index 4dda60a21164b..be982a2d48909 100644 --- a/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py +++ b/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py @@ -824,13 +824,17 @@ def _replace_name(s, v): if re.match("vision_model.embeddings.position_embedding", s): v = v.unsqueeze(0) return {s: v} - - return {s: v} + print(s) + if "emb" in s: + return {s: v} + return None state_dict = model.state_dict() new_state_dict = {} for k, v in state_dict.items(): kvs = _replace_name(k, v) + if kvs is None: + continue for nk, nv in kvs.items(): new_state_dict[nk] = nv state_dict = new_state_dict diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp index 599e682e0f894..b2bc16b450116 100644 --- a/tools/mtmd/mtmd-cli.cpp +++ b/tools/mtmd/mtmd-cli.cpp @@ 
-132,6 +132,7 @@ struct mtmd_cli_context {
         mparams.print_timings = true;
         mparams.n_threads = params.cpuparams.n_threads;
         mparams.verbosity = params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO;
+        mparams.ane_model_path = params.ane.path.empty() ? nullptr : params.ane.path.c_str();
         ctx_vision.reset(mtmd_init_from_file(clip_path, model, mparams));
         if (!ctx_vision.get()) {
             LOG_ERR("Failed to load vision model from %s\n", clip_path);
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index a05373d5b3ca5..9b3f2292bf486 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -10,6 +10,7 @@
 #include
 #include
 #include
+#include <fstream>
 #include
 #include
@@ -91,6 +92,7 @@ mtmd_context_params mtmd_context_params_default() {
     params.verbosity = GGML_LOG_LEVEL_INFO;
     params.image_marker = MTMD_DEFAULT_IMAGE_MARKER;
     params.media_marker = mtmd_default_marker();
+    params.ane_model_path = nullptr;
     return params;
 }
@@ -155,6 +157,19 @@ struct mtmd_context {
         auto res = clip_init(mmproj_fname, ctx_clip_params);
         ctx_v = res.ctx_v;
         ctx_a = res.ctx_a;
+
+        // Set ANE model path for iOS
+        if (ctx_params.ane_model_path && ctx_v) {
+            // Check if ANE model file exists
+            std::ifstream ane_file(ctx_params.ane_model_path);
+            if (!ane_file.good()) {
+                throw std::runtime_error(string_format("ANE model file does not exist: %s", ctx_params.ane_model_path));
+            }
+            ane_file.close();
+
+            clip_set_ane_model_path(ctx_v, ctx_params.ane_model_path);
+            LOG_INF("ANE model path set to: %s\n", ctx_params.ane_model_path);
+        }
         if (!ctx_v && !ctx_a) {
             throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));
         }
diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h
index f4ea07d3ad521..331992e76f43e 100644
--- a/tools/mtmd/mtmd.h
+++ b/tools/mtmd/mtmd.h
@@ -82,6 +82,7 @@ struct mtmd_context_params {
     enum ggml_log_level verbosity;
     const char * image_marker; // deprecated, use media_marker instead
     const char * media_marker;
+    const char * ane_model_path; // path to ANE model for iOS
 };

 MTMD_API const char * mtmd_default_marker(void);
diff --git a/tools/perplexity/CMakeLists.txt b/tools/perplexity/CMakeLists.txt
index 3e68640933afb..ed0825d8eda69 100644
--- a/tools/perplexity/CMakeLists.txt
+++ b/tools/perplexity/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-perplexity)
 add_executable(${TARGET} perplexity.cpp)
-install(TARGETS ${TARGET} RUNTIME)
+install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/tools/quantize/CMakeLists.txt b/tools/quantize/CMakeLists.txt
index 47e5cbe30cfe3..a5575124aef20 100644
--- a/tools/quantize/CMakeLists.txt
+++ b/tools/quantize/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(TARGET llama-quantize)
 add_executable(${TARGET} quantize.cpp)
-install(TARGETS ${TARGET} RUNTIME)
+install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_include_directories(${TARGET} PRIVATE ../../common)
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/tools/run/CMakeLists.txt b/tools/run/CMakeLists.txt
index d0189596980eb..e351f1a1f30be 100644
--- a/tools/run/CMakeLists.txt
+++ b/tools/run/CMakeLists.txt
@@ -10,6 +10,6 @@ if (LLAMA_CURL)
     set(LLAMA_RUN_EXTRA_LIBS ${LLAMA_RUN_EXTRA_LIBS} ${CURL_LIBRARIES})
 endif ()

-install(TARGETS ${TARGET} RUNTIME)
+install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT} ${LLAMA_RUN_EXTRA_LIBS}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/tools/tokenize/CMakeLists.txt b/tools/tokenize/CMakeLists.txt index 1690b53e5d52b..f9dcd270d5f05 100644 --- a/tools/tokenize/CMakeLists.txt +++ b/tools/tokenize/CMakeLists.txt @@ -1,5 +1,5 @@ set(TARGET llama-tokenize) add_executable(${TARGET} tokenize.cpp) -install(TARGETS ${TARGET} RUNTIME) +install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/tools/tts/CMakeLists.txt b/tools/tts/CMakeLists.txt index c72bd814c3b31..da50e9bf848a8 100644 --- a/tools/tts/CMakeLists.txt +++ b/tools/tts/CMakeLists.txt @@ -1,5 +1,5 @@ set(TARGET llama-tts) add_executable(${TARGET} tts.cpp) -install(TARGETS ${TARGET} RUNTIME) +install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .) target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17)
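
Usage sketch (not part of the patch; the .gguf and .mlmodelc file names below are placeholders): with the ENABLE_ANE option and the --ane flag introduced above, building and running the multimodal CLI with the Core ML vision encoder would look roughly like this:

    # configure and build the mtmd tools with Apple Neural Engine support enabled
    cmake -B build -DENABLE_ANE=ON
    cmake --build build --config Release --target llama-mtmd-cli

    # --ane points at the compiled Core ML model (a .mlmodelc directory);
    # model file names here are placeholders for a MiniCPM-V 4 conversion
    ./build/bin/llama-mtmd-cli \
        -m minicpmv4-model.gguf \
        --mmproj minicpmv4-mmproj.gguf \
        --ane ane_minicpmv4_vit_f16.mlmodelc \
        --image test.jpg -p "Describe this image."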