mtmd: make the ANE model path configurable (--ane / ane_model_path) and gate the ANE build behind ENABLE_ANE

tc-mb · tc-mb · commit 54258e9cf66b · 2025-08-12T16:02:25.000+08:00
diff --git a/common/arg.cpp b/common/arg.cpp
@@ -960,6 +960,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         for (auto & ex : mmproj_examples) {
             if (ctx_arg.ex == ex) {
                 common_params_handle_model(params.mmproj,    params.hf_token, "", params.offline);
+                common_params_handle_model(params.ane,       params.hf_token, "", params.offline);
                 break;
             }
         }
@@ -2243,6 +2244,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.mmproj_use_gpu = false;
         }
     ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD"));
+    add_opt(common_arg(
+        {"--ane"}, "FILE",
+        "path to Apple Neural Engine model file for iOS",
+        [](common_params & params, const std::string & value) {
+            params.ane.path = value;
+        }
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_ANE"));
     add_opt(common_arg(
         {"--image", "--audio"}, "FILE",
         "path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n",
diff --git a/common/common.h b/common/common.h
@@ -353,6 +353,9 @@ struct common_params {
     bool mmproj_use_gpu = true;     // use GPU for multimodal model
     bool no_mmproj = false;         // explicitly disable multimodal model
     std::vector<std::string> image; // path to image file(s)
+    
+    // Apple Neural Engine (ANE) model file, set via --ane or LLAMA_ARG_ANE
+    struct common_params_model ane;
 
     // embedding
     bool embedding         = false; // get only sentence embedding
diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt
@@ -2,6 +2,9 @@
 
 find_package(Threads REQUIRED)
 
+# ANE support option
+option(ENABLE_ANE "Enable Apple Neural Engine support" OFF)
+
 add_library(mtmd
             mtmd.cpp
             mtmd-audio.cpp
@@ -15,8 +18,8 @@ add_library(mtmd
             mtmd-ios.h
             )
 
-# Add ANE related files on Apple platforms
-if(APPLE)
+# Add ANE related files when enabled
+if(ENABLE_ANE)
     target_sources(mtmd PRIVATE
         ane/ane.h
         ane/ane.mm
@@ -40,8 +43,8 @@ target_include_directories(mtmd PRIVATE ../../src)
 target_include_directories(mtmd PRIVATE ../../vendor)
 target_compile_features   (mtmd PRIVATE cxx_std_17)
 
-# Link CoreML and Accelerate frameworks on Apple platforms
-if(APPLE)
+# Link CoreML and Accelerate frameworks when ANE is enabled
+if(ENABLE_ANE)
     target_link_libraries(mtmd PRIVATE 
         "-framework Foundation" 
         "-framework CoreML" 
@@ -62,8 +65,8 @@ set(MTMD_PUBLIC_HEADERS
     ${CMAKE_CURRENT_SOURCE_DIR}/mtmd-ios.h
     )
 
-# Add ANE public headers on Apple platforms
-if(APPLE)
+# Add ANE public headers when enabled
+if(ENABLE_ANE)
     list(APPEND MTMD_PUBLIC_HEADERS
         ${CMAKE_CURRENT_SOURCE_DIR}/ane/ane.h
     )
@@ -117,8 +120,8 @@ if (TARGET common)
     add_dependencies(${TARGET_IOS} common)
 endif()
 
-# Add additional linking settings for iOS testing on Apple platforms
-if(APPLE)
+# Add additional linking settings for iOS testing when ANE is enabled
+if(ENABLE_ANE)
     target_link_libraries(${TARGET_IOS} PRIVATE 
         "-framework Foundation" 
         "-framework CoreML" 
diff --git a/tools/mtmd/ane/ane.h b/tools/mtmd/ane/ane.h
@@ -2,7 +2,7 @@
 extern "C" {
 #endif
 
-const void* loadModel();
+const void* loadModel(const char* model_path);
 void closeModel(const void* model);
 void predictWith(const void* model, float* embed, float* encoderOutput);
 
diff --git a/tools/mtmd/ane/ane.mm b/tools/mtmd/ane/ane.mm
@@ -8,18 +8,30 @@
 extern "C" {
 #endif
 
-const void* loadModel() {
-    // 新的，从 documents directionary 中加载 begin
-    // 获取文件管理器实例
-    NSFileManager *fileManager = [NSFileManager defaultManager];
-    // 获取应用的 Documents 目录的 URL
-    NSURL *documentsURL = [[fileManager URLsForDirectory:NSDocumentDirectory inDomains:NSUserDomainMask] firstObject];
-    NSString *pathString = [documentsURL.absoluteString stringByAppendingString:@"ane_minicpmv4_vit_f16.mlmodelc"];
-    NSURL *modelURL = [NSURL URLWithString:pathString];
-
-    NSLog(modelURL.absoluteString);
-
-    const void* model = CFBridgingRetain([[ane_minicpmv4_vit_f16 alloc] initWithContentsOfURL:modelURL error:nil]);
+const void* loadModel(const char* model_path) {
+    if (!model_path) {
+        NSLog(@"Error: model_path is null");
+        return nullptr;
+    }
+    
+    NSString *pathString = [NSString stringWithUTF8String:model_path];
+    NSURL *modelURL = [NSURL fileURLWithPath:pathString];
+    
+    NSLog(@"Loading ANE model from: %@", modelURL.absoluteString);
+    
+    NSError *error = nil;
+    const void* model = CFBridgingRetain([[ane_minicpmv4_vit_f16 alloc] initWithContentsOfURL:modelURL error:&error]);
+    
+    if (error) {
+        NSLog(@"Error loading ANE model: %@", error.localizedDescription);
+        return nullptr;
+    }
+    
+    if (!model) {
+        NSLog(@"Error: Failed to create ANE model instance");
+        return nullptr;
+    }
+    
     return model;
 }
 
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
@@ -380,6 +380,9 @@ struct clip_ctx {
     // for debugging
     bool debug_graph = false;
     std::vector<ggml_tensor *> debug_print_tensors;
+    
+    // ANE model path for iOS
+    std::string ane_model_path;
 
     clip_ctx(clip_context_params & ctx_params) {
         debug_graph = std::getenv("MTMD_DEBUG_GRAPH") != nullptr;
@@ -3803,15 +3806,27 @@ static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, co
 }
 
 #ifdef __APPLE__
-static bool clip_image_encode_ane(float * data, float * vec) {
+static bool clip_image_encode_ane(float * data, float * vec, const char* ane_model_path) {
 
     static int flag = 0;
     static const void* coremlEncoder = NULL;
-    if (flag == 0) {
-        coremlEncoder = loadModel();
+    static std::string cached_model_path = "";
+    
+    // Check if we need to load a new model
+    if (flag == 0 || (ane_model_path && cached_model_path != ane_model_path)) {
+        if (coremlEncoder) {
+            closeModel(coremlEncoder);
+        }
+        coremlEncoder = loadModel(ane_model_path);
+        if (!coremlEncoder) {
+            printf("Failed to load ANE model from: %s\n", ane_model_path ? ane_model_path : "null");
+            return false;
+        }
+        cached_model_path = ane_model_path ? ane_model_path : "";
         flag = 1;
     }
     predictWith(coremlEncoder, data, vec);
+    return true;
 }
 #endif
 
@@ -3829,7 +3844,7 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3
         float * vit_embedding2 = (float *)malloc(1100*1152*sizeof(float));
 
         ane_embedding(ctx, n_threads, &imgs, vit_embedding1);
-        clip_image_encode_ane(vit_embedding1, vit_embedding2);
+        clip_image_encode_ane(vit_embedding1, vit_embedding2, ctx->ane_model_path.c_str());
         ane_resampler(ctx, n_threads, &imgs, vit_embedding2, vec);
         free(vit_embedding1);
         free(vit_embedding2);
@@ -4634,3 +4649,9 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel
     batch->entries.push_back(clip_image_f32_ptr(audio));
     batch->is_audio = true;
 }
+
+void clip_set_ane_model_path(struct clip_ctx * ctx, const char * ane_model_path) {
+    if (ctx && ane_model_path) {
+        ctx->ane_model_path = ane_model_path;
+    }
+}
diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h
@@ -112,3 +112,6 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel
 bool clip_has_vision_encoder(const struct clip_ctx * ctx);
 bool clip_has_audio_encoder(const struct clip_ctx * ctx);
 bool clip_has_whisper_encoder(const struct clip_ctx * ctx);
+
+// ANE support functions
+void clip_set_ane_model_path(struct clip_ctx * ctx, const char * ane_model_path);
diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp
@@ -132,6 +132,7 @@ struct mtmd_cli_context {
         mparams.print_timings = true;
         mparams.n_threads = params.cpuparams.n_threads;
         mparams.verbosity = params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO;
+        mparams.ane_model_path = params.ane.path.empty() ? nullptr : params.ane.path.c_str();
         ctx_vision.reset(mtmd_init_from_file(clip_path, model, mparams));
         if (!ctx_vision.get()) {
             LOG_ERR("Failed to load vision model from %s\n", clip_path);
diff --git a/tools/mtmd/mtmd-ios.cpp b/tools/mtmd/mtmd-ios.cpp
@@ -61,11 +61,14 @@ mtmd_ios_params mtmd_ios_params_default(void) {
     mtmd_ios_params params = {};
     params.model_path = "";
     params.mmproj_path = "";
+    params.ane_path = "";
     params.n_predict = -1;
     params.n_ctx = 4096;
     params.n_threads = 4;
     params.temperature = 0.2f;
     params.use_gpu = true;
+    params.mmproj_use_gpu = true;
+    params.warmup = true;
     return params;
 }
 
@@ -86,6 +89,7 @@ mtmd_ios_context* mtmd_ios_init(const mtmd_ios_params* params) {
     common_params common_params;
     common_params.model.path = params->model_path;
     common_params.mmproj.path = params->mmproj_path;
+    common_params.ane.path = params->ane_path;
     common_params.n_ctx = params->n_ctx;
     common_params.n_batch = 2048;  // 增加batch大小，与标准mtmd保持一致
     common_params.cpuparams.n_threads = params->n_threads;
@@ -132,6 +136,7 @@ mtmd_ios_context* mtmd_ios_init(const mtmd_ios_params* params) {
     mparams.print_timings = false;
     mparams.n_threads = params->n_threads;
     mparams.verbosity = GGML_LOG_LEVEL_INFO;
+    mparams.ane_model_path = params->ane_path.empty() ? nullptr : params->ane_path.c_str();
     
     ctx->ctx_vision.reset(mtmd_init_from_file(params->mmproj_path.c_str(), ctx->model, mparams));
     if (!ctx->ctx_vision.get()) {
diff --git a/tools/mtmd/mtmd-ios.h b/tools/mtmd/mtmd-ios.h
@@ -15,6 +15,7 @@ typedef struct mtmd_ios_context mtmd_ios_context;
 typedef struct mtmd_ios_params {
     std::string model_path;
     std::string mmproj_path;
+    std::string ane_path;
     int         n_predict;
     int         n_ctx;
     int         n_threads;
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
@@ -91,6 +91,7 @@ mtmd_context_params mtmd_context_params_default() {
     params.verbosity = GGML_LOG_LEVEL_INFO;
     params.image_marker = MTMD_DEFAULT_IMAGE_MARKER;
     params.media_marker = mtmd_default_marker();
+    params.ane_model_path = nullptr;
     return params;
 }
 
@@ -155,6 +156,11 @@ struct mtmd_context {
         auto res = clip_init(mmproj_fname, ctx_clip_params);
         ctx_v = res.ctx_v;
         ctx_a = res.ctx_a;
+        
+        // Set ANE model path for iOS
+        if (ctx_params.ane_model_path && ctx_v) {
+            clip_set_ane_model_path(ctx_v, ctx_params.ane_model_path);
+        }
         if (!ctx_v && !ctx_a) {
             throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));
         }
diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h
@@ -82,6 +82,7 @@ struct mtmd_context_params {
     enum ggml_log_level verbosity;
     const char * image_marker; // deprecated, use media_marker instead
     const char * media_marker;
+    const char * ane_model_path; // path to ANE model for iOS
 };
 
 MTMD_API const char * mtmd_default_marker(void);