diff --git a/build-xcframework.sh b/build-xcframework.sh
index f813984db9dbd..8a8f2af41df56 100755
--- a/build-xcframework.sh
+++ b/build-xcframework.sh
@@ -8,7 +8,7 @@ TVOS_MIN_OS_VERSION=16.4
 BUILD_SHARED_LIBS=OFF
 LLAMA_BUILD_EXAMPLES=OFF
-LLAMA_BUILD_TOOLS=OFF
+LLAMA_BUILD_TOOLS=ON
 LLAMA_BUILD_TESTS=OFF
 LLAMA_BUILD_SERVER=OFF
 GGML_METAL=ON
@@ -124,6 +124,10 @@ setup_framework_structure() {
     cp ggml/include/ggml-cpu.h ${header_path}
     cp ggml/include/ggml-blas.h ${header_path}
     cp ggml/include/gguf.h ${header_path}
+    # Copy mtmd-ios headers and dependencies
+    cp tools/mtmd/mtmd-ios.h ${header_path}
+    cp tools/mtmd/mtmd.h ${header_path}
+    cp tools/mtmd/mtmd-helper.h ${header_path}

     # Create module map (common for all platforms)
     cat > ${module_path}module.modulemap << EOF
@@ -136,6 +140,9 @@ framework module llama {
     header "ggml-cpu.h"
     header "ggml-blas.h"
     header "gguf.h"
+    header "mtmd-ios.h"
+    header "mtmd.h"
+    header "mtmd-helper.h"

     link "c++"
     link framework "Accelerate"
@@ -252,6 +259,8 @@ combine_static_libraries() {
         "${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml-cpu.a"
         "${base_dir}/${build_dir}/ggml/src/ggml-metal/${release_dir}/libggml-metal.a"
         "${base_dir}/${build_dir}/ggml/src/ggml-blas/${release_dir}/libggml-blas.a"
+        "${base_dir}/${build_dir}/common/${release_dir}/libcommon.a"
+        "${base_dir}/${build_dir}/tools/mtmd/${release_dir}/libmtmd.a"
     )

     # Create temporary directory for processing
@@ -327,7 +336,7 @@ combine_static_libraries() {
         $arch_flags \
         $min_version_flag \
         -Wl,-force_load,"${temp_dir}/combined.a" \
-        -framework Foundation -framework Metal -framework Accelerate \
+        -framework Foundation -framework Metal -framework Accelerate -framework CoreML \
         -install_name "$install_name" \
         -o "${base_dir}/${output_lib}"
diff --git a/common/arg.cpp b/common/arg.cpp
index 3d18aaa171ce4..9725df5021f9b 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -961,6 +961,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
     for (auto & ex : mmproj_examples) {
         if (ctx_arg.ex == ex) {
             common_params_handle_model(params.mmproj, params.hf_token, "", params.offline);
+            common_params_handle_model(params.ane, params.hf_token, "", params.offline);
             break;
         }
     }
@@ -2263,6 +2264,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.mmproj_use_gpu = false;
         }
     ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD"));
+    add_opt(common_arg(
+        {"--ane"}, "FILE",
+        "path to Apple Neural Engine model file for iOS",
+        [](common_params & params, const std::string & value) {
+            params.ane.path = value;
+        }
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_ANE"));
    add_opt(common_arg(
        {"--image", "--audio"}, "FILE",
        "path to an image or audio file.
use with multimodal models, can be repeated if you have multiple files\n", diff --git a/common/common.h b/common/common.h index 5eab199af559e..1167c097d2a6e 100644 --- a/common/common.h +++ b/common/common.h @@ -375,6 +375,9 @@ struct common_params { bool mmproj_use_gpu = true; // use GPU for multimodal model bool no_mmproj = false; // explicitly disable multimodal model std::vector image; // path to image file(s) + + // Apple Neural Engine support + struct common_params_model ane; // embedding bool embedding = false; // get only sentence embedding diff --git a/tools/batched-bench/CMakeLists.txt b/tools/batched-bench/CMakeLists.txt index 68ad707f32c98..b8e652c979f13 100644 --- a/tools/batched-bench/CMakeLists.txt +++ b/tools/batched-bench/CMakeLists.txt @@ -1,5 +1,5 @@ set(TARGET llama-batched-bench) add_executable(${TARGET} batched-bench.cpp) -install(TARGETS ${TARGET} RUNTIME) +install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/tools/cvector-generator/CMakeLists.txt b/tools/cvector-generator/CMakeLists.txt index 49ad9561c82ea..e70a76523d8c4 100644 --- a/tools/cvector-generator/CMakeLists.txt +++ b/tools/cvector-generator/CMakeLists.txt @@ -1,5 +1,5 @@ set(TARGET llama-cvector-generator) add_executable(${TARGET} cvector-generator.cpp pca.hpp) -install(TARGETS ${TARGET} RUNTIME) +install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/tools/export-lora/CMakeLists.txt b/tools/export-lora/CMakeLists.txt index 310455787a7ef..69330896fd940 100644 --- a/tools/export-lora/CMakeLists.txt +++ b/tools/export-lora/CMakeLists.txt @@ -1,5 +1,5 @@ set(TARGET llama-export-lora) add_executable(${TARGET} export-lora.cpp) -install(TARGETS ${TARGET} RUNTIME) +install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/tools/gguf-split/CMakeLists.txt b/tools/gguf-split/CMakeLists.txt index c407e2f0af44a..bef3fb86f00fd 100644 --- a/tools/gguf-split/CMakeLists.txt +++ b/tools/gguf-split/CMakeLists.txt @@ -1,5 +1,5 @@ set(TARGET llama-gguf-split) add_executable(${TARGET} gguf-split.cpp) -install(TARGETS ${TARGET} RUNTIME) +install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/tools/imatrix/CMakeLists.txt b/tools/imatrix/CMakeLists.txt index 412696c47c31c..73d7696dce6bb 100644 --- a/tools/imatrix/CMakeLists.txt +++ b/tools/imatrix/CMakeLists.txt @@ -1,5 +1,5 @@ set(TARGET llama-imatrix) add_executable(${TARGET} imatrix.cpp) -install(TARGETS ${TARGET} RUNTIME) +install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/tools/llama-bench/CMakeLists.txt b/tools/llama-bench/CMakeLists.txt index 17e3b9b87bae4..de81c0bc5d460 100644 --- a/tools/llama-bench/CMakeLists.txt +++ b/tools/llama-bench/CMakeLists.txt @@ -1,5 +1,5 @@ set(TARGET llama-bench) add_executable(${TARGET} llama-bench.cpp) -install(TARGETS ${TARGET} RUNTIME) +install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .) 
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/tools/main/CMakeLists.txt b/tools/main/CMakeLists.txt index af3d9150f8640..f380fcae3c2c8 100644 --- a/tools/main/CMakeLists.txt +++ b/tools/main/CMakeLists.txt @@ -1,5 +1,5 @@ set(TARGET llama-cli) add_executable(${TARGET} main.cpp) -install(TARGETS ${TARGET} RUNTIME) +install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt index 4baa15b9609fc..4e46bfd3f55ba 100644 --- a/tools/mtmd/CMakeLists.txt +++ b/tools/mtmd/CMakeLists.txt @@ -2,6 +2,9 @@ find_package(Threads REQUIRED) +# ANE support option +option(ENABLE_ANE "Enable Apple Neural Engine support" OFF) + add_library(mtmd mtmd.cpp mtmd-audio.cpp @@ -13,13 +16,43 @@ add_library(mtmd mtmd-helper.h ) -target_link_libraries (mtmd PUBLIC ggml llama) +# Add ANE related files when enabled +if(ENABLE_ANE) + target_sources(mtmd PRIVATE + ane/ane.h + ane/ane.mm + ane/ane_minicpmv4_vit_f16.h + ane/ane_minicpmv4_vit_f16.m + ) + # Define compile-time macro for code guards + target_compile_definitions(mtmd PRIVATE ENABLE_ANE) + + # Enable ARC for Objective-C files + set_source_files_properties(ane/ane.mm PROPERTIES COMPILE_FLAGS "-fobjc-arc") + set_source_files_properties(ane/ane_minicpmv4_vit_f16.m PROPERTIES COMPILE_FLAGS "-fobjc-arc") +endif() + +target_link_libraries (mtmd PUBLIC ggml llama common) target_link_libraries (mtmd PRIVATE Threads::Threads) target_include_directories(mtmd PUBLIC .) target_include_directories(mtmd PRIVATE ../..) +target_include_directories(mtmd PRIVATE ../../common) +target_include_directories(mtmd PRIVATE ../../include) +target_include_directories(mtmd PRIVATE ../../ggml/include) +target_include_directories(mtmd PRIVATE ../../src) target_include_directories(mtmd PRIVATE ../../vendor) target_compile_features (mtmd PRIVATE cxx_std_17) +# Link CoreML and Accelerate frameworks when ANE is enabled +if(ENABLE_ANE) + target_link_libraries(mtmd PRIVATE + "-framework Foundation" + "-framework CoreML" + "-framework Accelerate" + "-ObjC" + ) +endif() + if (BUILD_SHARED_LIBS) set_target_properties (mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON) target_compile_definitions(mtmd PRIVATE LLAMA_BUILD) @@ -31,6 +64,13 @@ set(MTMD_PUBLIC_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/mtmd-helper.h ) +# Add ANE public headers when enabled +if(ENABLE_ANE) + list(APPEND MTMD_PUBLIC_HEADERS + ${CMAKE_CURRENT_SOURCE_DIR}/ane/ane.h + ) +endif() + set_target_properties(mtmd PROPERTIES PUBLIC_HEADER "${MTMD_PUBLIC_HEADERS}") @@ -55,6 +95,6 @@ add_executable(llama-qwen2vl-cli deprecation-warning.cpp) set(TARGET llama-mtmd-cli) add_executable (${TARGET} mtmd-cli.cpp) set_target_properties (${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli) -install (TARGETS ${TARGET} RUNTIME) +install (TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .) 
 target_link_libraries (${TARGET} PRIVATE common mtmd Threads::Threads)
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/tools/mtmd/ane/ane.h b/tools/mtmd/ane/ane.h
new file mode 100644
index 0000000000000..e129122de2f35
--- /dev/null
+++ b/tools/mtmd/ane/ane.h
@@ -0,0 +1,11 @@
+#if __cplusplus
+extern "C" {
+#endif
+
+const void* loadModel(const char* model_path);
+void closeModel(const void* model);
+void predictWith(const void* model, float* embed, float* encoderOutput);
+
+#if __cplusplus
+} // Extern C
+#endif
diff --git a/tools/mtmd/ane/ane.mm b/tools/mtmd/ane/ane.mm
new file mode 100644
index 0000000000000..6b3c538de1d49
--- /dev/null
+++ b/tools/mtmd/ane/ane.mm
@@ -0,0 +1,78 @@
+#import <Foundation/Foundation.h>
+#import <CoreML/CoreML.h>
+#import "ane.h"
+#import "ane_minicpmv4_vit_f16.h"
+#include <Accelerate/Accelerate.h>
+
+#if __cplusplus
+extern "C" {
+#endif
+
+const void* loadModel(const char* model_path) {
+    if (!model_path) {
+        NSLog(@"Error: model_path is null");
+        return nullptr;
+    }
+
+    NSString *pathString = [NSString stringWithUTF8String:model_path];
+
+    // Check if file exists
+    NSFileManager *fileManager = [NSFileManager defaultManager];
+    if (![fileManager fileExistsAtPath:pathString]) {
+        NSLog(@"Error: ANE model file does not exist at path: %@", pathString);
+        return nullptr;
+    }
+
+    // Check if it's a directory (for .mlmodelc packages)
+    BOOL isDirectory;
+    if ([fileManager fileExistsAtPath:pathString isDirectory:&isDirectory]) {
+        if (!isDirectory && ![pathString hasSuffix:@".mlmodelc"]) {
+            NSLog(@"Warning: ANE model path should typically be a .mlmodelc directory: %@", pathString);
+        }
+    }
+
+    NSURL *modelURL = [NSURL fileURLWithPath:pathString];
+
+    NSLog(@"Loading ANE model from: %@", modelURL.absoluteString);
+
+    NSError *error = nil;
+    const void* model = CFBridgingRetain([[ane_minicpmv4_vit_f16 alloc] initWithContentsOfURL:modelURL error:&error]);
+
+    if (error) {
+        NSLog(@"Error loading ANE model: %@", error.localizedDescription);
+        return nullptr;
+    }
+
+    if (!model) {
+        NSLog(@"Error: Failed to create ANE model instance");
+        return nullptr;
+    }
+
+    NSLog(@"Successfully loaded ANE model from: %@", pathString);
+    return model;
+}
+
+void predictWith(const void* model, float* embed, float* encoderOutput) {
+    MLMultiArray *inMultiArray = [[MLMultiArray alloc] initWithDataPointer: embed
+                                                                     shape: @[@1, @1024, @1152]
+                                                                  dataType: MLMultiArrayDataTypeFloat32
+                                                                   strides: @[@(1179648), @(1152), @1]
+                                                               deallocator: nil
+                                                                     error: nil];
+
+    ane_minicpmv4_vit_f16Output *modelOutput = [(__bridge id)model predictionFromInput:inMultiArray error:nil];
+
+    MLMultiArray *outMA = modelOutput.output;
+
+    cblas_scopy((int)outMA.count,
+                (float*)outMA.dataPointer, 1,
+                encoderOutput, 1);
+}
+
+void closeModel(const void* model) {
+    CFRelease(model);
+}
+
+#if __cplusplus
+} //Extern C
+#endif
diff --git a/tools/mtmd/ane/ane_minicpmv4_vit_f16.h b/tools/mtmd/ane/ane_minicpmv4_vit_f16.h
new file mode 100644
index 0000000000000..d6b9a29e857ab
--- /dev/null
+++ b/tools/mtmd/ane/ane_minicpmv4_vit_f16.h
@@ -0,0 +1,154 @@
+//
+// ane_minicpmv4_vit_f16.h
+//
+// This file was automatically generated and should not be edited.
+// + +#import +#import +#include +#include + +NS_ASSUME_NONNULL_BEGIN + +/// Model Prediction Input Type +API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden"))) +@interface ane_minicpmv4_vit_f16Input : NSObject + +/// input as 1 × 1024 × 1152 3-dimensional array of floats +@property (readwrite, nonatomic, strong) MLMultiArray * input; +- (instancetype)init NS_UNAVAILABLE; +- (instancetype)initWithInput:(MLMultiArray *)input NS_DESIGNATED_INITIALIZER; + +@end + +/// Model Prediction Output Type +API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden"))) +@interface ane_minicpmv4_vit_f16Output : NSObject + +/// output as 1 × 1024 × 1152 3-dimensional array of floats +@property (readwrite, nonatomic, strong) MLMultiArray * output; +- (instancetype)init NS_UNAVAILABLE; +- (instancetype)initWithOutput:(MLMultiArray *)output NS_DESIGNATED_INITIALIZER; + +@end + +/// Class for model loading and prediction +API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden"))) +@interface ane_minicpmv4_vit_f16 : NSObject +@property (readonly, nonatomic, nullable) MLModel * model; + +/** + URL of the underlying .mlmodelc directory. +*/ ++ (nullable NSURL *)URLOfModelInThisBundle; + +/** + Initialize ane_minicpmv4_vit_f16 instance from an existing MLModel object. + + Usually the application does not use this initializer unless it makes a subclass of ane_minicpmv4_vit_f16. + Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in. +*/ +- (instancetype)initWithMLModel:(MLModel *)model NS_DESIGNATED_INITIALIZER; + +/** + Initialize ane_minicpmv4_vit_f16 instance with the model in this bundle. +*/ +- (nullable instancetype)init; + +/** + Initialize ane_minicpmv4_vit_f16 instance with the model in this bundle. + + @param configuration The model configuration object + @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL. +*/ +- (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error; + +/** + Initialize ane_minicpmv4_vit_f16 instance from the model URL. + + @param modelURL URL to the .mlmodelc directory for ane_minicpmv4_vit_f16. + @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL. +*/ +- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error; + +/** + Initialize ane_minicpmv4_vit_f16 instance from the model URL. + + @param modelURL URL to the .mlmodelc directory for ane_minicpmv4_vit_f16. + @param configuration The model configuration object + @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL. +*/ +- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error; + +/** + Construct ane_minicpmv4_vit_f16 instance asynchronously with configuration. + Model loading may take time when the model content is not immediately available (e.g. encrypted model). 
Use this factory method especially when the caller is on the main thread. + + @param configuration The model configuration + @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid ane_minicpmv4_vit_f16 instance or NSError object. +*/ ++ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(ane_minicpmv4_vit_f16 * _Nullable model, NSError * _Nullable error))handler; + +/** + Construct ane_minicpmv4_vit_f16 instance asynchronously with URL of .mlmodelc directory and optional configuration. + + Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread. + + @param modelURL The model URL. + @param configuration The model configuration + @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid ane_minicpmv4_vit_f16 instance or NSError object. +*/ ++ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(ane_minicpmv4_vit_f16 * _Nullable model, NSError * _Nullable error))handler; + +/** + Make a prediction using the standard interface + @param input an instance of ane_minicpmv4_vit_f16Input to predict from + @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL. + @return the prediction as ane_minicpmv4_vit_f16Output +*/ +- (nullable ane_minicpmv4_vit_f16Output *)predictionFromFeatures:(ane_minicpmv4_vit_f16Input *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error; + +/** + Make a prediction using the standard interface + @param input an instance of ane_minicpmv4_vit_f16Input to predict from + @param options prediction options + @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL. + @return the prediction as ane_minicpmv4_vit_f16Output +*/ +- (nullable ane_minicpmv4_vit_f16Output *)predictionFromFeatures:(ane_minicpmv4_vit_f16Input *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error; + +/** + Make an asynchronous prediction using the standard interface + @param input an instance of ane_minicpmv4_vit_f16Input to predict from + @param completionHandler a block that will be called upon completion of the prediction. error will be nil if no error occurred. +*/ +- (void)predictionFromFeatures:(ane_minicpmv4_vit_f16Input *)input completionHandler:(void (^)(ane_minicpmv4_vit_f16Output * _Nullable output, NSError * _Nullable error))completionHandler API_AVAILABLE(macos(14.0), ios(17.0), watchos(10.0), tvos(17.0)) __attribute__((visibility("hidden"))); + +/** + Make an asynchronous prediction using the standard interface + @param input an instance of ane_minicpmv4_vit_f16Input to predict from + @param options prediction options + @param completionHandler a block that will be called upon completion of the prediction. error will be nil if no error occurred. 
+*/ +- (void)predictionFromFeatures:(ane_minicpmv4_vit_f16Input *)input options:(MLPredictionOptions *)options completionHandler:(void (^)(ane_minicpmv4_vit_f16Output * _Nullable output, NSError * _Nullable error))completionHandler API_AVAILABLE(macos(14.0), ios(17.0), watchos(10.0), tvos(17.0)) __attribute__((visibility("hidden"))); + +/** + Make a prediction using the convenience interface + @param input 1 × 1024 × 1152 3-dimensional array of floats + @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL. + @return the prediction as ane_minicpmv4_vit_f16Output +*/ +- (nullable ane_minicpmv4_vit_f16Output *)predictionFromInput:(MLMultiArray *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error; + +/** + Batch prediction + @param inputArray array of ane_minicpmv4_vit_f16Input instances to obtain predictions from + @param options prediction options + @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL. + @return the predictions as NSArray +*/ +- (nullable NSArray *)predictionsFromInputs:(NSArray *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error; +@end + +NS_ASSUME_NONNULL_END diff --git a/tools/mtmd/ane/ane_minicpmv4_vit_f16.m b/tools/mtmd/ane/ane_minicpmv4_vit_f16.m new file mode 100644 index 0000000000000..75337a7b5b150 --- /dev/null +++ b/tools/mtmd/ane/ane_minicpmv4_vit_f16.m @@ -0,0 +1,222 @@ +// +// ane_minicpmv4_vit_f16.m +// +// This file was automatically generated and should not be edited. +// + +#if !__has_feature(objc_arc) +#error This file must be compiled with automatic reference counting enabled (-fobjc-arc) +#endif + +#import "ane_minicpmv4_vit_f16.h" + +@implementation ane_minicpmv4_vit_f16Input + +- (instancetype)initWithInput:(MLMultiArray *)input { + self = [super init]; + if (self) { + _input = input; + } + return self; +} + +- (NSSet *)featureNames { + return [NSSet setWithArray:@[@"input"]]; +} + +- (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName { + if ([featureName isEqualToString:@"input"]) { + return [MLFeatureValue featureValueWithMultiArray:self.input]; + } + return nil; +} + +@end + +@implementation ane_minicpmv4_vit_f16Output + +- (instancetype)initWithOutput:(MLMultiArray *)output { + self = [super init]; + if (self) { + _output = output; + } + return self; +} + +- (NSSet *)featureNames { + return [NSSet setWithArray:@[@"output"]]; +} + +- (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName { + if ([featureName isEqualToString:@"output"]) { + return [MLFeatureValue featureValueWithMultiArray:self.output]; + } + return nil; +} + +@end + +@implementation ane_minicpmv4_vit_f16 + + +/** + URL of the underlying .mlmodelc directory. +*/ ++ (nullable NSURL *)URLOfModelInThisBundle { + NSString *assetPath = [[NSBundle bundleForClass:[self class]] pathForResource:@"ane_minicpmv4_vit_f16" ofType:@"mlmodelc"]; + if (nil == assetPath) { os_log_error(OS_LOG_DEFAULT, "Could not load ane_minicpmv4_vit_f16.mlmodelc in the bundle resource"); return nil; } + return [NSURL fileURLWithPath:assetPath]; +} + + +/** + Initialize ane_minicpmv4_vit_f16 instance from an existing MLModel object. + + Usually the application does not use this initializer unless it makes a subclass of ane_minicpmv4_vit_f16. 
+ Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in. +*/ +- (instancetype)initWithMLModel:(MLModel *)model { + if (model == nil) { + return nil; + } + self = [super init]; + if (self != nil) { + _model = model; + } + return self; +} + + +/** + Initialize ane_minicpmv4_vit_f16 instance with the model in this bundle. +*/ +- (nullable instancetype)init { + return [self initWithContentsOfURL:(NSURL * _Nonnull)self.class.URLOfModelInThisBundle error:nil]; +} + + +/** + Initialize ane_minicpmv4_vit_f16 instance with the model in this bundle. + + @param configuration The model configuration object + @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL. +*/ +- (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error { + return [self initWithContentsOfURL:(NSURL * _Nonnull)self.class.URLOfModelInThisBundle configuration:configuration error:error]; +} + + +/** + Initialize ane_minicpmv4_vit_f16 instance from the model URL. + + @param modelURL URL to the .mlmodelc directory for ane_minicpmv4_vit_f16. + @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL. +*/ +- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error { + MLModel *model = [MLModel modelWithContentsOfURL:modelURL error:error]; + if (model == nil) { return nil; } + return [self initWithMLModel:model]; +} + + +/** + Initialize ane_minicpmv4_vit_f16 instance from the model URL. + + @param modelURL URL to the .mlmodelc directory for ane_minicpmv4_vit_f16. + @param configuration The model configuration object + @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL. +*/ +- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error { + MLModel *model = [MLModel modelWithContentsOfURL:modelURL configuration:configuration error:error]; + if (model == nil) { return nil; } + return [self initWithMLModel:model]; +} + + +/** + Construct ane_minicpmv4_vit_f16 instance asynchronously with configuration. + Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread. + + @param configuration The model configuration + @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid ane_minicpmv4_vit_f16 instance or NSError object. +*/ ++ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(ane_minicpmv4_vit_f16 * _Nullable model, NSError * _Nullable error))handler { + [self loadContentsOfURL:(NSURL * _Nonnull)[self URLOfModelInThisBundle] + configuration:configuration + completionHandler:handler]; +} + + +/** + Construct ane_minicpmv4_vit_f16 instance asynchronously with URL of .mlmodelc directory and optional configuration. + + Model loading may take time when the model content is not immediately available (e.g. encrypted model). 
Use this factory method especially when the caller is on the main thread. + + @param modelURL The model URL. + @param configuration The model configuration + @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid ane_minicpmv4_vit_f16 instance or NSError object. +*/ ++ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(ane_minicpmv4_vit_f16 * _Nullable model, NSError * _Nullable error))handler { + [MLModel loadContentsOfURL:modelURL + configuration:configuration + completionHandler:^(MLModel *model, NSError *error) { + if (model != nil) { + ane_minicpmv4_vit_f16 *typedModel = [[ane_minicpmv4_vit_f16 alloc] initWithMLModel:model]; + handler(typedModel, nil); + } else { + handler(nil, error); + } + }]; +} + +- (nullable ane_minicpmv4_vit_f16Output *)predictionFromFeatures:(ane_minicpmv4_vit_f16Input *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error { + return [self predictionFromFeatures:input options:[[MLPredictionOptions alloc] init] error:error]; +} + +- (nullable ane_minicpmv4_vit_f16Output *)predictionFromFeatures:(ane_minicpmv4_vit_f16Input *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error { + id outFeatures = [self.model predictionFromFeatures:input options:options error:error]; + if (!outFeatures) { return nil; } + return [[ane_minicpmv4_vit_f16Output alloc] initWithOutput:(MLMultiArray *)[outFeatures featureValueForName:@"output"].multiArrayValue]; +} + +- (void)predictionFromFeatures:(ane_minicpmv4_vit_f16Input *)input completionHandler:(void (^)(ane_minicpmv4_vit_f16Output * _Nullable output, NSError * _Nullable error))completionHandler { + [self.model predictionFromFeatures:input completionHandler:^(id prediction, NSError *predictionError) { + if (prediction != nil) { + ane_minicpmv4_vit_f16Output *output = [[ane_minicpmv4_vit_f16Output alloc] initWithOutput:(MLMultiArray *)[prediction featureValueForName:@"output"].multiArrayValue]; + completionHandler(output, predictionError); + } else { + completionHandler(nil, predictionError); + } + }]; +} + +- (void)predictionFromFeatures:(ane_minicpmv4_vit_f16Input *)input options:(MLPredictionOptions *)options completionHandler:(void (^)(ane_minicpmv4_vit_f16Output * _Nullable output, NSError * _Nullable error))completionHandler { + [self.model predictionFromFeatures:input options:options completionHandler:^(id prediction, NSError *predictionError) { + if (prediction != nil) { + ane_minicpmv4_vit_f16Output *output = [[ane_minicpmv4_vit_f16Output alloc] initWithOutput:(MLMultiArray *)[prediction featureValueForName:@"output"].multiArrayValue]; + completionHandler(output, predictionError); + } else { + completionHandler(nil, predictionError); + } + }]; +} + +- (nullable ane_minicpmv4_vit_f16Output *)predictionFromInput:(MLMultiArray *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error { + ane_minicpmv4_vit_f16Input *input_ = [[ane_minicpmv4_vit_f16Input alloc] initWithInput:input]; + return [self predictionFromFeatures:input_ error:error]; +} + +- (nullable NSArray *)predictionsFromInputs:(NSArray *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error { + id inBatch = [[MLArrayBatchProvider alloc] initWithFeatureProviderArray:inputArray]; + id outBatch = [self.model predictionsFromBatch:inBatch options:options error:error]; + if (!outBatch) { return nil; 
} + NSMutableArray *results = [NSMutableArray arrayWithCapacity:(NSUInteger)outBatch.count]; + for (NSInteger i = 0; i < outBatch.count; i++) { + id resultProvider = [outBatch featuresAtIndex:i]; + ane_minicpmv4_vit_f16Output * result = [[ane_minicpmv4_vit_f16Output alloc] initWithOutput:(MLMultiArray *)[resultProvider featureValueForName:@"output"].multiArrayValue]; + [results addObject:result]; + } + return results; +} + +@end diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index fdaf9738e88cb..8091f53bdcbfe 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -10,6 +10,9 @@ #include "ggml-alloc.h" #include "ggml-backend.h" #include "gguf.h" +#if defined(ENABLE_ANE) +#include "ane/ane.h" +#endif #include #include @@ -388,6 +391,9 @@ struct clip_ctx { // for debugging bool debug_graph = false; std::vector debug_print_tensors; + + // ANE model path for iOS + std::string ane_model_path; clip_ctx(clip_context_params & ctx_params) { debug_graph = std::getenv("MTMD_DEBUG_GRAPH") != nullptr; @@ -867,8 +873,118 @@ struct clip_graph { int n_embd = clip_n_mmproj_embd(ctx); const int d_head = 128; int n_head = n_embd/d_head; + // Use actual config value if available, otherwise fall back to hardcoded values int num_query = ctx->model.hparams.minicpmv_query_num; + + ggml_tensor * Q = ggml_add(ctx0, + ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), + model.mm_model_attn_q_b); + ggml_tensor * K = ggml_add(ctx0, + ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), + model.mm_model_attn_k_b); + ggml_tensor * V = ggml_add(ctx0, + ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), + model.mm_model_attn_v_b); + + Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_query); + K = ggml_reshape_3d(ctx0, K, d_head, n_head, n_pos); + V = ggml_reshape_3d(ctx0, V, d_head, n_head, n_pos); + + cb(Q, "resampler_Q", -1); + cb(K, "resampler_K", -1); + cb(V, "resampler_V", -1); + + embeddings = build_attn( + model.mm_model_attn_o_w, + model.mm_model_attn_o_b, + Q, K, V, nullptr, kq_scale, -1); + cb(embeddings, "resampler_attn_out", -1); + } + // layernorm + embeddings = build_norm(embeddings, model.mm_model_ln_post_w, model.mm_model_ln_post_b, NORM_TYPE_NORMAL, eps, -1); + + // projection + embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings); + + // build the graph + ggml_build_forward_expand(gf, embeddings); + + return gf; + } + + ggml_cgraph * build_minicpmv_embedding() { + const int batch_size = 1; + + GGML_ASSERT(model.class_embedding == nullptr); + const int n_pos = n_patches; + + // for selecting learned pos embd, used by ViT + struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos); + ggml_set_name(positions, "positions"); + ggml_set_input(positions); + + ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions); + + ggml_tensor * inp = build_inp(); + if (learned_pos_embd) { + inp = ggml_add(ctx0, inp, learned_pos_embd); + cb(inp, "pos_embed", -1); + } + ggml_tensor * embeddings = inp; + + // pre-layernorm + if (model.pre_ln_w) { + embeddings = ggml_norm(ctx0, embeddings, eps); + ggml_set_name(embeddings, "pre_ln"); + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b); + } + + ggml_build_forward_expand(gf, embeddings); + return gf; + } + + ggml_cgraph * build_minicpmv_resampler() { + const int batch_size = 1; + + GGML_ASSERT(model.class_embedding == nullptr); + const int n_pos = n_patches; + + const int image_size_width = img.nx; + const int image_size_height = img.ny; + const int patch_size = 
hparams.patch_size; + const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); + + // position embeddings for the projector (not for ViT) + int n_output_dim = clip_n_mmproj_embd(ctx); + ggml_tensor * pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_output_dim, n_pos, batch_size); + ggml_set_name(pos_embed, "pos_embed"); + ggml_set_input(pos_embed); + + struct ggml_tensor * embeddings = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1152, num_patches); + ggml_set_name(embeddings, "embeddings"); + ggml_set_input(embeddings); + + // resampler projector (it is just another transformer) + + ggml_tensor * q = model.mm_model_query; + ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings); + + // norm + q = build_norm(q, model.mm_model_ln_q_w, model.mm_model_ln_q_b, NORM_TYPE_NORMAL, eps, -1); + v = build_norm(v, model.mm_model_ln_kv_w, model.mm_model_ln_kv_b, NORM_TYPE_NORMAL, eps, -1); + + // k = v + pos_embed + ggml_tensor * k = ggml_add(ctx0, v, pos_embed); + + // attention + { + int n_embd = clip_n_mmproj_embd(ctx); + const int d_head = 128; + int n_head = n_embd/d_head; + // Use actual config value if available, otherwise fall back to hardcoded values + int num_query = ctx->model.hparams.minicpmv_query_num; + ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b); @@ -3095,9 +3211,10 @@ struct llava_uhd { const int original_width = original_size.width; const int original_height = original_size.height; - const bool has_slices = original_size.width > slice_size || original_size.height > slice_size; + bool has_slices = original_size.width > slice_size || original_size.height > slice_size; const bool has_pinpoints = !ctx->model.hparams.image_res_candidates.empty(); + // has_slices = false; if (!has_slices) { // skip slicing logic res.overview_size = clip_image_size{slice_size, slice_size}; @@ -3723,15 +3840,362 @@ static std::vector> get_2d_sincos_pos_embed(int embed_dim, co return pos_embed_2d; } +#if defined(ENABLE_ANE) +static bool clip_image_encode_ane(float * data, float * vec, const char* ane_model_path) { + + static int flag = 0; + static const void* coremlEncoder = NULL; + static std::string cached_model_path = ""; + + // Check if we need to load a new model + if (flag == 0 || (ane_model_path && cached_model_path != ane_model_path)) { + if (coremlEncoder) { + closeModel(coremlEncoder); + } + coremlEncoder = loadModel(ane_model_path); + if (!coremlEncoder) { + printf("Failed to load ANE model from: %s\n", ane_model_path ? ane_model_path : "null"); + return false; + } + cached_model_path = ane_model_path ? 
ane_model_path : ""; + flag = 1; + } + predictWith(coremlEncoder, data, vec); + return true; +} +#endif + bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) { clip_image_f32_batch imgs; clip_image_f32_ptr img_copy(clip_image_f32_init()); *img_copy = *img; imgs.entries.push_back(std::move(img_copy)); +#if defined(ENABLE_ANE) + bool ios_ctx = true; + if (ios_ctx){ + printf("clip use ane\n"); + float * vit_embedding1 = (float *)malloc(1100*1152*sizeof(float)); + float * vit_embedding2 = (float *)malloc(1100*1152*sizeof(float)); + + ane_embedding(ctx, n_threads, &imgs, vit_embedding1); + clip_image_encode_ane(vit_embedding1, vit_embedding2, ctx->ane_model_path.c_str()); + ane_resampler(ctx, n_threads, &imgs, vit_embedding2, vec); + free(vit_embedding1); + free(vit_embedding2); + return true; + } +#endif + return clip_image_batch_encode(ctx, n_threads, &imgs, vec); } +#if defined(ENABLE_ANE) +static bool ane_embedding(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) { + const clip_image_f32_batch & imgs = *imgs_c_ptr; + int batch_size = imgs.entries.size(); + + // TODO @ngxson : implement batch size > 1 as a loop + // we don't need true batching support because the cgraph will gonna be big anyway + if (batch_size != 1) { + return false; // only support batch size of 1 + } + + // build the inference graph + ctx->debug_print_tensors.clear(); + ggml_backend_sched_reset(ctx->sched.get()); + GGML_ASSERT(imgs.entries.size() == 1 && "n_batch > 1 is not supported"); + clip_graph graph(ctx, *imgs.entries[0]); + ggml_cgraph * gf; + gf = graph.build_minicpmv_embedding(); + ggml_backend_sched_alloc_graph(ctx->sched.get(), gf); + + // set inputs + const auto & model = ctx->model; + const auto & hparams = model.hparams; + + const int image_size_width = imgs.entries[0]->nx; + const int image_size_height = imgs.entries[0]->ny; + + const int patch_size = hparams.patch_size; + const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); + const int n_pos = num_patches + (model.class_embedding ? 
1 : 0); + const int pos_w = image_size_width / patch_size; + const int pos_h = image_size_height / patch_size; + + auto get_inp_tensor = [&gf](const char * name) { + ggml_tensor * inp = ggml_graph_get_tensor(gf, name); + if (inp == nullptr) { + GGML_ABORT("Failed to get tensor %s", name); + } + if (!(inp->flags & GGML_TENSOR_FLAG_INPUT)) { + GGML_ABORT("Tensor %s is not an input tensor", name); + } + return inp; + }; + + auto set_input_f32 = [&get_inp_tensor](const char * name, std::vector & values) { + ggml_tensor * cur = get_inp_tensor(name); + GGML_ASSERT(cur->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size()); + ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur)); + }; + + auto set_input_i32 = [&get_inp_tensor](const char * name, std::vector & values) { + ggml_tensor * cur = get_inp_tensor(name); + GGML_ASSERT(cur->type == GGML_TYPE_I32); + GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size()); + ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur)); + }; + // set input pixel values + if (!imgs.is_audio) { + size_t nelem = 0; + for (const auto & img : imgs.entries) { + nelem += img->nx * img->ny * 3; + } + std::vector inp_raw(nelem); + + // layout of data (note: the channel dim is unrolled to better visualize the layout): + // + // ┌──W──┐ + // │ H │ channel = R + // ├─────┤ │ + // │ H │ channel = G + // ├─────┤ │ + // │ H │ channel = B + // └─────┘ │ + // ──────┘ x B + + for (size_t i = 0; i < imgs.entries.size(); i++) { + const int nx = imgs.entries[i]->nx; + const int ny = imgs.entries[i]->ny; + const int n = nx * ny; + + for (int b = 0; b < batch_size; b++) { + float * batch_entry = inp_raw.data() + b * (3*n); + for (int y = 0; y < ny; y++) { + for (int x = 0; x < nx; x++) { + size_t base_src = 3*(y * nx + x); // idx of the first channel + size_t base_dst = y * nx + x; // idx of the first channel + batch_entry[ base_dst] = imgs.entries[b]->buf[base_src ]; + batch_entry[1*n + base_dst] = imgs.entries[b]->buf[base_src + 1]; + batch_entry[2*n + base_dst] = imgs.entries[b]->buf[base_src + 2]; + } + } + } + } + set_input_f32("inp_raw", inp_raw); + + } else { + // audio input + GGML_ASSERT(imgs.entries.size() == 1); + const auto & mel_inp = imgs.entries[0]; + const int n_step = mel_inp->nx; + const int n_mel = mel_inp->ny; + std::vector inp_raw(n_step * n_mel); + std::memcpy(inp_raw.data(), mel_inp->buf.data(), n_step * n_mel * sizeof(float)); + set_input_f32("inp_raw", inp_raw); + } + + switch (ctx->model.proj_type) { + case PROJECTOR_TYPE_MINICPMV: + { + // inspired from siglip: + // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit + // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316 + std::vector positions(pos_h * pos_w); + int bucket_coords_h[1024]; + int bucket_coords_w[1024]; + for (int i = 0; i < pos_h; i++){ + bucket_coords_h[i] = std::floor(70.0*i/pos_h); + } + for (int i = 0; i < pos_w; i++){ + bucket_coords_w[i] = std::floor(70.0*i/pos_w); + } + for (int i = 0, id = 0; i < pos_h; i++){ + for (int j = 0; j < pos_w; j++){ + positions[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j]; + } + } + set_input_i32("positions", positions); + } break; + default: + GGML_ABORT("Unknown projector type"); + } + + // ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads); + ggml_backend_dev_t dev = ggml_backend_get_device(ctx->backend_cpu); + ggml_backend_reg_t reg = dev ? 
ggml_backend_dev_backend_reg(dev) : nullptr; + if (reg) { + auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"); + if (ggml_backend_set_n_threads_fn) { + ggml_backend_set_n_threads_fn(ctx->backend_cpu, n_threads); + } + } + + auto status = ggml_backend_sched_graph_compute(ctx->sched.get(), gf); + if (status != GGML_STATUS_SUCCESS) { + LOG_ERR("%s: ggml_backend_sched_graph_compute failed with error %d\n", __func__, status); + return false; + } + + // print debug nodes + if (ctx->debug_graph) { + LOG_INF("\n\n---\n\n"); + LOG_INF("\n\nDebug graph:\n\n"); + for (ggml_tensor * t : ctx->debug_print_tensors) { + std::vector data(ggml_nbytes(t)); + ggml_backend_tensor_get(t, data.data(), 0, ggml_nbytes(t)); + print_tensor_shape(t); + print_tensor_data(t, data.data(), 3); + } + } + + // the last node is the embedding tensor + ggml_tensor * embeddings = ggml_graph_node(gf, -1); + + // sanity check (only support batch size of 1 for now) + const int n_tokens_out = embeddings->ne[1]; + + // copy the embeddings to the location passed by the user + ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings)); + + return true; +} + +static bool ane_resampler(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, const float * vit_embedding, float * vec) { + const clip_image_f32_batch & imgs = *imgs_c_ptr; + int batch_size = imgs.entries.size(); + + // TODO @ngxson : implement batch size > 1 as a loop + // we don't need true batching support because the cgraph will gonna be big anyway + if (batch_size != 1) { + return false; // only support batch size of 1 + } + + // build the inference graph + ctx->debug_print_tensors.clear(); + ggml_backend_sched_reset(ctx->sched.get()); + GGML_ASSERT(imgs.entries.size() == 1 && "n_batch > 1 is not supported"); + clip_graph graph(ctx, *imgs.entries[0]); + ggml_cgraph * gf; + gf = graph.build_minicpmv_resampler(); + ggml_backend_sched_alloc_graph(ctx->sched.get(), gf); + + // set inputs + const auto & model = ctx->model; + const auto & hparams = model.hparams; + + const int image_size_width = imgs.entries[0]->nx; + const int image_size_height = imgs.entries[0]->ny; + + const int patch_size = hparams.patch_size; + const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); + const int n_pos = num_patches + (model.class_embedding ? 
1 : 0); + const int pos_w = image_size_width / patch_size; + const int pos_h = image_size_height / patch_size; + + auto get_inp_tensor = [&gf](const char * name) { + ggml_tensor * inp = ggml_graph_get_tensor(gf, name); + if (inp == nullptr) { + GGML_ABORT("Failed to get tensor %s", name); + } + if (!(inp->flags & GGML_TENSOR_FLAG_INPUT)) { + GGML_ABORT("Tensor %s is not an input tensor", name); + } + return inp; + }; + + auto set_input_f32 = [&get_inp_tensor](const char * name, std::vector & values) { + ggml_tensor * cur = get_inp_tensor(name); + GGML_ASSERT(cur->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size()); + ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur)); + }; + + auto set_input_i32 = [&get_inp_tensor](const char * name, std::vector & values) { + ggml_tensor * cur = get_inp_tensor(name); + GGML_ASSERT(cur->type == GGML_TYPE_I32); + GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size()); + ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur)); + }; + + { + struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings"); + ggml_backend_tensor_set(embeddings, vit_embedding, 0, ggml_nbytes(embeddings)); + + } + + switch (ctx->model.proj_type) { + case PROJECTOR_TYPE_MINICPMV: + { + // inspired from resampler of Qwen-VL: + // -> https://huggingface.co/Qwen/Qwen-VL/tree/main + // -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23 + int embed_dim = clip_n_mmproj_embd(ctx); + + // TODO @ngxson : this is very inefficient, can we do this using ggml_sin and ggml_cos? + auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h)); + + std::vector pos_embed(embed_dim * pos_w * pos_h); + for(int i = 0; i < pos_w * pos_h; ++i){ + for(int j = 0; j < embed_dim; ++j){ + pos_embed[i * embed_dim + j] = pos_embed_t[i][j]; + } + } + + set_input_f32("pos_embed", pos_embed); + } break; + default: + GGML_ABORT("Unknown projector type"); + } + + // ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads); + ggml_backend_dev_t dev = ggml_backend_get_device(ctx->backend_cpu); + ggml_backend_reg_t reg = dev ? 
ggml_backend_dev_backend_reg(dev) : nullptr; + if (reg) { + auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"); + if (ggml_backend_set_n_threads_fn) { + ggml_backend_set_n_threads_fn(ctx->backend_cpu, n_threads); + } + } + + auto status = ggml_backend_sched_graph_compute(ctx->sched.get(), gf); + if (status != GGML_STATUS_SUCCESS) { + LOG_ERR("%s: ggml_backend_sched_graph_compute failed with error %d\n", __func__, status); + return false; + } + + // print debug nodes + if (ctx->debug_graph) { + LOG_INF("\n\n---\n\n"); + LOG_INF("\n\nDebug graph:\n\n"); + for (ggml_tensor * t : ctx->debug_print_tensors) { + std::vector data(ggml_nbytes(t)); + ggml_backend_tensor_get(t, data.data(), 0, ggml_nbytes(t)); + print_tensor_shape(t); + print_tensor_data(t, data.data(), 3); + } + } + + // the last node is the embedding tensor + ggml_tensor * embeddings = ggml_graph_node(gf, -1); + + // sanity check (only support batch size of 1 for now) + const int n_tokens_out = embeddings->ne[1]; + const int expected_n_tokens_out = clip_n_output_tokens(ctx, imgs.entries[0].get()); + if (n_tokens_out != expected_n_tokens_out) { + LOG_ERR("%s: expected output %d tokens, got %d\n", __func__, expected_n_tokens_out, n_tokens_out); + GGML_ABORT("Invalid number of output tokens"); + } + + // copy the embeddings to the location passed by the user + ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings)); + + return true; +} +#endif + bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) { const clip_image_f32_batch & imgs = *imgs_c_ptr; int batch_size = imgs.entries.size(); @@ -4209,3 +4673,9 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel batch->entries.push_back(clip_image_f32_ptr(audio)); batch->is_audio = true; } + +void clip_set_ane_model_path(struct clip_ctx * ctx, const char * ane_model_path) { + if (ctx && ane_model_path) { + ctx->ane_model_path = ane_model_path; + } +} diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h index 08f3efb7b1daf..f5524f2e33648 100644 --- a/tools/mtmd/clip.h +++ b/tools/mtmd/clip.h @@ -109,3 +109,6 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel bool clip_has_vision_encoder(const struct clip_ctx * ctx); bool clip_has_audio_encoder(const struct clip_ctx * ctx); bool clip_has_whisper_encoder(const struct clip_ctx * ctx); + +// ANE support functions +void clip_set_ane_model_path(struct clip_ctx * ctx, const char * ane_model_path); diff --git a/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py b/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py index 4dda60a21164b..be982a2d48909 100644 --- a/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py +++ b/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py @@ -824,13 +824,17 @@ def _replace_name(s, v): if re.match("vision_model.embeddings.position_embedding", s): v = v.unsqueeze(0) return {s: v} - - return {s: v} + print(s) + if "emb" in s: + return {s: v} + return None state_dict = model.state_dict() new_state_dict = {} for k, v in state_dict.items(): kvs = _replace_name(k, v) + if kvs is None: + continue for nk, nv in kvs.items(): new_state_dict[nk] = nv state_dict = new_state_dict diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp index 599e682e0f894..b2bc16b450116 100644 --- a/tools/mtmd/mtmd-cli.cpp +++ b/tools/mtmd/mtmd-cli.cpp @@ 
-132,6 +132,7 @@ struct mtmd_cli_context {
         mparams.print_timings = true;
         mparams.n_threads = params.cpuparams.n_threads;
         mparams.verbosity = params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO;
+        mparams.ane_model_path = params.ane.path.empty() ? nullptr : params.ane.path.c_str();
         ctx_vision.reset(mtmd_init_from_file(clip_path, model, mparams));
         if (!ctx_vision.get()) {
             LOG_ERR("Failed to load vision model from %s\n", clip_path);
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index a05373d5b3ca5..9b3f2292bf486 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -10,6 +10,7 @@
 #include
 #include
 #include
+#include <fstream>
 #include
 #include
@@ -91,6 +92,7 @@ mtmd_context_params mtmd_context_params_default() {
     params.verbosity = GGML_LOG_LEVEL_INFO;
     params.image_marker = MTMD_DEFAULT_IMAGE_MARKER;
     params.media_marker = mtmd_default_marker();
+    params.ane_model_path = nullptr;
     return params;
 }
@@ -155,6 +157,19 @@ struct mtmd_context {
         auto res = clip_init(mmproj_fname, ctx_clip_params);
         ctx_v = res.ctx_v;
         ctx_a = res.ctx_a;
+
+        // Set ANE model path for iOS
+        if (ctx_params.ane_model_path && ctx_v) {
+            // Check if ANE model file exists
+            std::ifstream ane_file(ctx_params.ane_model_path);
+            if (!ane_file.good()) {
+                throw std::runtime_error(string_format("ANE model file does not exist: %s", ctx_params.ane_model_path));
+            }
+            ane_file.close();
+
+            clip_set_ane_model_path(ctx_v, ctx_params.ane_model_path);
+            LOG_INF("ANE model path set to: %s\n", ctx_params.ane_model_path);
+        }
         if (!ctx_v && !ctx_a) {
             throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));
         }
diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h
index f4ea07d3ad521..331992e76f43e 100644
--- a/tools/mtmd/mtmd.h
+++ b/tools/mtmd/mtmd.h
@@ -82,6 +82,7 @@ struct mtmd_context_params {
     enum ggml_log_level verbosity;
     const char * image_marker; // deprecated, use media_marker instead
     const char * media_marker;
+    const char * ane_model_path; // path to ANE model for iOS
 };

 MTMD_API const char * mtmd_default_marker(void);
diff --git a/tools/perplexity/CMakeLists.txt b/tools/perplexity/CMakeLists.txt
index 3e68640933afb..ed0825d8eda69 100644
--- a/tools/perplexity/CMakeLists.txt
+++ b/tools/perplexity/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-perplexity)
 add_executable(${TARGET} perplexity.cpp)
-install(TARGETS ${TARGET} RUNTIME)
+install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/tools/quantize/CMakeLists.txt b/tools/quantize/CMakeLists.txt
index 47e5cbe30cfe3..a5575124aef20 100644
--- a/tools/quantize/CMakeLists.txt
+++ b/tools/quantize/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(TARGET llama-quantize)
 add_executable(${TARGET} quantize.cpp)
-install(TARGETS ${TARGET} RUNTIME)
+install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_include_directories(${TARGET} PRIVATE ../../common)
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/tools/run/CMakeLists.txt b/tools/run/CMakeLists.txt
index d0189596980eb..e351f1a1f30be 100644
--- a/tools/run/CMakeLists.txt
+++ b/tools/run/CMakeLists.txt
@@ -10,6 +10,6 @@ if (LLAMA_CURL)
     set(LLAMA_RUN_EXTRA_LIBS ${LLAMA_RUN_EXTRA_LIBS} ${CURL_LIBRARIES})
 endif ()

-install(TARGETS ${TARGET} RUNTIME)
+install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT} ${LLAMA_RUN_EXTRA_LIBS}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/tools/tokenize/CMakeLists.txt b/tools/tokenize/CMakeLists.txt index 1690b53e5d52b..f9dcd270d5f05 100644 --- a/tools/tokenize/CMakeLists.txt +++ b/tools/tokenize/CMakeLists.txt @@ -1,5 +1,5 @@ set(TARGET llama-tokenize) add_executable(${TARGET} tokenize.cpp) -install(TARGETS ${TARGET} RUNTIME) +install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/tools/tts/CMakeLists.txt b/tools/tts/CMakeLists.txt index c72bd814c3b31..da50e9bf848a8 100644 --- a/tools/tts/CMakeLists.txt +++ b/tools/tts/CMakeLists.txt @@ -1,5 +1,5 @@ set(TARGET llama-tts) add_executable(${TARGET} tts.cpp) -install(TARGETS ${TARGET} RUNTIME) +install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .) target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17)
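
Usage sketch (not part of the patch; the .gguf and .mlmodelc file names below are placeholders): with the ENABLE_ANE option and the --ane flag introduced above, building and running the multimodal CLI with the Core ML vision encoder would look roughly like this:

    # configure and build the mtmd tools with Apple Neural Engine support enabled
    cmake -B build -DENABLE_ANE=ON
    cmake --build build --config Release --target llama-mtmd-cli

    # --ane points at the compiled Core ML model (a .mlmodelc directory);
    # model file names here are placeholders for a MiniCPM-V 4 conversion
    ./build/bin/llama-mtmd-cli \
        -m minicpmv4-model.gguf \
        --mmproj minicpmv4-mmproj.gguf \
        --ane ane_minicpmv4_vit_f16.mlmodelc \
        --image test.jpg -p "Describe this image."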