Apple NPU acceleration integrated into llama.cpp, using MiniCPM-V 4.0 as an example. #15262

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open · wants to merge 19 commits into master
13 changes: 11 additions & 2 deletions build-xcframework.sh
@@ -8,7 +8,7 @@ TVOS_MIN_OS_VERSION=16.4
 
 BUILD_SHARED_LIBS=OFF
 LLAMA_BUILD_EXAMPLES=OFF
-LLAMA_BUILD_TOOLS=OFF
+LLAMA_BUILD_TOOLS=ON
 LLAMA_BUILD_TESTS=OFF
 LLAMA_BUILD_SERVER=OFF
 GGML_METAL=ON
@@ -124,6 +124,10 @@ setup_framework_structure() {
     cp ggml/include/ggml-cpu.h ${header_path}
     cp ggml/include/ggml-blas.h ${header_path}
     cp ggml/include/gguf.h ${header_path}
+    # Copy mtmd-ios headers and dependencies
+    cp tools/mtmd/mtmd-ios.h ${header_path}
+    cp tools/mtmd/mtmd.h ${header_path}
+    cp tools/mtmd/mtmd-helper.h ${header_path}
 
     # Create module map (common for all platforms)
     cat > ${module_path}module.modulemap << EOF
@@ -136,6 +140,9 @@ framework module llama {
     header "ggml-cpu.h"
     header "ggml-blas.h"
     header "gguf.h"
+    header "mtmd-ios.h"
+    header "mtmd.h"
+    header "mtmd-helper.h"
     link "c++"
     link framework "Accelerate"
@@ -252,6 +259,8 @@ combine_static_libraries() {
         "${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml-cpu.a"
         "${base_dir}/${build_dir}/ggml/src/ggml-metal/${release_dir}/libggml-metal.a"
         "${base_dir}/${build_dir}/ggml/src/ggml-blas/${release_dir}/libggml-blas.a"
+        "${base_dir}/${build_dir}/common/${release_dir}/libcommon.a"
+        "${base_dir}/${build_dir}/tools/mtmd/${release_dir}/libmtmd.a"
     )
 
     # Create temporary directory for processing
@@ -327,7 +336,7 @@ combine_static_libraries() {
         $arch_flags \
         $min_version_flag \
         -Wl,-force_load,"${temp_dir}/combined.a" \
-        -framework Foundation -framework Metal -framework Accelerate \
+        -framework Foundation -framework Metal -framework Accelerate -framework CoreML \
         -install_name "$install_name" \
         -o "${base_dir}/${output_lib}"
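Net effect of the build-xcframework.sh changes: the generated llama.xcframework now exposes the mtmd headers (including mtmd-ios.h) through its module map, folds libcommon.a and libmtmd.a into the combined static library, and links CoreML so the ANE code path resolves at link time for apps consuming the framework.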
8 changes: 8 additions & 0 deletions common/arg.cpp
@@ -961,6 +961,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
     for (auto & ex : mmproj_examples) {
         if (ctx_arg.ex == ex) {
             common_params_handle_model(params.mmproj, params.hf_token, "", params.offline);
+            common_params_handle_model(params.ane, params.hf_token, "", params.offline);
             break;
         }
     }
@@ -2263,6 +2264,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.mmproj_use_gpu = false;
         }
     ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD"));
+    add_opt(common_arg(
+        {"--ane"}, "FILE",
+        "path to Apple Neural Engine model file for iOS",
+        [](common_params & params, const std::string & value) {
+            params.ane.path = value;
+        }
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_ANE"));
     add_opt(common_arg(
         {"--image", "--audio"}, "FILE",
         "path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n",
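Usage sketch for the new flag (the file names below are illustrative placeholders, not artifacts shipped with this PR): the ANE-compiled vision encoder is passed alongside the usual GGUF weights and mmproj projector, e.g.

    llama-mtmd-cli -m MiniCPM-V-4-Q4_K_M.gguf \
        --mmproj mmproj-MiniCPM-V-4-f16.gguf \
        --ane minicpmv4_vit_f16.mlmodelc \
        --image demo.jpg -p "Describe this image."

Since the option is registered with set_env("LLAMA_ARG_ANE"), the model path can also be supplied through that environment variable.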
3 changes: 3 additions & 0 deletions common/common.h
@@ -375,6 +375,9 @@ struct common_params {
     bool mmproj_use_gpu = true;     // use GPU for multimodal model
     bool no_mmproj = false;         // explicitly disable multimodal model
     std::vector<std::string> image; // path to image file(s)
 
+    // Apple Neural Engine support
+    struct common_params_model ane;
+
     // embedding
     bool embedding = false; // get only sentence embedding
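Reusing the existing common_params_model struct for the ANE path means --ane flows through the same common_params_handle_model plumbing as --mmproj (see the arg.cpp hunk above), so path and offline handling stay consistent across the two options.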
2 changes: 1 addition & 1 deletion tools/batched-bench/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-batched-bench)
 add_executable(${TARGET} batched-bench.cpp)
-install(TARGETS ${TARGET} RUNTIME)
+install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
2 changes: 1 addition & 1 deletion tools/cvector-generator/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-cvector-generator)
 add_executable(${TARGET} cvector-generator.cpp pca.hpp)
-install(TARGETS ${TARGET} RUNTIME)
+install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
2 changes: 1 addition & 1 deletion tools/export-lora/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-export-lora)
 add_executable(${TARGET} export-lora.cpp)
-install(TARGETS ${TARGET} RUNTIME)
+install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
2 changes: 1 addition & 1 deletion tools/gguf-split/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-gguf-split)
 add_executable(${TARGET} gguf-split.cpp)
-install(TARGETS ${TARGET} RUNTIME)
+install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
2 changes: 1 addition & 1 deletion tools/imatrix/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-imatrix)
 add_executable(${TARGET} imatrix.cpp)
-install(TARGETS ${TARGET} RUNTIME)
+install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
2 changes: 1 addition & 1 deletion tools/llama-bench/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-bench)
 add_executable(${TARGET} llama-bench.cpp)
-install(TARGETS ${TARGET} RUNTIME)
+install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
2 changes: 1 addition & 1 deletion tools/main/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-cli)
 add_executable(${TARGET} main.cpp)
-install(TARGETS ${TARGET} RUNTIME)
+install(TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
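Note on the repeated install() edits above: when an executable target is built as an app bundle (MACOSX_BUNDLE, as on iOS), CMake refuses to install it unless the install(TARGETS ...) call provides a BUNDLE DESTINATION. Adding BUNDLE DESTINATION . keeps these tools installable in both desktop and iOS builds.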
44 changes: 42 additions & 2 deletions tools/mtmd/CMakeLists.txt
@@ -2,6 +2,9 @@
 
 find_package(Threads REQUIRED)
 
+# ANE support option
+option(ENABLE_ANE "Enable Apple Neural Engine support" OFF)
+
 add_library(mtmd
     mtmd.cpp
     mtmd-audio.cpp
@@ -13,13 +16,43 @@ add_library(mtmd
     mtmd-helper.h
     )
 
-target_link_libraries (mtmd PUBLIC ggml llama)
+# Add ANE related files when enabled
+if(ENABLE_ANE)
+    target_sources(mtmd PRIVATE
+        ane/ane.h
+        ane/ane.mm
+        ane/ane_minicpmv4_vit_f16.h
+        ane/ane_minicpmv4_vit_f16.m
+    )
+    # Define compile-time macro for code guards
+    target_compile_definitions(mtmd PRIVATE ENABLE_ANE)
+
+    # Enable ARC for Objective-C files
+    set_source_files_properties(ane/ane.mm PROPERTIES COMPILE_FLAGS "-fobjc-arc")
+    set_source_files_properties(ane/ane_minicpmv4_vit_f16.m PROPERTIES COMPILE_FLAGS "-fobjc-arc")
+endif()
+
+target_link_libraries (mtmd PUBLIC ggml llama common)
 target_link_libraries (mtmd PRIVATE Threads::Threads)
 target_include_directories(mtmd PUBLIC .)
 target_include_directories(mtmd PRIVATE ../..)
+target_include_directories(mtmd PRIVATE ../../common)
+target_include_directories(mtmd PRIVATE ../../include)
+target_include_directories(mtmd PRIVATE ../../ggml/include)
+target_include_directories(mtmd PRIVATE ../../src)
 target_include_directories(mtmd PRIVATE ../../vendor)
 target_compile_features (mtmd PRIVATE cxx_std_17)
 
+# Link CoreML and Accelerate frameworks when ANE is enabled
+if(ENABLE_ANE)
+    target_link_libraries(mtmd PRIVATE
+        "-framework Foundation"
+        "-framework CoreML"
+        "-framework Accelerate"
+        "-ObjC"
+    )
+endif()
+
 if (BUILD_SHARED_LIBS)
     set_target_properties (mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON)
     target_compile_definitions(mtmd PRIVATE LLAMA_BUILD)
@@ -31,6 +64,13 @@ set(MTMD_PUBLIC_HEADERS
     ${CMAKE_CURRENT_SOURCE_DIR}/mtmd-helper.h
     )
 
+# Add ANE public headers when enabled
+if(ENABLE_ANE)
+    list(APPEND MTMD_PUBLIC_HEADERS
+        ${CMAKE_CURRENT_SOURCE_DIR}/ane/ane.h
+    )
+endif()
+
 set_target_properties(mtmd
     PROPERTIES
     PUBLIC_HEADER "${MTMD_PUBLIC_HEADERS}")
@@ -55,6 +95,6 @@ add_executable(llama-qwen2vl-cli deprecation-warning.cpp)
 set(TARGET llama-mtmd-cli)
 add_executable (${TARGET} mtmd-cli.cpp)
 set_target_properties (${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli)
-install (TARGETS ${TARGET} RUNTIME)
+install (TARGETS ${TARGET} RUNTIME BUNDLE DESTINATION .)
 target_link_libraries (${TARGET} PRIVATE common mtmd Threads::Threads)
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
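A configuration sketch for the new option (ENABLE_ANE is defined in this file; LLAMA_BUILD_TOOLS is an existing llama.cpp option, and the exact iOS toolchain flags are outside this PR's scope):

    cmake -B build -DENABLE_ANE=ON -DLLAMA_BUILD_TOOLS=ON
    cmake --build build --config Release

With ENABLE_ANE on, the ane/*.mm sources are compiled with ARC and Foundation, CoreML, and Accelerate are linked into libmtmd.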
11 changes: 11 additions & 0 deletions tools/mtmd/ane/ane.h
@@ -0,0 +1,11 @@
+#if __cplusplus
+extern "C" {
+#endif
+
+const void* loadModel(const char* model_path);
+void closeModel(const void* model);
+void predictWith(const void* model, float* embed, float* encoderOutput);
+
+#if __cplusplus
+} // Extern C
+#endif
78 changes: 78 additions & 0 deletions tools/mtmd/ane/ane.mm
@@ -0,0 +1,78 @@
+#import <CoreML/CoreML.h>
+#import <Accelerate/Accelerate.h>
+#import "ane.h"
+#import "ane_minicpmv4_vit_f16.h"
+#include <stdlib.h>
+
+#if __cplusplus
+extern "C" {
+#endif
+
+const void* loadModel(const char* model_path) {
+    if (!model_path) {
+        NSLog(@"Error: model_path is null");
+        return nullptr;
+    }
+
+    NSString *pathString = [NSString stringWithUTF8String:model_path];
+
+    // Check if file exists
+    NSFileManager *fileManager = [NSFileManager defaultManager];
+    if (![fileManager fileExistsAtPath:pathString]) {
+        NSLog(@"Error: ANE model file does not exist at path: %@", pathString);
+        return nullptr;
+    }
+
+    // Check if it's a directory (for .mlmodelc packages)
+    BOOL isDirectory;
+    if ([fileManager fileExistsAtPath:pathString isDirectory:&isDirectory]) {
+        if (!isDirectory && ![pathString hasSuffix:@".mlmodelc"]) {
+            NSLog(@"Warning: ANE model path should typically be a .mlmodelc directory: %@", pathString);
+        }
+    }
+
+    NSURL *modelURL = [NSURL fileURLWithPath:pathString];
+
+    NSLog(@"Loading ANE model from: %@", modelURL.absoluteString);
+
+    NSError *error = nil;
+    const void* model = CFBridgingRetain([[ane_minicpmv4_vit_f16 alloc] initWithContentsOfURL:modelURL error:&error]);
+
+    if (error) {
+        NSLog(@"Error loading ANE model: %@", error.localizedDescription);
+        return nullptr;
+    }
+
+    if (!model) {
+        NSLog(@"Error: Failed to create ANE model instance");
+        return nullptr;
+    }
+
+    NSLog(@"Successfully loaded ANE model from: %@", pathString);
+    return model;
+}
+
+void predictWith(const void* model, float* embed, float* encoderOutput) {
+    MLMultiArray *inMultiArray = [[MLMultiArray alloc] initWithDataPointer: embed
+                                                                     shape: @[@1, @1024, @1152]
+                                                                  dataType: MLMultiArrayDataTypeFloat32
+                                                                   strides: @[@(1179648), @(1152), @1]
+                                                               deallocator: nil
+                                                                     error: nil];
+
+    ane_minicpmv4_vit_f16Output *modelOutput = [(__bridge id)model predictionFromInput:inMultiArray error:nil];
+
+    MLMultiArray *outMA = modelOutput.output;
+
+    cblas_scopy((int)outMA.count,
+                (float*)outMA.dataPointer, 1,
+                encoderOutput, 1);
+}
+
+void closeModel(const void* model) {
+    CFRelease(model);
+}
+
+#if __cplusplus
+} //Extern C
+#endif
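The hard-coded strides @[@(1179648), @(1152), @1] are simply the contiguous row-major strides of a [1, 1024, 1152] float32 tensor (1024 × 1152 = 1,179,648 elements, about 4.5 MiB), so the MLMultiArray wraps the caller's embed buffer without copying; only the encoder output is copied out via cblas_scopy. Note that predictionFromInput:error: is the interface Xcode generates from the CoreML model, so its exact selector depends on the model's input feature name.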