Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions build-xcframework.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ TVOS_MIN_OS_VERSION=16.4

BUILD_SHARED_LIBS=OFF
LLAMA_BUILD_EXAMPLES=OFF
LLAMA_BUILD_TOOLS=OFF
LLAMA_BUILD_TOOLS=ON
LLAMA_BUILD_TESTS=OFF
LLAMA_BUILD_SERVER=OFF
GGML_METAL=ON
Expand Down Expand Up @@ -124,6 +124,10 @@ setup_framework_structure() {
cp ggml/include/ggml-cpu.h ${header_path}
cp ggml/include/ggml-blas.h ${header_path}
cp ggml/include/gguf.h ${header_path}
# Copy mtmd-ios headers and dependencies
cp tools/mtmd/mtmd-ios.h ${header_path}
cp tools/mtmd/mtmd.h ${header_path}
cp tools/mtmd/mtmd-helper.h ${header_path}

# Create module map (common for all platforms)
cat > ${module_path}module.modulemap << EOF
Expand All @@ -136,6 +140,9 @@ framework module llama {
header "ggml-cpu.h"
header "ggml-blas.h"
header "gguf.h"
header "mtmd-ios.h"
header "mtmd.h"
header "mtmd-helper.h"

link "c++"
link framework "Accelerate"
Expand Down Expand Up @@ -252,6 +259,8 @@ combine_static_libraries() {
"${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml-cpu.a"
"${base_dir}/${build_dir}/ggml/src/ggml-metal/${release_dir}/libggml-metal.a"
"${base_dir}/${build_dir}/ggml/src/ggml-blas/${release_dir}/libggml-blas.a"
"${base_dir}/${build_dir}/common/${release_dir}/libcommon.a"
"${base_dir}/${build_dir}/tools/mtmd/${release_dir}/libmtmd.a"
)

# Create temporary directory for processing
Expand Down Expand Up @@ -327,7 +336,7 @@ combine_static_libraries() {
$arch_flags \
$min_version_flag \
-Wl,-force_load,"${temp_dir}/combined.a" \
-framework Foundation -framework Metal -framework Accelerate \
-framework Foundation -framework Metal -framework Accelerate -framework CoreML \
-install_name "$install_name" \
-o "${base_dir}/${output_lib}"

Expand Down
10 changes: 10 additions & 0 deletions common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1178,6 +1178,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
for (auto & ex : mmproj_examples) {
if (ctx_arg.ex == ex) {
common_params_handle_model(params.mmproj, params.hf_token, "", params.offline);
common_params_handle_model(params.coreml, params.hf_token, "", params.offline);
break;
}
}
Expand Down Expand Up @@ -2522,6 +2523,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.mmproj_use_gpu = false;
}
).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD"));
// CoreML model path (new)
add_opt(common_arg(
{"--coreml"}, "FILE",
"path to CoreML model file",
[](common_params & params, const std::string & value) {
params.coreml.path = value;
}
).set_examples(mmproj_examples).set_env("LLAMA_ARG_COREML"));

add_opt(common_arg(
{"--image", "--audio"}, "FILE",
"path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n",
Expand Down
3 changes: 3 additions & 0 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -405,6 +405,9 @@ struct common_params {
bool mmproj_use_gpu = true; // use GPU for multimodal model
bool no_mmproj = false; // explicitly disable multimodal model
std::vector<std::string> image; // path to image file(s)

// Apple Neural Engine support
struct common_params_model coreml;

// finetune
struct lr_opt lr;
Expand Down
42 changes: 41 additions & 1 deletion tools/mtmd/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

find_package(Threads REQUIRED)

# CoreML support option
option(ENABLE_COREML "Enable CoreML support" OFF)

add_library(mtmd
mtmd.cpp
mtmd-audio.cpp
Expand All @@ -13,13 +16,43 @@ add_library(mtmd
mtmd-helper.h
)

target_link_libraries (mtmd PUBLIC ggml llama)
# Add CoreML-related sources when enabled.
if(ENABLE_COREML)
    target_sources(mtmd PRIVATE
        coreml/mtmd_coreml.h
        coreml/mtmd_coreml.mm
        coreml/coreml_minicpmv40_vit_f16.h
        coreml/coreml_minicpmv40_vit_f16.m
    )
    # Compile-time macro consumed by the #if defined(ENABLE_COREML) guards in clip.cpp.
    target_compile_definitions(mtmd PRIVATE ENABLE_COREML)

    # Enable ARC for the Objective-C / Objective-C++ sources.
    # COMPILE_OPTIONS (a ;-list) supersedes the legacy single-string COMPILE_FLAGS.
    set_source_files_properties(
        coreml/mtmd_coreml.mm
        coreml/coreml_minicpmv40_vit_f16.m
        PROPERTIES COMPILE_OPTIONS "-fobjc-arc"
    )
endif()

target_link_libraries (mtmd PUBLIC ggml llama common)
target_link_libraries (mtmd PRIVATE Threads::Threads)
target_include_directories(mtmd PUBLIC .)
target_include_directories(mtmd PRIVATE ../..)
target_include_directories(mtmd PRIVATE ../../common)
target_include_directories(mtmd PRIVATE ../../include)
target_include_directories(mtmd PRIVATE ../../ggml/include)
target_include_directories(mtmd PRIVATE ../../src)
target_include_directories(mtmd PRIVATE ../../vendor)
target_compile_features (mtmd PRIVATE cxx_std_17)

# Link Apple frameworks required by the CoreML path.
if(ENABLE_COREML)
    target_link_libraries(mtmd PRIVATE
        "-framework Foundation"
        "-framework CoreML"
        "-framework Accelerate"
    )
    # -ObjC is a linker flag, not a library: it forces all Objective-C classes
    # and categories in static archives to be loaded. Pass it via
    # target_link_options so CMake does not treat it as a library name.
    target_link_options(mtmd PRIVATE "-ObjC")
endif()

if (BUILD_SHARED_LIBS)
set_target_properties (mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(mtmd PRIVATE LLAMA_BUILD)
Expand All @@ -31,6 +64,13 @@ set(MTMD_PUBLIC_HEADERS
${CMAKE_CURRENT_SOURCE_DIR}/mtmd-helper.h
)

# Add CoreML public headers when enabled
if(ENABLE_COREML)
list(APPEND MTMD_PUBLIC_HEADERS
${CMAKE_CURRENT_SOURCE_DIR}/coreml/coreml.h
)
endif()

set_target_properties(mtmd
PROPERTIES
PUBLIC_HEADER "${MTMD_PUBLIC_HEADERS}")
Expand Down
143 changes: 143 additions & 0 deletions tools/mtmd/clip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "gguf.h"
#if defined(ENABLE_COREML)
#include "coreml/mtmd_coreml.h"
#endif

#include <cassert>
#include <cmath>
Expand Down Expand Up @@ -390,6 +393,9 @@ struct clip_ctx {
bool debug_graph = false;
std::vector<ggml_tensor *> debug_print_tensors;

// CoreML model path for iOS
std::string coreml_model_path;

clip_ctx(clip_context_params & ctx_params) {
debug_graph = std::getenv("MTMD_DEBUG_GRAPH") != nullptr;
backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
Expand Down Expand Up @@ -3930,15 +3936,146 @@ static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, co
return pos_embed_2d;
}

#if defined(ENABLE_COREML)
// Run the vision encoder through a cached CoreML model.
//
// The model handle is cached in function-local statics and reloaded only when
// the requested model path changes. NOTE(review): this static cache is not
// thread-safe — confirm callers serialize image encoding.
//
// pixel_values / position_ids / pos_embed are the pre-packed model inputs;
// the encoded embeddings are written to `vec`. Returns false when the model
// path is missing or the model cannot be loaded.
static bool clip_image_encode_coreml(float * pixel_values, int32_t * position_ids, float * pos_embed, float * vec, const char * coreml_model_path) {
    static const void * coreml_encoder = NULL;
    static std::string  cached_model_path;

    // Guard before touching loadModel / std::string comparison with the path.
    if (coreml_model_path == NULL || coreml_model_path[0] == '\0') {
        printf("Failed to load CoreML model from: %s\n", coreml_model_path ? coreml_model_path : "null");
        return false;
    }

    // (Re)load when nothing is cached yet or a different model is requested.
    // A null cached handle doubles as the "never loaded" flag.
    if (coreml_encoder == NULL || cached_model_path != coreml_model_path) {
        if (coreml_encoder) {
            closeModel(coreml_encoder);
            coreml_encoder = NULL;
        }
        coreml_encoder = loadModel(coreml_model_path);
        if (!coreml_encoder) {
            printf("Failed to load CoreML model from: %s\n", coreml_model_path);
            return false;
        }
        cached_model_path = coreml_model_path;
    }

    predictWith(coreml_encoder, pixel_values, position_ids, pos_embed, vec);
    return true;
}
#endif

bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
clip_image_f32_batch imgs;
clip_image_f32_ptr img_copy(clip_image_f32_init());
*img_copy = *img;
imgs.entries.push_back(std::move(img_copy));

#if defined(ENABLE_COREML)
const bool can_use_coreml =
!ctx->coreml_model_path.empty() &&
ctx->model.modality == CLIP_MODALITY_VISION &&
ctx->proj_type() == PROJECTOR_TYPE_MINICPMV;
if (can_use_coreml){
printf("clip use coreml\n");
return clip_image_batch_encode_coreml(ctx, &imgs, vec);
}
#endif
return clip_image_batch_encode(ctx, n_threads, &imgs, vec);
}

// Encode a batch of exactly one pre-processed image with the CoreML encoder.
// Only the MiniCPM-V projector is supported; any other projector aborts, and
// a batch size other than 1 returns false. Packs pixel values, bucketed
// position ids and 2D sin/cos positional embeddings into the fixed-size input
// buffers the exported CoreML model expects, then dispatches to
// clip_image_encode_coreml(). Returns false when built without ENABLE_COREML
// (instead of the previous fall-through into GGML_ABORT).
bool clip_image_batch_encode_coreml(clip_ctx * ctx, const clip_image_f32_batch * imgs_c_ptr, float * vec) {
#if !defined(ENABLE_COREML)
    // Built without CoreML support: nothing to run.
    (void) ctx; (void) imgs_c_ptr; (void) vec;
    return false;
#else
    const clip_image_f32_batch & imgs = *imgs_c_ptr;

    if (imgs.entries.size() != 1) {
        return false; // only support batch size of 1
    }

    if (ctx->model.proj_type != PROJECTOR_TYPE_MINICPMV) {
        GGML_ABORT("Unknown projector type");
    }

    const auto & hparams = ctx->model.hparams;
    const clip_image_f32 & img = *imgs.entries[0];

    const int patch_size = hparams.patch_size;
    const int pos_w = img.nx / patch_size;
    const int pos_h = img.ny / patch_size;

    // The exported CoreML model uses fixed 1024-patch input buffers.
    const int max_patches = 1024;
    if (pos_w <= 0 || pos_h <= 0 || pos_w > max_patches || pos_h > max_patches) {
        // previously overflowed the fixed-size bucket_coords arrays below
        return false;
    }

    // pixel values: patch-major layout, three channel planes of size n
    std::vector<float> inp_raw;
    {
        const int nx = max_patches * patch_size;
        const int ny = patch_size;
        const int n  = nx * ny;
        inp_raw.assign(3 * n, 0.0f);

        int patch_index = 0;
        for (int i = 0; i < img.ny && patch_index < max_patches; i += patch_size) {
            for (int j = 0; j < img.nx && patch_index < max_patches; j += patch_size) {
                for (int pi = 0; pi < patch_size; ++pi) {
                    for (int pj = 0; pj < patch_size; ++pj) {
                        const int src = ((i + pi) * img.nx + (j + pj)) * 3;
                        const int dst = nx * pi + patch_index * patch_size + pj;
                        inp_raw[dst]         = img.buf[src];
                        inp_raw[n + dst]     = img.buf[src + 1];
                        inp_raw[2 * n + dst] = img.buf[src + 2];
                    }
                }
                patch_index++;
            }
        }
    }

    // position ids, bucketed onto a 70x70 grid
    // inspired from siglip:
    // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit
    // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
    std::vector<int32_t> positions(std::max(pos_h * pos_w, max_patches), 0);
    {
        // heap-allocated, sized to the actual grid (was int[1024] on the stack)
        std::vector<int> bucket_coords_h(pos_h);
        std::vector<int> bucket_coords_w(pos_w);
        for (int i = 0; i < pos_h; i++) {
            bucket_coords_h[i] = (int) std::floor(70.0 * i / pos_h);
        }
        for (int i = 0; i < pos_w; i++) {
            bucket_coords_w[i] = (int) std::floor(70.0 * i / pos_w);
        }
        int id = 0;
        for (int i = 0; i < pos_h; i++) {
            for (int j = 0; j < pos_w; j++) {
                positions[id++] = bucket_coords_h[i] * 70 + bucket_coords_w[j];
            }
        }
    }

    // 2D sin/cos positional embedding
    // inspired from resampler of Qwen-VL:
    // -> https://huggingface.co/Qwen/Qwen-VL/tree/main
    // -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
    std::vector<float> pos_embed;
    {
        const int embed_dim = clip_n_mmproj_embd(ctx);

        // TODO @ngxson : this is very inefficient, can we do this using ggml_sin and ggml_cos?
        auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));

        pos_embed.assign(embed_dim * std::max(pos_w * pos_h, max_patches), 0.0f);
        for (int i = 0; i < pos_w * pos_h; ++i) {
            for (int j = 0; j < embed_dim; ++j) {
                pos_embed[i * embed_dim + j] = pos_embed_t[i][j];
            }
        }
    }

    return clip_image_encode_coreml(inp_raw.data(), positions.data(), pos_embed.data(), vec, ctx->coreml_model_path.c_str());
#endif
}

bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) {
const clip_image_f32_batch & imgs = *imgs_c_ptr;
int batch_size = imgs.entries.size();
Expand Down Expand Up @@ -4421,3 +4558,9 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel
batch->entries.push_back(clip_image_f32_ptr(audio));
batch->is_audio = true;
}

// Store the CoreML model path on the clip context.
// Silently ignores a null context or null path (leaves the stored path unchanged).
void clip_set_coreml_model_path(struct clip_ctx * ctx, const char * coreml_model_path) {
    if (ctx == nullptr || coreml_model_path == nullptr) {
        return;
    }
    ctx->coreml_model_path = coreml_model_path;
}
4 changes: 4 additions & 0 deletions tools/mtmd/clip.h
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);

bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
bool clip_image_batch_encode_coreml(struct clip_ctx * ctx, const struct clip_image_f32_batch * imgs, float * vec);

int clip_is_minicpmv(const struct clip_ctx * ctx);
bool clip_is_glm(const struct clip_ctx * ctx);
Expand All @@ -104,3 +105,6 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel
bool clip_has_vision_encoder(const struct clip_ctx * ctx);
bool clip_has_audio_encoder(const struct clip_ctx * ctx);
bool clip_has_whisper_encoder(const struct clip_ctx * ctx);

// CoreML support functions
void clip_set_coreml_model_path(struct clip_ctx * ctx, const char * coreml_model_path);
Loading
Loading