Add --mmproj-backend to choose the GPU backend used for the multimodal projector.

Summary of the hunks below:
  * common/arg.cpp: registers "--mmproj-backend NAME" for the MTMD, SERVER and
    MAIN examples; the handler stores the value in common_params::mmproj_backend.
  * common/common.h: adds common_params::mmproj_backend (empty string by default).
  * tools/mtmd/clip.h, clip.cpp: adds clip_context_params::backend_device. When
    use_gpu is set, clip_ctx takes the backend name from backend_device and
    otherwise from the MTMD_BACKEND_DEVICE environment variable.
  * tools/mtmd/mtmd.h, mtmd.cpp: adds mtmd_context_params::backend_device
    (nullptr in mtmd_context_params_default()) and copies it into
    clip_context_params before clip_init().

Review notes:
  * clip_ctx treats backend_device as "not specified" when it is null OR an
    empty string. common_params::mmproj_backend defaults to "", so a caller that
    forwards it as a C string would otherwise shadow MTMD_BACKEND_DEVICE with an
    empty backend name.
  * NOTE(review): none of the hunks in this excerpt forward
    common_params::mmproj_backend into mtmd_context_params::backend_device;
    presumably that happens in hunks not included here - confirm.
  * NOTE(review): clip_context_params::backend_device has no default
    initializer; verify that every site constructing clip_context_params sets it.
  * NOTE(review): the g_logger_state change is formatting only and drops the
    per-field comments (verbosity_thold, log_callback, log_callback_user_data);
    consider keeping it out of this patch.
  * NOTE(review): this copy of the patch appears to have lost text inside angle
    brackets (the two "#include" context lines, the element type of
    "std::vector image") and the original indentation; restore both from the
    upstream files before applying.

diff --git a/common/arg.cpp b/common/arg.cpp
index 406fbc2f06f..7064387cbb7 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2338,6 +2338,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.mmproj_use_gpu = false;
         }
     ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD"));
+    add_opt(common_arg(
+        {"--mmproj-backend"}, "NAME",
+        "GPU backend for multimodal projector (e.g. CUDA, Metal, Vulkan)\n"
+        "if not specified, will use MTMD_BACKEND_DEVICE env var or default GPU backend",
+        [](common_params & params, const std::string & value) {
+            params.mmproj_backend = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
         {"--image", "--audio"}, "FILE",
         "path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n",
diff --git a/common/common.h b/common/common.h
index e7bc2b44a2a..613491f94eb 100644
--- a/common/common.h
+++ b/common/common.h
@@ -402,6 +402,7 @@ struct common_params {
     // multimodal models (see tools/mtmd)
     struct common_params_model mmproj;
     bool mmproj_use_gpu = true; // use GPU for multimodal model
+    std::string mmproj_backend = ""; // GPU backend for multimodal model (e.g. "CUDA", "Metal", "Vulkan")
     bool no_mmproj = false; // explicitly disable multimodal model
     std::vector image; // path to image file(s)
 
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index f0506a11cef..38f4baa03f1 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -28,11 +28,7 @@
 #include
 #include
 
-struct clip_logger_state g_logger_state = {
-    GGML_LOG_LEVEL_CONT, // verbosity_thold
-    clip_log_callback_default, // log_callback
-    NULL // log_callback_user_data
-};
+struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};
 
 enum ffn_op_type {
     FFN_GELU,
@@ -401,7 +397,12 @@ struct clip_ctx {
             throw std::runtime_error("failed to initialize CPU backend");
         }
         if (ctx_params.use_gpu) {
-            auto backend_name = std::getenv("MTMD_BACKEND_DEVICE");
+            // an explicitly requested backend takes precedence over the MTMD_BACKEND_DEVICE env var;
+            // a null or empty backend_device means "not specified", so the env var is consulted instead
+            const char * backend_name = ctx_params.backend_device;
+            if (backend_name == nullptr || backend_name[0] == '\0') {
+                backend_name = std::getenv("MTMD_BACKEND_DEVICE");
+            }
             if (backend_name != nullptr) {
                 backend = ggml_backend_init_by_name(backend_name, nullptr);
                 if (!backend) {
diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h
index 3387cdbd369..455b0b5baf7 100644
--- a/tools/mtmd/clip.h
+++ b/tools/mtmd/clip.h
@@ -25,6 +25,7 @@ enum clip_modality {
 struct clip_context_params {
     bool use_gpu;
     enum ggml_log_level verbosity;
+    const char * backend_device; // optional, if null will use env var or default GPU backend
 };
 
 struct clip_init_result {
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index 6708bc519e0..c7060d831d1 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -91,6 +91,7 @@ mtmd_context_params mtmd_context_params_default() {
     params.verbosity = GGML_LOG_LEVEL_INFO;
     params.image_marker = MTMD_DEFAULT_IMAGE_MARKER;
     params.media_marker = mtmd_default_marker();
+    params.backend_device = nullptr;
     return params;
 }
 
@@ -152,6 +153,7 @@ struct mtmd_context {
         clip_context_params ctx_clip_params;
         ctx_clip_params.use_gpu = ctx_params.use_gpu;
         ctx_clip_params.verbosity = ctx_params.verbosity;
+        ctx_clip_params.backend_device = ctx_params.backend_device;
         auto res = clip_init(mmproj_fname, ctx_clip_params);
         ctx_v = res.ctx_v;
         ctx_a = res.ctx_a;
diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h
index d743e52f5a4..3bb44775090 100644
--- a/tools/mtmd/mtmd.h
+++ b/tools/mtmd/mtmd.h
@@ -82,6 +82,7 @@ struct mtmd_context_params {
     enum ggml_log_level verbosity;
     const char * image_marker; // deprecated, use media_marker instead
     const char * media_marker;
+    const char * backend_device; // optional GPU backend name (e.g. "CUDA", "Metal", "Vulkan"), if null will use env var or default
 };
 
 MTMD_API const char * mtmd_default_marker(void);