Skip to content
Closed
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .clang-tidy
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ Checks: >
-readability-uppercase-literal-suffix,
-readability-simplify-boolean-expr,
-readability-math-missing-parentheses,
-readability-braces-around-statements,
-readability-isolate-declaration,
clang-analyzer-*,
-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
performance-*,
Expand Down
7 changes: 7 additions & 0 deletions common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2768,6 +2768,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.image.emplace_back(value);
}
).set_examples({LLAMA_EXAMPLE_MTMD}));
// --video PATH: each occurrence appends PATH to params.video, so the flag may be
// given multiple times. Registered for the LLAMA_EXAMPLE_MTMD example only.
add_opt(common_arg(
{"--video"}, "PATH",
"path to a video file (requires FFmpeg at build time) or a directory of frames; can be repeated.\n",
[](common_params & params, const std::string & value) {
params.video.emplace_back(value);
}
).set_examples({LLAMA_EXAMPLE_MTMD}));
if (llama_supports_rpc()) {
add_opt(common_arg(
{"--rpc"}, "SERVERS",
Expand Down
1 change: 1 addition & 0 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -406,6 +406,7 @@ struct common_params {
bool mmproj_use_gpu = true; // use GPU for multimodal model
bool no_mmproj = false; // explicitly disable multimodal model
std::vector<std::string> image; // path to image file(s)
std::vector<std::string> video; // path to video file(s) or frame directories

// finetune
struct lr_opt lr;
Expand Down
25 changes: 25 additions & 0 deletions tools/mtmd/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,14 @@ find_package(Threads REQUIRED)
add_library(mtmd
mtmd.cpp
mtmd-audio.cpp
mtmd-video.cpp
mtmd.h
clip.cpp
clip.h
clip-impl.h
mtmd-helper.cpp
mtmd-helper.h
mtmd-video.h
)

target_link_libraries (mtmd PUBLIC ggml llama)
Expand All @@ -20,6 +22,28 @@ target_include_directories(mtmd PRIVATE ../..)
target_include_directories(mtmd PRIVATE ../../vendor)
target_compile_features (mtmd PRIVATE cxx_std_17)

# FFmpeg-backed video decoding is opt-in (default OFF).
# When requested, FFmpeg is located through pkg-config; if either pkg-config or
# the FFmpeg modules are missing, a warning is printed and the mtmd target is
# built without the MTMD_WITH_FFMPEG compile definition.
option(MTMD_WITH_FFMPEG "Enable FFmpeg-based video decoding in mtmd-video" OFF)
if (MTMD_WITH_FFMPEG)
    find_package(PkgConfig QUIET)
    if (NOT PKG_CONFIG_FOUND)
        message(WARNING "pkg-config not found; MTMD_WITH_FFMPEG disabled")
    else()
        pkg_check_modules(FFMPEG QUIET IMPORTED_TARGET libavformat libavcodec libswscale libavutil)
        if (NOT FFMPEG_FOUND)
            message(WARNING "FFmpeg not found via pkg-config; MTMD_WITH_FFMPEG disabled")
        else()
            # Link the imported target and expose the feature macro to mtmd sources only.
            target_link_libraries(mtmd PRIVATE PkgConfig::FFMPEG)
            target_compile_definitions(mtmd PRIVATE MTMD_WITH_FFMPEG)
        endif()
    endif()
endif()

# Build-time switch (default OFF) intended for fast local testing with a small
# number of video frames. When ON, it only adds the private compile definition
# MTMD_MAX_VIDEO_FRAMES_SMALL to the mtmd target; the actual frame limit is
# decided by the sources that check this macro.
option(MTMD_MAX_VIDEO_FRAMES_SMALL "Set a small number of frames for fast test locally" OFF)
if(MTMD_MAX_VIDEO_FRAMES_SMALL)
target_compile_definitions(mtmd PRIVATE MTMD_MAX_VIDEO_FRAMES_SMALL)
endif()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this should be added as an env var instead. See MTMD_DEBUG_GRAPH as an example

Copy link
Author

@MrAMS MrAMS Nov 3, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This issue has already been fixed in commit ef68f2a8bbf60e.


if (BUILD_SHARED_LIBS)
set_target_properties (mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(mtmd PRIVATE LLAMA_BUILD)
Expand All @@ -29,6 +53,7 @@ endif()
set(MTMD_PUBLIC_HEADERS
${CMAKE_CURRENT_SOURCE_DIR}/mtmd.h
${CMAKE_CURRENT_SOURCE_DIR}/mtmd-helper.h
${CMAKE_CURRENT_SOURCE_DIR}/mtmd-video.h
)

set_target_properties(mtmd
Expand Down
116 changes: 64 additions & 52 deletions tools/mtmd/clip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,7 @@ struct clip_hparams {
// legacy
bool has_llava_projector = false;
int minicpmv_version = 0;
int minicpmv_max_slice_nums = 9;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Search for max_slice_nums, it's already present in the code base.

int32_t minicpmv_query_num = 0; // MiniCPM-V query number
};

Expand Down Expand Up @@ -3639,16 +3640,67 @@ struct llava_uhd {
const bool has_slices = original_size.width > slice_size || original_size.height > slice_size;
const bool has_pinpoints = !ctx->model.hparams.image_res_candidates.empty();

if (!has_slices) {
// skip slicing logic
res.overview_size = clip_image_size{slice_size, slice_size};
res.refined_size = clip_image_size{0, 0};
res.grid_size = clip_image_size{0, 0};
if (clip_is_minicpmv(ctx)) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the change here compared to the initial version?

IMO this looks like duplicated logic; see clip_image_preprocess for how this is used by minicpm-v.

auto best_size = get_best_resize(original_size, slice_size, patch_size, !has_slices);
res.overview_size = best_size;

{
const int max_slice_nums = ctx->model.hparams.minicpmv_max_slice_nums;
const float log_ratio = log((float)original_width / original_height);
const float ratio = (float)original_width * original_height / (slice_size * slice_size);
const int multiple = fmin(ceil(ratio), max_slice_nums);

auto best_grid = get_best_grid(max_slice_nums, multiple, log_ratio);
auto refine_size = get_refine_size(original_size, best_grid, slice_size, patch_size, true);
res.grid_size = best_grid;
res.refined_size = refine_size;

LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n",
__func__, original_width, original_height,
res.overview_size.width, res.overview_size.height,
res.refined_size.width, res.refined_size.height,
res.grid_size.width, res.grid_size.height);

if (!has_slices || max_slice_nums == 0) {
return res;
}

int width = refine_size.width;
int height = refine_size.height;
int grid_x = int(width / best_grid.width);
int grid_y = int(height / best_grid.height);
for (int patches_y = 0, ic = 0;
patches_y < refine_size.height && ic < best_grid.height;
patches_y += grid_y, ic += 1) {
for (int patches_x = 0, jc = 0;
patches_x < refine_size.width && jc < best_grid.width;
patches_x += grid_x, jc += 1) {
slice_coordinates slice;
slice.x = patches_x;
slice.y = patches_y;
slice.size.width = grid_x;
slice.size.height = grid_y;
res.slices.push_back(slice);
LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n",
__func__, (int)res.slices.size() - 1,
slice.x, slice.y, slice.size.width, slice.size.height);
}
}
}

return res;
}
else {
if (!has_slices) {
// skip slicing logic
res.overview_size = clip_image_size{slice_size, slice_size};
res.refined_size = clip_image_size{0, 0};
res.grid_size = clip_image_size{0, 0};

if (has_pinpoints) {
return res;
}

if (has_pinpoints) {
// has pinpoints, use them to calculate the grid size (e.g. llava-1.6)
auto refine_size = llava_uhd::select_best_resolution(
original_size,
Expand Down Expand Up @@ -3684,53 +3736,7 @@ struct llava_uhd {

return res;
}

// no pinpoints, dynamically calculate the grid size (e.g. minicpmv)

auto best_size = get_best_resize(original_size, slice_size, patch_size, !has_slices);
res.overview_size = best_size;

{
const int max_slice_nums = 9; // TODO: this is only used by minicpmv, maybe remove it
const float log_ratio = log((float)original_width / original_height);
const float ratio = (float)original_width * original_height / (slice_size * slice_size);
const int multiple = fmin(ceil(ratio), max_slice_nums);

auto best_grid = get_best_grid(max_slice_nums, multiple, log_ratio);
auto refine_size = get_refine_size(original_size, best_grid, slice_size, patch_size, true);
res.grid_size = best_grid;
res.refined_size = refine_size;

LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n",
__func__, original_width, original_height,
res.overview_size.width, res.overview_size.height,
res.refined_size.width, res.refined_size.height,
res.grid_size.width, res.grid_size.height);

int width = refine_size.width;
int height = refine_size.height;
int grid_x = int(width / best_grid.width);
int grid_y = int(height / best_grid.height);
for (int patches_y = 0, ic = 0;
patches_y < refine_size.height && ic < best_grid.height;
patches_y += grid_y, ic += 1) {
for (int patches_x = 0, jc = 0;
patches_x < refine_size.width && jc < best_grid.width;
patches_x += grid_x, jc += 1) {
slice_coordinates slice;
slice.x = patches_x;
slice.y = patches_y;
slice.size.width = grid_x;
slice.size.height = grid_y;
res.slices.push_back(slice);
LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n",
__func__, (int)res.slices.size() - 1,
slice.x, slice.y, slice.size.width, slice.size.height);
}
}
}

return res;
}

static std::vector<clip_image_u8_ptr> slice_image(const clip_image_u8 * img, const slice_instructions & inst) {
Expand Down Expand Up @@ -4836,6 +4842,12 @@ bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
|| ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL;
}

// Overrides the MiniCPM-V maximum slice count stored in the context's hparams.
//   ctx: context to update; a null pointer makes the call a no-op.
//   n:   requested maximum number of slices; negative values are stored as 0.
void clip_set_minicpmv_max_slice_nums(struct clip_ctx * ctx, int n) {
    if (ctx == nullptr) {
        return;
    }
    const int clamped_slices = (n < 0) ? 0 : n;
    ctx->model.hparams.minicpmv_max_slice_nums = clamped_slices;
}

bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
clip_image_f32 clip_img;
clip_img.buf.resize(h * w * 3);
Expand Down
1 change: 1 addition & 0 deletions tools/mtmd/clip.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_i
bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);

int clip_is_minicpmv(const struct clip_ctx * ctx);
// Set the maximum slice count used for MiniCPM-V (hparams.minicpmv_max_slice_nums).
// A null ctx is ignored and a negative n is clamped to 0.
void clip_set_minicpmv_max_slice_nums(struct clip_ctx * ctx, int n);
bool clip_is_glm(const struct clip_ctx * ctx);
bool clip_is_qwen2vl(const struct clip_ctx * ctx);
bool clip_is_llava(const struct clip_ctx * ctx);
Expand Down
48 changes: 35 additions & 13 deletions tools/mtmd/mtmd-cli.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,15 @@
#include "ggml.h"
#include "console.h"
#include "chat.h"
#include "clip.h"
#include "mtmd.h"
#include "mtmd-helper.h"
#include "mtmd-video.h"

#include <vector>
#include <limits.h>
#include <cinttypes>
#include <cstdlib>

#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
Expand Down Expand Up @@ -154,8 +157,8 @@ struct mtmd_cli_context {
);
}

bool load_media(const std::string & fname) {
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), fname.c_str()));
bool load_media(const std::string & path) {
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), path.c_str()));
if (!bmp.ptr) {
return false;
}
Expand Down Expand Up @@ -284,7 +287,7 @@ int main(int argc, char ** argv) {
mtmd_cli_context ctx(params);
LOG("%s: loading model: %s\n", __func__, params.model.path.c_str());

bool is_single_turn = !params.prompt.empty() && !params.image.empty();
bool is_single_turn = !params.prompt.empty() && (!params.image.empty() || !params.video.empty());

int n_predict = params.n_predict < 0 ? INT_MAX : params.n_predict;

Expand All @@ -308,19 +311,34 @@ int main(int argc, char ** argv) {

if (is_single_turn) {
g_is_generating = true;
if (params.prompt.find(mtmd_default_marker()) == std::string::npos) {
for (size_t i = 0; i < params.image.size(); i++) {
params.prompt += mtmd_default_marker();
}
}
common_chat_msg msg;
msg.role = "user";
msg.content = params.prompt;

// 1) load all media first
size_t n_loaded_media = 0;
for (const auto & image : params.image) {
if (!ctx.load_media(image)) {
return 1; // error is already printed by libmtmd
}
n_loaded_media += 1;
}
for (const auto & vpath : params.video) {
if (!ctx.load_media(vpath)) {
return 1; // error is already printed by libmtmd
}
n_loaded_media += 1;
}

// 2) build prompt content with correct number of markers
std::string prompt_content = params.prompt;
if (prompt_content.find(mtmd_default_marker()) == std::string::npos) {
for (size_t i = 0; i < n_loaded_media; i++) {
prompt_content += mtmd_default_marker();
}
}

// 3) run
common_chat_msg msg;
msg.role = "user";
msg.content = prompt_content;
if (eval_message(ctx, msg)) {
return 1;
}
Expand All @@ -336,6 +354,9 @@ int main(int argc, char ** argv) {
if (mtmd_support_audio(ctx.ctx_vision.get())) {
LOG("\n /audio <path> load an audio");
}
if (mtmd_support_vision(ctx.ctx_vision.get())) {
LOG("\n /video <path> load a video");
}
LOG("\n /clear clear the chat history");
LOG("\n /quit or /exit exit the program");
LOG("\n");
Expand Down Expand Up @@ -367,14 +388,15 @@ int main(int argc, char ** argv) {
g_is_generating = true;
bool is_image = line == "/image" || line.find("/image ") == 0;
bool is_audio = line == "/audio" || line.find("/audio ") == 0;
if (is_image || is_audio) {
bool is_video = line == "/video" || line.find("/video ") == 0;
if (is_image || is_audio || is_video) {
if (line.size() < 8) {
LOG_ERR("ERR: Missing media filename\n");
continue;
}
std::string media_path = line.substr(7);
if (ctx.load_media(media_path)) {
LOG("%s %s loaded\n", media_path.c_str(), is_image ? "image" : "audio");
LOG("%s %s loaded\n", media_path.c_str(), is_image ? "image" : (is_audio ? "audio" : "video"));
content += mtmd_default_marker();
}
// else, error is already printed by libmtmd
Expand Down
Loading