ggml-org · MrAMS · Oct 14, 2025 · Oct 28, 2025 · Oct 29, 2025 · Oct 29, 2025
@@ -14,6 +14,8 @@ Checks: >
     -readability-uppercase-literal-suffix,
     -readability-simplify-boolean-expr,
     -readability-math-missing-parentheses,
+    -readability-braces-around-statements,
+    -readability-isolate-declaration,
     clang-analyzer-*,
     -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
     performance-*,

@@ -2768,6 +2768,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.image.emplace_back(value);
         }
     ).set_examples({LLAMA_EXAMPLE_MTMD}));
+    add_opt(common_arg(
+        {"--video"}, "PATH",
+        "path to a video file (requires FFmpeg at build time) or a directory of frames; can be repeated.\n",
+        [](common_params & params, const std::string & value) {
+            params.video.emplace_back(value); // append, so each repeated --video adds another entry to params.video
+        }
+    ).set_examples({LLAMA_EXAMPLE_MTMD}));
     if (llama_supports_rpc()) {
         add_opt(common_arg(
             {"--rpc"}, "SERVERS",

@@ -406,6 +406,7 @@ struct common_params {
     bool mmproj_use_gpu = true;     // use GPU for multimodal model
     bool no_mmproj = false;         // explicitly disable multimodal model
     std::vector<std::string> image; // path to image file(s)
+    std::vector<std::string> video; // path to video file(s) or frame directories
 
     // finetune
     struct lr_opt lr;

@@ -5,12 +5,14 @@ find_package(Threads REQUIRED)
 add_library(mtmd
             mtmd.cpp
             mtmd-audio.cpp
+            mtmd-video.cpp
             mtmd.h
             clip.cpp
             clip.h
             clip-impl.h
             mtmd-helper.cpp
             mtmd-helper.h
+            mtmd-video.h
             )
 
 target_link_libraries     (mtmd PUBLIC ggml llama)
@@ -20,6 +22,28 @@ target_include_directories(mtmd PRIVATE ../..)
 target_include_directories(mtmd PRIVATE ../../vendor)
 target_compile_features   (mtmd PRIVATE cxx_std_17)
 
+# Optional FFmpeg support for video decoding (OFF by default; FFmpeg is located through pkg-config)
+option(MTMD_WITH_FFMPEG "Enable FFmpeg-based video decoding in mtmd-video" OFF)
+if (MTMD_WITH_FFMPEG)
+    find_package(PkgConfig QUIET)
+    if (PKG_CONFIG_FOUND)
+        pkg_check_modules(FFMPEG QUIET IMPORTED_TARGET libavformat libavcodec libswscale libavutil)
+        if (FFMPEG_FOUND)
+            target_link_libraries(mtmd PRIVATE PkgConfig::FFMPEG)
+            target_compile_definitions(mtmd PRIVATE MTMD_WITH_FFMPEG) # the macro is defined only on this success path
+        else()
+            message(WARNING "FFmpeg not found via pkg-config; MTMD_WITH_FFMPEG disabled") # non-fatal: configure continues, macro stays undefined
+        endif()
+    else()
+        message(WARNING "pkg-config not found; MTMD_WITH_FFMPEG disabled") # non-fatal: configure continues, macro stays undefined
+    endif()
+endif()
+
+option(MTMD_MAX_VIDEO_FRAMES_SMALL "Set a small number of frames for fast test locally" OFF) # local-testing knob, OFF by default
+if(MTMD_MAX_VIDEO_FRAMES_SMALL)
+    target_compile_definitions(mtmd PRIVATE MTMD_MAX_VIDEO_FRAMES_SMALL) # only defines the macro; presumably consumed by mtmd-video.cpp - TODO confirm
+endif()
+
 if (BUILD_SHARED_LIBS)
     set_target_properties     (mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON)
     target_compile_definitions(mtmd PRIVATE LLAMA_BUILD)
@@ -29,6 +53,7 @@ endif()
 set(MTMD_PUBLIC_HEADERS
     ${CMAKE_CURRENT_SOURCE_DIR}/mtmd.h
     ${CMAKE_CURRENT_SOURCE_DIR}/mtmd-helper.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/mtmd-video.h
     )
 
 set_target_properties(mtmd

@@ -203,6 +203,7 @@ struct clip_hparams {
     // legacy
     bool has_llava_projector = false;
     int minicpmv_version = 0;
+    int minicpmv_max_slice_nums = 9;
     int32_t minicpmv_query_num = 0;         // MiniCPM-V query number
 };
 
@@ -3639,16 +3640,67 @@ struct llava_uhd {
         const bool has_slices    = original_size.width > slice_size || original_size.height > slice_size;
         const bool has_pinpoints = !ctx->model.hparams.image_res_candidates.empty();
 
-        if (!has_slices) {
-            // skip slicing logic
-            res.overview_size = clip_image_size{slice_size, slice_size};
-            res.refined_size  = clip_image_size{0, 0};
-            res.grid_size     = clip_image_size{0, 0};
+        if (clip_is_minicpmv(ctx)) {
+            auto best_size    = get_best_resize(original_size, slice_size, patch_size, !has_slices);
+            res.overview_size = best_size;
+
+            {
+                const int max_slice_nums = ctx->model.hparams.minicpmv_max_slice_nums;
+                const float log_ratio = log((float)original_width / original_height);
+                const float ratio = (float)original_width * original_height / (slice_size * slice_size);
+                const int multiple = fmin(ceil(ratio), max_slice_nums);
+
+                auto best_grid   = get_best_grid(max_slice_nums, multiple, log_ratio);
+                auto refine_size = get_refine_size(original_size, best_grid, slice_size, patch_size, true);
+                res.grid_size    = best_grid;
+                res.refined_size = refine_size;
+
+                LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n",
+                        __func__, original_width, original_height,
+                        res.overview_size.width, res.overview_size.height,
+                        res.refined_size.width, res.refined_size.height,
+                        res.grid_size.width, res.grid_size.height);
+
+                if (!has_slices || max_slice_nums == 0) { // neither dimension exceeds slice_size, or the slice limit is 0: emit no slices
+                    return res; // overview_size, grid_size and refined_size were already set above; res.slices is left untouched
+                } // NOTE(review): the non-MiniCPM-V no-slice path zeroes grid_size/refined_size instead - confirm consumers accept the non-zero values here
+
+                int width  = refine_size.width;
+                int height = refine_size.height;
+                int grid_x = int(width  / best_grid.width);
+                int grid_y = int(height / best_grid.height);
+                for (int patches_y = 0,                    ic = 0;
+                        patches_y < refine_size.height && ic < best_grid.height;
+                        patches_y += grid_y,              ic += 1) {
+                    for (int patches_x = 0,                   jc = 0;
+                            patches_x < refine_size.width && jc < best_grid.width;
+                            patches_x += grid_x,             jc += 1) {
+                        slice_coordinates slice;
+                        slice.x = patches_x;
+                        slice.y = patches_y;
+                        slice.size.width  = grid_x;
+                        slice.size.height = grid_y;
+                        res.slices.push_back(slice);
+                        LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n",
+                                __func__, (int)res.slices.size() - 1,
+                                slice.x, slice.y, slice.size.width, slice.size.height);
+                    }
+                }
+            }
 
             return res;
         }
+        else {
+            if (!has_slices) {
+                // skip slicing logic
+                res.overview_size = clip_image_size{slice_size, slice_size};
+                res.refined_size  = clip_image_size{0, 0};
+                res.grid_size     = clip_image_size{0, 0};
 
-        if (has_pinpoints) {
+                return res;
+            }
+
+            if (has_pinpoints) {
             // has pinpoints, use them to calculate the grid size (e.g. llava-1.6)
             auto refine_size = llava_uhd::select_best_resolution(
                 original_size,
@@ -3684,53 +3736,7 @@ struct llava_uhd {
 
             return res;
         }
-
-        // no pinpoints, dynamically calculate the grid size (e.g. minicpmv)
-
-        auto best_size    = get_best_resize(original_size, slice_size, patch_size, !has_slices);
-        res.overview_size = best_size;
-
-        {
-            const int max_slice_nums = 9; // TODO: this is only used by minicpmv, maybe remove it
-            const float log_ratio = log((float)original_width / original_height);
-            const float ratio = (float)original_width * original_height / (slice_size * slice_size);
-            const int multiple = fmin(ceil(ratio), max_slice_nums);
-
-            auto best_grid   = get_best_grid(max_slice_nums, multiple, log_ratio);
-            auto refine_size = get_refine_size(original_size, best_grid, slice_size, patch_size, true);
-            res.grid_size    = best_grid;
-            res.refined_size = refine_size;
-
-            LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n",
-                    __func__, original_width, original_height,
-                    res.overview_size.width, res.overview_size.height,
-                    res.refined_size.width, res.refined_size.height,
-                    res.grid_size.width, res.grid_size.height);
-
-            int width  = refine_size.width;
-            int height = refine_size.height;
-            int grid_x = int(width  / best_grid.width);
-            int grid_y = int(height / best_grid.height);
-            for (int patches_y = 0,                    ic = 0;
-                    patches_y < refine_size.height && ic < best_grid.height;
-                    patches_y += grid_y,              ic += 1) {
-                for (int patches_x = 0,                   jc = 0;
-                        patches_x < refine_size.width && jc < best_grid.width;
-                        patches_x += grid_x,             jc += 1) {
-                    slice_coordinates slice;
-                    slice.x = patches_x;
-                    slice.y = patches_y;
-                    slice.size.width  = grid_x;
-                    slice.size.height = grid_y;
-                    res.slices.push_back(slice);
-                    LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n",
-                            __func__, (int)res.slices.size() - 1,
-                            slice.x, slice.y, slice.size.width, slice.size.height);
-                }
-            }
         }
-
-        return res;
     }
 
     static std::vector<clip_image_u8_ptr> slice_image(const clip_image_u8 * img, const slice_instructions & inst) {
@@ -4836,6 +4842,12 @@ bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
         || ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL;
 }
 
+void clip_set_minicpmv_max_slice_nums(struct clip_ctx * ctx, int n) { // overrides hparams.minicpmv_max_slice_nums (struct default: 9)
+    if (!ctx) return; // null context: nothing to update
+    if (n < 0) n = 0; // negative limits are clamped to 0
+    ctx->model.hparams.minicpmv_max_slice_nums = n; // read by the MiniCPM-V branch of llava_uhd, which returns without adding slices when this is 0
+}
+
 bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
     clip_image_f32 clip_img;
     clip_img.buf.resize(h * w * 3);

diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h
@@ -91,6 +91,7 @@ bool clip_image_encode      (struct clip_ctx * ctx, int n_threads, struct clip_i
 bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
 
 int clip_is_minicpmv(const struct clip_ctx * ctx);
+void clip_set_minicpmv_max_slice_nums(struct clip_ctx * ctx, int n); // sets the MiniCPM-V slice limit; n < 0 is clamped to 0, no-op when ctx is NULL
 bool clip_is_glm(const struct clip_ctx * ctx);
 bool clip_is_qwen2vl(const struct clip_ctx * ctx);
 bool clip_is_llava(const struct clip_ctx * ctx);

@@ -6,12 +6,15 @@
 #include "ggml.h"
 #include "console.h"
 #include "chat.h"
+#include "clip.h"
 #include "mtmd.h"
 #include "mtmd-helper.h"
+#include "mtmd-video.h"
 
 #include <vector>
 #include <limits.h>
 #include <cinttypes>
+#include <cstdlib>
 
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
 #include <signal.h>
@@ -154,8 +157,8 @@ struct mtmd_cli_context {
         );
     }
 
-    bool load_media(const std::string & fname) {
-        mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), fname.c_str()));
+    bool load_media(const std::string & path) {
+        mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), path.c_str()));
         if (!bmp.ptr) {
             return false;
         }
@@ -284,7 +287,7 @@ int main(int argc, char ** argv) {
     mtmd_cli_context ctx(params);
     LOG("%s: loading model: %s\n", __func__, params.model.path.c_str());
 
-    bool is_single_turn = !params.prompt.empty() && !params.image.empty();
+    bool is_single_turn = !params.prompt.empty() && (!params.image.empty() || !params.video.empty());
 
     int n_predict = params.n_predict < 0 ? INT_MAX : params.n_predict;
 
@@ -308,19 +311,34 @@ int main(int argc, char ** argv) {
 
     if (is_single_turn) {
         g_is_generating = true;
-        if (params.prompt.find(mtmd_default_marker()) == std::string::npos) {
-            for (size_t i = 0; i < params.image.size(); i++) {
-                params.prompt += mtmd_default_marker();
-            }
-        }
-        common_chat_msg msg;
-        msg.role = "user";
-        msg.content = params.prompt;
+
+        // 1) load all media first
+        size_t n_loaded_media = 0;
         for (const auto & image : params.image) {
             if (!ctx.load_media(image)) {
                 return 1; // error is already printed by libmtmd
             }
+            n_loaded_media += 1;
+        }
+        for (const auto & vpath : params.video) {
+            if (!ctx.load_media(vpath)) {
+                return 1; // error is already printed by libmtmd
+            }
+            n_loaded_media += 1;
         }
+
+        // 2) build prompt content with correct number of markers
+        std::string prompt_content = params.prompt;
+        if (prompt_content.find(mtmd_default_marker()) == std::string::npos) {
+            for (size_t i = 0; i < n_loaded_media; i++) {
+                prompt_content += mtmd_default_marker();
+            }
+        }
+
+        // 3) run
+        common_chat_msg msg;
+        msg.role = "user";
+        msg.content = prompt_content;
         if (eval_message(ctx, msg)) {
             return 1;
         }
@@ -336,6 +354,9 @@ int main(int argc, char ** argv) {
         if (mtmd_support_audio(ctx.ctx_vision.get())) {
             LOG("\n   /audio <path>    load an audio");
         }
+        if (mtmd_support_vision(ctx.ctx_vision.get())) { // only advertise /video when vision input is supported
+            LOG("\n   /video <path>    load a video");
+        }
         LOG("\n   /clear           clear the chat history");
         LOG("\n   /quit or /exit   exit the program");
         LOG("\n");
@@ -367,14 +388,15 @@ int main(int argc, char ** argv) {
             g_is_generating = true;
             bool is_image = line == "/image" || line.find("/image ") == 0;
             bool is_audio = line == "/audio" || line.find("/audio ") == 0;
-            if (is_image || is_audio) {
+            bool is_video = line == "/video" || line.find("/video ") == 0;
+            if (is_image || is_audio || is_video) {
                 if (line.size() < 8) {
                     LOG_ERR("ERR: Missing media filename\n");
                     continue;
                 }
                 std::string media_path = line.substr(7);
                 if (ctx.load_media(media_path)) {
-                    LOG("%s %s loaded\n", media_path.c_str(), is_image ? "image" : "audio");
+                    LOG("%s %s loaded\n", media_path.c_str(), is_image ? "image" : (is_audio ? "audio" : "video"));
                     content += mtmd_default_marker();
                 }
                 // else, error is already printed by libmtmd