mtmd: improve struct initialization (ggml-org#16981)

ngxson · web-flow · commit 2f0c2db43e2a · 2025-11-05T11:26:37.000+01:00
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
@@ -2761,6 +2761,7 @@ struct clip_model_loader {
                     {
                         // ref: https://huggingface.co/mistral-community/pixtral-12b/blob/main/preprocessor_config.json
                         // TODO: verify the image_min_tokens
+                        hparams.n_merge = 1; // the original pixtral does not use patch merging
                         hparams.rope_theta = 10000.0f;
                         get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
                         hparams.set_limit_image_tokens(8, 1024);
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
@@ -101,16 +101,17 @@ static clip_flash_attn_type mtmd_get_clip_flash_attn_type(enum llama_flash_attn_
 }
 
 mtmd_context_params mtmd_context_params_default() {
-    mtmd_context_params params;
-    params.use_gpu = true;
-    params.print_timings = true;
-    params.n_threads = 4;
-    params.verbosity = GGML_LOG_LEVEL_INFO;
-    params.image_marker = MTMD_DEFAULT_IMAGE_MARKER;
-    params.media_marker = mtmd_default_marker();
-    params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
-    params.image_min_tokens = -1;
-    params.image_max_tokens = -1;
+    mtmd_context_params params {
+        /* use_gpu           */ true,
+        /* print_timings     */ true,
+        /* n_threads         */ 4,
+        /* verbosity         */ GGML_LOG_LEVEL_INFO,
+        /* image_marker      */ MTMD_DEFAULT_IMAGE_MARKER,
+        /* media_marker      */ mtmd_default_marker(),
+        /* flash_attn_type   */ LLAMA_FLASH_ATTN_TYPE_AUTO,
+        /* image_min_tokens  */ -1,
+        /* image_max_tokens  */ -1,
+    };
     return params;
 }
 
@@ -172,13 +173,13 @@ struct mtmd_context {
             throw std::runtime_error("media_marker must not be empty");
         }
 
-        clip_context_params ctx_clip_params;
-        ctx_clip_params.use_gpu          = ctx_params.use_gpu;
-        ctx_clip_params.verbosity        = ctx_params.verbosity;
-        ctx_clip_params.flash_attn_type  = mtmd_get_clip_flash_attn_type(ctx_params.flash_attn_type);
-        // custom image token limits
-        ctx_clip_params.image_min_tokens = ctx_params.image_min_tokens;
-        ctx_clip_params.image_max_tokens = ctx_params.image_max_tokens;
+        clip_context_params ctx_clip_params {
+            /* use_gpu           */ ctx_params.use_gpu,
+            /* verbosity         */ ctx_params.verbosity,
+            /* flash_attn_type   */ CLIP_FLASH_ATTN_TYPE_AUTO,
+            /* image_min_tokens  */ ctx_params.image_min_tokens,
+            /* image_max_tokens  */ ctx_params.image_max_tokens,
+        };
 
         auto res = clip_init(mmproj_fname, ctx_clip_params);
         ctx_v = res.ctx_v;

Original file line number	Diff line number	Diff line change
`@@ -2761,6 +2761,7 @@ struct clip_model_loader {`
`2761`	`2761`	`{`
`2762`	`2762`	`// ref: https://huggingface.co/mistral-community/pixtral-12b/blob/main/preprocessor_config.json`
`2763`	`2763`	`// TODO: verify the image_min_tokens`
	`2764`	`+ hparams.n_merge = 1; // the original pixtral does not use patch merging`
`2764`	`2765`	`hparams.rope_theta = 10000.0f;`
`2765`	`2766`	`get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);`
`2766`	`2767`	`hparams.set_limit_image_tokens(8, 1024);`