mtmd: pad mask for qwen2.5vl

ngxson · ngxson · commit c770cf465968 · 2025-11-02T23:54:17.000+01:00
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
@@ -761,6 +761,14 @@ struct clip_graph {
             ggml_set_name(window_mask, "window_mask");
             ggml_set_input(window_mask);
 
+            // flash attn path: pad the mask's dim 1 (ne[1]) up to a multiple of GGML_KQ_MASK_PAD, then cast to F16
+            if (ctx->flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
+                const int64_t n_rows = window_mask->ne[1];
+                const int     n_pad  = (int) (GGML_PAD(n_rows, GGML_KQ_MASK_PAD) - n_rows);
+                window_mask = ggml_pad(ctx0, window_mask, 0, n_pad, 0, 0);
+                window_mask = ggml_cast(ctx0, window_mask, GGML_TYPE_F16);
+            }
+
             // inpL shape: [n_embd, n_patches_x * n_patches_y, batch_size]
             GGML_ASSERT(batch_size == 1);
             inpL = ggml_reshape_2d(ctx0, inpL, n_embd * 4, n_patches_x * n_patches_y * batch_size / 4);