Skip to content

Commit c53c566

Browse files
committed
improve hparams
1 parent 66d5c43 commit c53c566

File tree

1 file changed

+36
-37
lines changed

1 file changed

+36
-37
lines changed

tools/mtmd/clip.cpp

Lines changed: 36 additions & 37 deletions
Original file line number | Diff line number | Diff line change
@@ -174,7 +174,7 @@ struct clip_hparams {
174174
int32_t image_longest_edge = 0;
175175
int32_t image_min_pixels = 0;
176176
int32_t image_max_pixels = 0;
177-
int32_t proj_scale_factor = 0;
177+
int32_t proj_scale_factor = 0; // = (spatial_merge_size)^2
178178

179179
float image_mean[3];
180180
float image_std[3];
@@ -196,7 +196,6 @@ struct clip_hparams {
196196
std::unordered_set<int32_t> vision_feature_layer;
197197
int32_t attn_window_size = 0;
198198
int32_t n_wa_pattern = 0;
199-
int32_t spatial_merge_size = 0;
200199

201200
// audio
202201
int32_t n_mel_bins = 0; // whisper preprocessor
@@ -209,9 +208,16 @@ struct clip_hparams {
209208

210209
// used by LFM2 and KIMI-VL
211210
void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) {
212-
const int total_factor = patch_size * proj_scale_factor;
213-
image_min_pixels = n_tokens_min * total_factor * total_factor;
214-
image_max_pixels = n_tokens_max * total_factor * total_factor;
211+
const int patch_area = patch_size * patch_size * proj_scale_factor;
212+
image_min_pixels = n_tokens_min * patch_area;
213+
image_max_pixels = n_tokens_max * patch_area;
214+
warmup_image_size = static_cast<int>(std::sqrt(image_max_pixels));
215+
}
216+
217+
void set_warmup_n_tokens(int n_tokens) {
218+
int n_tok_per_side = static_cast<int>(std::sqrt(n_tokens));
219+
GGML_ASSERT(n_tok_per_side * n_tok_per_side == n_tokens && "n_tokens must be n*n");
220+
warmup_image_size = n_tok_per_side * patch_size * static_cast<int>(std::sqrt(proj_scale_factor));
215221
}
216222
};
217223

@@ -593,7 +599,7 @@ struct clip_graph {
593599
}
594600

595601
ggml_cgraph * build_pixtral() {
596-
const int n_merge = hparams.spatial_merge_size;
602+
const int n_merge = hparams.proj_scale_factor;
597603

598604
// 2D input positions
599605
ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
@@ -619,7 +625,7 @@ struct clip_graph {
619625
// mistral small 3.1 patch merger
620626
// ref: https://github.com/huggingface/transformers/blob/7a3e208892c06a5e278144eaf38c8599a42f53e7/src/transformers/models/mistral3/modeling_mistral3.py#L67
621627
if (model.mm_patch_merger_w) {
622-
GGML_ASSERT(hparams.spatial_merge_size > 0);
628+
GGML_ASSERT(hparams.proj_scale_factor > 0);
623629

624630
cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.mm_input_norm_w);
625631

@@ -935,7 +941,7 @@ struct clip_graph {
935941

936942
// deepstack features (stack along the feature dimension), [n_embd * len(deepstack_layers), n_patches_x * n_patches_y, batch_size]
937943
ggml_tensor * deepstack_features = nullptr;
938-
const int merge_factor = hparams.spatial_merge_size > 0 ? hparams.spatial_merge_size * hparams.spatial_merge_size : 4; // default 2x2=4 for qwen3vl
944+
const int merge_factor = hparams.proj_scale_factor > 0 ? hparams.proj_scale_factor * hparams.proj_scale_factor : 4; // default 2x2=4 for qwen3vl
939945

940946
// loop over layers
941947
for (int il = 0; il < n_layer; il++) {
@@ -2700,25 +2706,32 @@ struct clip_model_loader {
27002706
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
27012707
} break;
27022708
case PROJECTOR_TYPE_IDEFICS3:
2709+
{
2710+
hparams.set_limit_image_tokens(8, 1024);
2711+
hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
2712+
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
2713+
} break;
27032714
case PROJECTOR_TYPE_LFM2:
27042715
{
2705-
hparams.set_limit_image_tokens(64, 1024);
2716+
hparams.set_limit_image_tokens(8, 256);
27062717
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
27072718
} break;
27082719
case PROJECTOR_TYPE_PIXTRAL:
27092720
case PROJECTOR_TYPE_LIGHTONOCR:
27102721
{
27112722
hparams.rope_theta = 10000.0f;
2712-
hparams.warmup_image_size = hparams.patch_size * 8;
2713-
hparams.set_limit_image_tokens(64, 1024);
2714-
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.spatial_merge_size, false);
2723+
int spatial_merge = 2;
2724+
get_u32(KEY_SPATIAL_MERGE_SIZE, spatial_merge, false);
2725+
hparams.proj_scale_factor = spatial_merge * spatial_merge;
2726+
hparams.set_limit_image_tokens(8, 1024);
2727+
hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
27152728
} break;
27162729
case PROJECTOR_TYPE_KIMIVL:
27172730
{
27182731
hparams.rope_theta = 10000.0f;
2719-
hparams.warmup_image_size = hparams.patch_size * 8;
2720-
hparams.set_limit_image_tokens(64, 1024);
27212732
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
2733+
hparams.set_limit_image_tokens(8, 1024);
2734+
hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
27222735
} break;
27232736
case PROJECTOR_TYPE_GEMMA3:
27242737
{
@@ -2729,29 +2742,15 @@ struct clip_model_loader {
27292742
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
27302743
} break;
27312744
case PROJECTOR_TYPE_QWEN2VL:
2732-
{
2733-
// max image size = sqrt(max_pixels) = 3584
2734-
// ref: https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct/blob/main/preprocessor_config.json
2735-
// however, the model use unreasonable memory past 1024 size, we force it to 1024 otherwise it's unusable
2736-
// ref: https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct/discussions/10
2737-
hparams.image_size = 1024;
2738-
hparams.warmup_image_size = hparams.patch_size * 8;
2739-
} break;
27402745
case PROJECTOR_TYPE_QWEN25VL:
2741-
{
2742-
// max image size = sqrt(max_pixels)
2743-
// https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/blob/main/preprocessor_config.json
2744-
// however, the model use unreasonable memory past 1024 size, we force it to 1024 otherwise it's unusable
2745-
// ref: https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct/discussions/10
2746-
hparams.image_size = 1024;
2747-
hparams.warmup_image_size = hparams.patch_size * 8;
2748-
get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern);
2749-
} break;
27502746
case PROJECTOR_TYPE_QWEN3VL:
27512747
{
2752-
hparams.image_size = 1024; // still need this?
2753-
hparams.warmup_image_size = hparams.patch_size * 8;
2754-
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.spatial_merge_size, false);
2748+
int spatial_merge = 2;
2749+
get_u32(KEY_SPATIAL_MERGE_SIZE, spatial_merge, false);
2750+
hparams.proj_scale_factor = spatial_merge * spatial_merge;
2751+
get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern, model.proj_type == PROJECTOR_TYPE_QWEN25VL); // only 2.5 requires it
2752+
hparams.set_limit_image_tokens(8, 1024);
2753+
hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
27552754
} break;
27562755
case PROJECTOR_TYPE_LLAMA4:
27572756
{
@@ -2791,8 +2790,8 @@ struct clip_model_loader {
27912790
LOG_INF("%s: minicpmv_version: %d\n", __func__, hparams.minicpmv_version);
27922791
LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor);
27932792
LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
2794-
if (hparams.spatial_merge_size > 0) {
2795-
LOG_INF("%s: spatial_merge_size: %d\n", __func__, hparams.spatial_merge_size);
2793+
if (hparams.proj_scale_factor > 0) {
2794+
LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor);
27962795
}
27972796
} else if (is_audio) {
27982797
LOG_INF("\n--- audio hparams ---\n");
@@ -4310,7 +4309,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
43104309
case PROJECTOR_TYPE_LIGHTONOCR:
43114310
{
43124311
// dynamic size
4313-
int n_merge = params.spatial_merge_size;
4312+
int n_merge = params.proj_scale_factor;
43144313
int n_patches_x = img->nx / patch_size / (n_merge > 0 ? n_merge : 1);
43154314
int n_patches_y = img->ny / patch_size / (n_merge > 0 ? n_merge : 1);
43164315
if (ctx->model.token_embd_img_break) {

0 commit comments

Comments (0)