From e81b87fea630288bf0a83751a1fb4d6e63fd6a2e Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 13 Aug 2025 13:52:51 +0200
Subject: [PATCH 1/7] wip lfm2 vision model

---
 convert_hf_to_gguf.py          | 42 +++++++++++++++++++++++++++++--
 gguf-py/gguf/constants.py      |  1 +
 gguf-py/gguf/tensor_mapping.py |  1 +
 tools/mtmd/clip-impl.h         |  3 +++
 tools/mtmd/clip.cpp            | 46 ++++++++++++++++++++++++++++++++++
 5 files changed, 91 insertions(+), 2 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 444e2cbdfbb6a..999eac86af4c6 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -8251,8 +8251,7 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling.get("original_max_position_embeddings", 4096))


-@ModelBase.register("Lfm2ForCausalLM")
-@ModelBase.register("LFM2ForCausalLM")
+@ModelBase.register("Lfm2ForCausalLM", "LFM2ForCausalLM")
 class LFM2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.LFM2

@@ -8287,6 +8286,13 @@ def set_gguf_parameters(self):
         self._add_feed_forward_length()

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        is_vision_tensor = "vision_tower" in name or "multi_modal_projector" in name
+        if is_vision_tensor:
+            # skip vision tensors
+            return []
+
+        name = name.replace("language_model.", "")
+
         # conv op requires 2d tensor
         if 'conv.conv' in name:
             data_torch = data_torch.squeeze(1)
@@ -8294,6 +8300,38 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return [(self.map_tensor_name(name), data_torch)]


+@ModelBase.register("Lfm2VlForConditionalGeneration")
+class LFM2VLModel(MmprojModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        assert self.hparams_vision is not None
+        self.hparams_vision["image_size"] = 256
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LFM2)
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))
+        self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("downsample_factor", 2))
+        self.gguf_writer.add_vision_use_gelu(True)
+        self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys) - 1)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid # unused
+        is_vision_tensor = "vision_tower" in name or "multi_modal_projector" in name
+
+        if is_vision_tensor:
+            # remove "model." prefix
+            name = name.replace("model.vision_tower.", "vision_tower.")
+            name = name.replace("model.multi_modal_projector.", "multi_modal_projector.")
+
+            if "patch_embedding.weight" in name:
+                data_torch = data_torch.view(data_torch.shape[0], 3, 16, 16)
+
+            return [(self.map_tensor_name(name), data_torch)]
+
+        return [] # skip other tensors
+
+
 @ModelBase.register("SmallThinkerForCausalLM")
 class SmallThinkerModel(TextModel):
     model_arch = gguf.MODEL_ARCH.SMALLTHINKER
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 911eea504a19e..41804f3a2bb1a 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -2832,6 +2832,7 @@ class VisionProjectorType:
     QWEN2A  = "qwen2a" # audio
     QWEN25O = "qwen2.5o" # omni
     VOXTRAL = "voxtral"
+    LFM2    = "lfm2"


 # Items here are (block size, type size)
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index c5c27980905de..87edaa3232ccc 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -1272,6 +1272,7 @@ class TensorNameMap:
         MODEL_TENSOR.V_MM_INP_NORM: (
             "multi_modal_projector.norm",
+            "multi_modal_projector.layer_norm",
             "pre_mm_projector_norm",
         ),
diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h
index f1eb633369376..706ed2e3b5e21 100644
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@@ -82,6 +82,7 @@
 #define TN_MVLM_PROJ_PEG    "mm.model.peg.%d.%s"
 #define TN_IMAGE_NEWLINE    "model.image_newline"
 #define TN_MM_INP_NORM      "mm.input_norm.weight"
+#define TN_MM_INP_NORM_B    "mm.input_norm.bias"
 #define TN_MM_INP_PROJ      "mm.input_projection.weight" // gemma3
 #define TN_MM_SOFT_EMB_N    "mm.soft_emb_norm.weight"    // gemma3
 #define TN_MM_PROJECTOR     "mm.model.fc.weight"         // idefics3
@@ -133,6 +134,7 @@ enum projector_type {
     PROJECTOR_TYPE_QWEN2A,
     PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx
     PROJECTOR_TYPE_VOXTRAL,
+    PROJECTOR_TYPE_LFM2,
     PROJECTOR_TYPE_UNKNOWN,
 };

@@ -153,6 +155,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_QWEN2A,  "qwen2a"},
     { PROJECTOR_TYPE_QWEN25O, "qwen2.5o"},
     { PROJECTOR_TYPE_VOXTRAL, "voxtral"},
+    { PROJECTOR_TYPE_LFM2,    "lfm2"},
 };

 static projector_type clip_projector_type_from_string(const std::string & str) {
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index fdaf9738e88cb..a55503a250741 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -265,6 +265,7 @@ struct clip_model {

     // LLaVA projection
     ggml_tensor * mm_input_norm_w = nullptr;
+    ggml_tensor * mm_input_norm_b = nullptr;
     ggml_tensor * mm_0_w = nullptr;
     ggml_tensor * mm_0_b = nullptr;
     ggml_tensor * mm_2_w = nullptr;
@@ -542,6 +543,36 @@ struct clip_graph {
                 bsz);

             cur = ggml_mul_mat(ctx0, model.projection, cur);
+        } else if (ctx->proj_type() == PROJECTOR_TYPE_LFM2) {
+            const int scale_factor = model.hparams.proj_scale_factor;
+            const int n_embd = cur->ne[0];
+            const int seq = cur->ne[1];
+            const int bsz = 1; // batch size, always 1 for now since we don't support batching
+            const int height = std::sqrt(seq);
+            const int width = std::sqrt(seq);
+            GGML_ASSERT(scale_factor != 0);
+            cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height, bsz);
+            cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
+            cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur),
+                                  n_embd * scale_factor * scale_factor,
+                                  height / scale_factor,
+                                  width / scale_factor,
+                                  bsz);
+            cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
+            cur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, cur),
+                                  n_embd * scale_factor * scale_factor,
+                                  seq / (scale_factor * scale_factor),
+                                  bsz);
+
+            cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
+            cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
+            cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
+
+            cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
+            cur = ggml_add(ctx0, cur, model.mm_1_b);
+            cur = ggml_gelu(ctx0, cur);
+            cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
+            cur = ggml_add(ctx0, cur, model.mm_2_b);
         } else {
             GGML_ABORT("SigLIP: Unsupported projector type");
         }
@@ -1966,6 +1997,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     switch (ctx->proj_type()) {
         case PROJECTOR_TYPE_GEMMA3:
         case PROJECTOR_TYPE_IDEFICS3:
+        case PROJECTOR_TYPE_LFM2:
             {
                 res = graph.build_siglip();
             } break;
@@ -2230,6 +2262,7 @@ struct clip_model_loader {
                         }
                     } break;
                 case PROJECTOR_TYPE_IDEFICS3:
+                case PROJECTOR_TYPE_LFM2:
                 case PROJECTOR_TYPE_INTERNVL:
                     {
                         get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
@@ -2533,6 +2566,15 @@ struct clip_model_loader {
                 {
                     model.projection = get_tensor(TN_MM_PROJECTOR);
                 } break;
+            case PROJECTOR_TYPE_LFM2:
+                {
+                    model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM);
+                    model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B);
+                    model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
+                    model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
+                    model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
+                    model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
+                } break;
            case PROJECTOR_TYPE_PIXTRAL:
                {
                    model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
@@ -3591,6 +3633,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
                 n_patches_sq = n_per_side_2d_pool * n_per_side_2d_pool;
             } break;
         case PROJECTOR_TYPE_IDEFICS3:
+        case PROJECTOR_TYPE_LFM2:
         case PROJECTOR_TYPE_INTERNVL:
             {
                 // both W and H are divided by proj_scale_factor
@@ -4034,6 +4077,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         case PROJECTOR_TYPE_INTERNVL:
         case PROJECTOR_TYPE_QWEN2A:
         case PROJECTOR_TYPE_ULTRAVOX:
+        case PROJECTOR_TYPE_LFM2:
         case PROJECTOR_TYPE_VOXTRAL:
             {
                 // do nothing
@@ -4135,6 +4179,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
             return ctx->model.mm_model_proj->ne[1];
         case PROJECTOR_TYPE_QWEN2A:
             return ctx->model.mm_fc_w->ne[1];
+        case PROJECTOR_TYPE_LFM2:
+            return ctx->model.mm_2_w->ne[1];
         default:
             GGML_ABORT("Unknown projector type");
     }

From d1c170508752eacba2b610fa96abef60e0b6b96c Mon Sep 17 00:00:00 2001
From: Tarek Dakhran
Date: Thu, 14 Aug 2025 16:00:02 +0200
Subject: [PATCH 2/7] Fix conv weight

---
 convert_hf_to_gguf.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 999eac86af4c6..be8474fc5c18c 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -8290,7 +8290,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         if is_vision_tensor:
             # skip vision tensors
             return []
-
+
         name = name.replace("language_model.", "")

         # conv op requires 2d tensor
@@ -8325,7 +8325,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             name = name.replace("model.multi_modal_projector.", "multi_modal_projector.")

             if "patch_embedding.weight" in name:
-                data_torch = data_torch.view(data_torch.shape[0], 3, 16, 16)
+                data_torch = data_torch.view(data_torch.shape[0], 16, 16, 3).permute(0, 3, 1, 2)

             return [(self.map_tensor_name(name), data_torch)]

From 8f6bce977897502fa23c92ce8da1f3fbf18a529f Mon Sep 17 00:00:00 2001
From: Tarek Dakhran
Date: Thu, 14 Aug 2025 23:31:14 +0200
Subject: [PATCH 3/7] Implement dynamic resolution

---
 tools/mtmd/clip.cpp | 109 +++++++++++++++++++++++++++++++++++++-------
 1 file changed, 92 insertions(+), 17 deletions(-)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index a55503a250741..51e6955d124f4 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -489,11 +489,17 @@ struct clip_graph {
     ggml_cgraph * build_siglip() {
         ggml_tensor * inp = build_inp();
+
+        ggml_tensor * learned_pos_embd = model.position_embeddings;
+        if (ctx->proj_type() == PROJECTOR_TYPE_LFM2) {
+            learned_pos_embd = resize_position_embeddings();
+        }
+
         ggml_tensor * cur = build_vit(
                                 inp, n_patches,
                                 NORM_TYPE_NORMAL,
                                 hparams.ffn_op,
-                                model.position_embeddings,
+                                learned_pos_embd,
                                 nullptr);

         if (ctx->proj_type() == PROJECTOR_TYPE_GEMMA3) {
@@ -544,26 +550,35 @@ struct clip_graph {
             cur = ggml_mul_mat(ctx0, model.projection, cur);
         } else if (ctx->proj_type() == PROJECTOR_TYPE_LFM2) {
+            // pixel unshuffle block
             const int scale_factor = model.hparams.proj_scale_factor;
+            GGML_ASSERT(scale_factor > 1);
+
             const int n_embd = cur->ne[0];
-            const int seq = cur->ne[1];
-            const int bsz = 1; // batch size, always 1 for now since we don't support batching
-            const int height = std::sqrt(seq);
-            const int width = std::sqrt(seq);
-            GGML_ASSERT(scale_factor != 0);
-            cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height, bsz);
+            int width = img.nx / patch_size;
+            int height = img.ny / patch_size;
+
+            // pad width and height to factor
+            const int64_t pad_width = CLIP_ALIGN(width, scale_factor) - width;
+            const int64_t pad_height = CLIP_ALIGN(height, scale_factor) - height;
+            cur = ggml_reshape_3d(ctx0, cur, n_embd, width, height);
+            if (pad_width || pad_height) {
+                cur = ggml_pad(ctx0, cur, 0, pad_width, pad_height, 0);
+                width += pad_width;
+                height += pad_height;
+            }
+
+            // unshuffle h
+            cur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, cur), n_embd * scale_factor, width / scale_factor, height);
             cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
-            cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur),
-                                  n_embd * scale_factor * scale_factor,
-                                  height / scale_factor,
-                                  width / scale_factor,
-                                  bsz);
+
+            // unshuffle w
+            cur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, cur), n_embd * scale_factor * scale_factor, height / scale_factor, width / scale_factor);
             cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
-            cur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, cur),
-                                  n_embd * scale_factor * scale_factor,
-                                  seq / (scale_factor * scale_factor),
-                                  bsz);
+
+            cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, cur), cur->ne[0], cur->ne[1] * cur->ne[2]);
+
+            // projection
             cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
             cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
             cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
@@ -1591,6 +1606,26 @@ struct clip_graph {
         }
     }

+    // siglip2 naflex
+    ggml_tensor * resize_position_embeddings() {
+        ggml_tensor * pos_embd = model.position_embeddings;
+        const int height = img.ny / patch_size;
+        const int width = img.nx / patch_size;
+
+        if (!pos_embd || height * width == pos_embd->ne[1]) {
+            return pos_embd;
+        }
+
+        const int n_pos_embd = std::sqrt(pos_embd->ne[1]);
+        pos_embd = ggml_reshape_3d(ctx0, pos_embd, n_embd, n_pos_embd, n_pos_embd); // -> (n_embd, n_pos_embd, n_pos_embd)
+        pos_embd = ggml_permute(ctx0, pos_embd, 2, 0, 1, 3); // -> (n_pos_embd, n_pos_embd, n_embd)
+        pos_embd = ggml_interpolate(ctx0, pos_embd, width, height, n_embd, 1, 1); // -> (width, height, n_embd)
+        pos_embd = ggml_reshape_2d(ctx0, pos_embd, height * width, n_embd); // -> (height * width, n_embd)
+        pos_embd = ggml_transpose(ctx0, pos_embd); // -> (n_embd, height * width)
+
+        return pos_embd;
+    }
+
     // build vision transformer (ViT) cgraph
     // this function should cover most of the models
     // if your model has specific features, you should probably duplicate this function
@@ -3470,6 +3505,43 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         res_imgs->grid_y = inst.grid_size.height;
         return true;
+
+    } else if (ctx->proj_type() == PROJECTOR_TYPE_LFM2) {
+        GGML_ASSERT(params.proj_scale_factor);
+
+        // smart resize
+        const int width = img->nx;
+        const int height = img->ny;
+        const int total_factor = params.patch_size * params.proj_scale_factor;
+        constexpr int min_image_tokens = 64;
+        constexpr int max_image_tokens = 256;
+        const float min_pixels = min_image_tokens * total_factor * total_factor;
+        const float max_pixels = max_image_tokens * total_factor * total_factor;
+
+        auto round_by_factor = [f = total_factor](float x) { return static_cast<int>(std::nearbyintf(x / static_cast<float>(f))) * f; };
+        auto ceil_by_factor  = [f = total_factor](float x) { return static_cast<int>(std::ceilf(x / static_cast<float>(f))) * f; };
+        auto floor_by_factor = [f = total_factor](float x) { return static_cast<int>(std::floorf(x / static_cast<float>(f))) * f; };
+
+        int h_bar = std::max(total_factor, round_by_factor(height));
+        int w_bar = std::max(total_factor, round_by_factor(width));
+
+        if (h_bar * w_bar > max_pixels) {
+            const auto beta = std::sqrt((height * width) / max_pixels);
+            h_bar = std::max(total_factor, floor_by_factor(height / beta));
+            w_bar = std::max(total_factor, floor_by_factor(width / beta));
+        } else if (h_bar * w_bar < min_pixels) {
+            const auto beta = std::sqrt(min_pixels / (height * width));
+            h_bar = ceil_by_factor(height * beta);
+            w_bar = ceil_by_factor(width * beta);
+        }
+
+        const std::array<uint8_t, 3> pad_color = {122, 116, 104};
+
+        clip_image_u8 resized_img;
+        image_manipulation::resize_and_pad_image(*img, resized_img, clip_image_size{w_bar, h_bar}, pad_color);
+        clip_image_f32_ptr res(clip_image_f32_init());
+        normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std);
+        res_imgs->entries.push_back(std::move(res));
+        return true;
     }

     // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
@@ -3633,7 +3705,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
                 n_patches_sq = n_per_side_2d_pool * n_per_side_2d_pool;
             } break;
         case PROJECTOR_TYPE_IDEFICS3:
-        case PROJECTOR_TYPE_LFM2:
         case PROJECTOR_TYPE_INTERNVL:
             {
                 // both W and H are divided by proj_scale_factor
@@ -3673,6 +3744,10 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
                     n_patches_sq /= 2;
                 }
             } break;
+        case PROJECTOR_TYPE_LFM2:
+            {
+                n_patches_sq = (img->nx / (params.patch_size * params.proj_scale_factor)) * (img->ny / (params.patch_size * params.proj_scale_factor));
+            } break;
         default:
             GGML_ABORT("unsupported projector type");
     }

From 8f20c14c40b0f50fb7a69c9f21e9aa33463e0620 Mon Sep 17 00:00:00 2001
From: Tarek Dakhran
Date: Fri, 15 Aug 2025 16:38:58 +0200
Subject: [PATCH 4/7] Fix cuda

---
 tools/mtmd/clip.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 51e6955d124f4..ae4e0be71dbf1 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -1622,6 +1622,7 @@ struct clip_graph {
         pos_embd = ggml_interpolate(ctx0, pos_embd, width, height, n_embd, 1, 1); // -> (width, height, n_embd)
         pos_embd = ggml_reshape_2d(ctx0, pos_embd, height * width, n_embd); // -> (height * width, n_embd)
         pos_embd = ggml_transpose(ctx0, pos_embd); // -> (n_embd, height * width)
+        pos_embd = ggml_cont(ctx0, pos_embd);

         return pos_embd;
     }

From d39cc2ecb6f66be8113e0eb24a0abd7c88d3d441 Mon Sep 17 00:00:00 2001
From: Tarek Dakhran
Date: Fri, 15 Aug 2025 17:06:28 +0200
Subject: [PATCH 5/7] support LFM2-VL-450M

---
 convert_hf_to_gguf.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index be8474fc5c18c..9541d27fae1a0 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -8305,15 +8305,18 @@ class LFM2VLModel(MmprojModel):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         assert self.hparams_vision is not None
+        # TODO(tarek): for dynamic resolution image_size is not specified, setting here for compatibility
         self.hparams_vision["image_size"] = 256

     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LFM2)
-        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))
-        self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("downsample_factor", 2))
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.find_vparam(["layer_norm_eps"]))
+        self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("downsample_factor"))
         self.gguf_writer.add_vision_use_gelu(True)
-        self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys) - 1)
+        # python notation, e.g. for vision_feature_layer == -1, we pick last layer -> vision_feature_layers_to_drop = 0
+        vision_feature_layers_to_drop = -(self.global_config.get("vision_feature_layer") + 1)
+        self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys) - vision_feature_layers_to_drop)

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid # unused

From b47e68ecdc49162aac9307ade1a9952b0befe87d Mon Sep 17 00:00:00 2001
From: Tarek Dakhran
Date: Fri, 15 Aug 2025 18:42:54 +0200
Subject: [PATCH 6/7] happy CI

---
 convert_hf_to_gguf.py | 4 ++--
 tools/mtmd/clip.cpp   | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 9541d27fae1a0..bd21e55f4a90c 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -8312,10 +8312,10 @@ def set_gguf_parameters(self):
         super().set_gguf_parameters()
         self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LFM2)
         self.gguf_writer.add_vision_attention_layernorm_eps(self.find_vparam(["layer_norm_eps"]))
-        self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("downsample_factor"))
+        self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("downsample_factor", 2))
         self.gguf_writer.add_vision_use_gelu(True)
         # python notation, e.g. for vision_feature_layer == -1, we pick last layer -> vision_feature_layers_to_drop = 0
-        vision_feature_layers_to_drop = -(self.global_config.get("vision_feature_layer") + 1)
+        vision_feature_layers_to_drop = -(self.global_config.get("vision_feature_layer", -1) + 1)
         self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys) - vision_feature_layers_to_drop)

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index ae4e0be71dbf1..754cb4f1ab39b 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -3519,8 +3519,8 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         const float max_pixels = max_image_tokens * total_factor * total_factor;

         auto round_by_factor = [f = total_factor](float x) { return static_cast<int>(std::nearbyintf(x / static_cast<float>(f))) * f; };
-        auto ceil_by_factor  = [f = total_factor](float x) { return static_cast<int>(std::ceilf(x / static_cast<float>(f))) * f; };
-        auto floor_by_factor = [f = total_factor](float x) { return static_cast<int>(std::floorf(x / static_cast<float>(f))) * f; };
+        auto ceil_by_factor  = [f = total_factor](float x) { return static_cast<int>(std::ceil(x / static_cast<float>(f))) * f; };
+        auto floor_by_factor = [f = total_factor](float x) { return static_cast<int>(std::floor(x / static_cast<float>(f))) * f; };

         int h_bar = std::max(total_factor, round_by_factor(height));
         int w_bar = std::max(total_factor, round_by_factor(width));

From 6fce2d590c8c87af61c622241625e419f9241fad Mon Sep 17 00:00:00 2001
From: Tarek Dakhran
Date: Sat, 16 Aug 2025 10:31:42 -0700
Subject: [PATCH 7/7] Remove extra `ggml_conv` and put others into the right place
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Sigbjørn Skjæret
---
 tools/mtmd/clip.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 754cb4f1ab39b..c27f8ebbd9912 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -569,14 +569,14 @@ struct clip_graph {
             }

             // unshuffle h
-            cur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, cur), n_embd * scale_factor, width / scale_factor, height);
-            cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
+            cur = ggml_reshape_3d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height);
+            cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 0, 2, 1, 3));

             // unshuffle w
-            cur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, cur), n_embd * scale_factor * scale_factor, height / scale_factor, width / scale_factor);
-            cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
+            cur = ggml_reshape_3d(ctx0, cur, n_embd * scale_factor * scale_factor, height / scale_factor, width / scale_factor);
+            cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 0, 2, 1, 3));

-            cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, cur), cur->ne[0], cur->ne[1] * cur->ne[2]);
+            cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]);

             // projection
             cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm