
Commit 955b18e

wip lfm2 vision model

1 parent: e885445

File tree: 5 files changed, +91 −2 lines

convert_hf_to_gguf.py: 40 additions, 2 deletions

@@ -8251,8 +8251,7 @@ def set_gguf_parameters(self):
             self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling.get("original_max_position_embeddings", 4096))
 
 
-@ModelBase.register("Lfm2ForCausalLM")
-@ModelBase.register("LFM2ForCausalLM")
+@ModelBase.register("Lfm2ForCausalLM", "LFM2ForCausalLM")
 class LFM2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.LFM2
 
@@ -8287,13 +8286,52 @@ def set_gguf_parameters(self):
         self._add_feed_forward_length()
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        is_vision_tensor = "vision_tower" in name or "multi_modal_projector" in name
+        if is_vision_tensor:
+            # skip vision tensors
+            return []
+
+        name = name.replace("language_model.", "")
+
         # conv op requires 2d tensor
         if 'conv.conv' in name:
             data_torch = data_torch.squeeze(1)
 
         return [(self.map_tensor_name(name), data_torch)]
 
 
+@ModelBase.register("Lfm2VlForConditionalGeneration")
+class LFM2VLModel(MmprojModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        assert self.hparams_vision is not None
+        self.hparams_vision["image_size"] = 256
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LFM2)
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))
+        self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("downsample_factor", 2))
+        self.gguf_writer.add_vision_use_gelu(True)
+        self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys) - 1)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+        is_vision_tensor = "vision_tower" in name or "multi_modal_projector" in name
+
+        if is_vision_tensor:
+            # remove "model." prefix
+            name = name.replace("model.vision_tower.", "vision_tower.")
+            name = name.replace("model.multi_modal_projector.", "multi_modal_projector.")
+
+            if "patch_embedding.weight" in name:
+                data_torch = data_torch.view(data_torch.shape[0], 3, 16, 16)
+
+            return [(self.map_tensor_name(name), data_torch)]
+
+        return []  # skip other tensors
+
+
 @ModelBase.register("SmallThinkerForCausalLM")
 class SmallThinkerModel(TextModel):
     model_arch = gguf.MODEL_ARCH.SMALLTHINKER
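
Taken together, the two modify_tensors overrides split a single LFM2-VL checkpoint across two GGUF files: LFM2Model keeps only the language-model weights (dropping the "language_model." prefix), while LFM2VLModel keeps only the vision tower and projector, unflattening the patch embedding on the way. Below is a condensed sketch of that routing; the route_tensor helper and the demo shapes are illustrative only, and the real converters additionally pass each name through map_tensor_name.

```python
import torch

def route_tensor(name: str, data: torch.Tensor, target: str):
    """Return (renamed, tensor) if `name` belongs to `target`
    ("text" or "mmproj"), else None. Illustrative only."""
    is_vision = "vision_tower" in name or "multi_modal_projector" in name
    if target == "text":
        if is_vision:
            return None  # text-model GGUF skips vision tensors
        return name.replace("language_model.", ""), data
    if not is_vision:
        return None      # mmproj GGUF skips language-model tensors
    name = name.replace("model.vision_tower.", "vision_tower.")
    name = name.replace("model.multi_modal_projector.", "multi_modal_projector.")
    if "patch_embedding.weight" in name:
        # stored flat as (n_embd, 3*16*16); clip.cpp expects a
        # conv2d-style kernel of shape (n_embd, 3, 16, 16)
        data = data.view(data.shape[0], 3, 16, 16)
    return name, data

# demo: the flattened patch embedding goes to the mmproj file, reshaped
w = torch.randn(768, 3 * 16 * 16)
print(route_tensor("model.vision_tower.patch_embedding.weight", w, "mmproj")[1].shape)
# torch.Size([768, 3, 16, 16])
print(route_tensor("model.vision_tower.patch_embedding.weight", w, "text"))
# None
```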

gguf-py/gguf/constants.py: 1 addition, 0 deletions

@@ -2832,6 +2832,7 @@ class VisionProjectorType:
     QWEN2A  = "qwen2a"   # audio
     QWEN25O = "qwen2.5o" # omni
     VOXTRAL = "voxtral"
+    LFM2    = "lfm2"
 
 
 # Items here are (block size, type size)

gguf-py/gguf/tensor_mapping.py: 1 addition, 0 deletions

@@ -1272,6 +1272,7 @@ class TensorNameMap:
 
         MODEL_TENSOR.V_MM_INP_NORM: (
             "multi_modal_projector.norm",
+            "multi_modal_projector.layer_norm",
             "pre_mm_projector_norm",
         ),
 
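
The new "multi_modal_projector.layer_norm" alias lets TensorNameMap resolve LFM2-VL's projector norm to the same GGUF base name ("mm.input_norm", loaded via TN_MM_INP_NORM / TN_MM_INP_NORM_B in clip-impl.h below) as the existing entries. A toy Python mirror of that lookup; the alias tuple and helper are illustrative, the real mapping is built from MODEL_TENSORS in gguf-py.

```python
# hypothetical stand-in for the V_MM_INP_NORM entry in TensorNameMap
V_MM_INP_NORM_ALIASES = (
    "multi_modal_projector.norm",
    "multi_modal_projector.layer_norm",  # added in this commit for LFM2-VL
    "pre_mm_projector_norm",
)

def map_projector_norm(hf_name: str) -> str | None:
    base, _, suffix = hf_name.rpartition(".")  # split off "weight" / "bias"
    if base in V_MM_INP_NORM_ALIASES:
        return f"mm.input_norm.{suffix}"
    return None

print(map_projector_norm("multi_modal_projector.layer_norm.bias"))
# mm.input_norm.bias -> picked up via TN_MM_INP_NORM_B at load time
```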

tools/mtmd/clip-impl.h: 3 additions, 0 deletions

@@ -82,6 +82,7 @@
 #define TN_MVLM_PROJ_PEG     "mm.model.peg.%d.%s"
 #define TN_IMAGE_NEWLINE     "model.image_newline"
 #define TN_MM_INP_NORM       "mm.input_norm.weight"
+#define TN_MM_INP_NORM_B     "mm.input_norm.bias"
 #define TN_MM_INP_PROJ       "mm.input_projection.weight" // gemma3
 #define TN_MM_SOFT_EMB_N     "mm.soft_emb_norm.weight" // gemma3
 #define TN_MM_PROJECTOR      "mm.model.fc.weight" // idefics3
@@ -133,6 +134,7 @@ enum projector_type {
     PROJECTOR_TYPE_QWEN2A,
     PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx
     PROJECTOR_TYPE_VOXTRAL,
+    PROJECTOR_TYPE_LFM2,
     PROJECTOR_TYPE_UNKNOWN,
 };
 
@@ -153,6 +155,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_QWEN2A,  "qwen2a"},
     { PROJECTOR_TYPE_QWEN25O, "qwen2.5o"},
     { PROJECTOR_TYPE_VOXTRAL, "voxtral"},
+    { PROJECTOR_TYPE_LFM2,    "lfm2"},
 };
 
 static projector_type clip_projector_type_from_string(const std::string & str) {
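
The "lfm2" string registered here must match what the converter writes (VisionProjectorType.LFM2 in constants.py above): clip_projector_type_from_string() does a reverse lookup through PROJECTOR_TYPE_NAMES, and anything unmatched becomes PROJECTOR_TYPE_UNKNOWN. A minimal Python model of that round trip (toy dict, not the real C++ map):

```python
# toy mirror of PROJECTOR_TYPE_NAMES and the reverse lookup in clip-impl.h
PROJECTOR_TYPE_NAMES = {
    "PROJECTOR_TYPE_VOXTRAL": "voxtral",
    "PROJECTOR_TYPE_LFM2":    "lfm2",  # added in this commit
}

def clip_projector_type_from_string(s: str) -> str:
    return next((t for t, name in PROJECTOR_TYPE_NAMES.items() if name == s),
                "PROJECTOR_TYPE_UNKNOWN")

assert clip_projector_type_from_string("lfm2") == "PROJECTOR_TYPE_LFM2"
assert clip_projector_type_from_string("lfm3") == "PROJECTOR_TYPE_UNKNOWN"
```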

tools/mtmd/clip.cpp: 46 additions, 0 deletions

@@ -265,6 +265,7 @@ struct clip_model {
 
     // LLaVA projection
     ggml_tensor * mm_input_norm_w = nullptr;
+    ggml_tensor * mm_input_norm_b = nullptr;
     ggml_tensor * mm_0_w = nullptr;
     ggml_tensor * mm_0_b = nullptr;
     ggml_tensor * mm_2_w = nullptr;
@@ -542,6 +543,36 @@ struct clip_graph {
                 bsz);
 
             cur = ggml_mul_mat(ctx0, model.projection, cur);
+        } else if (ctx->proj_type() == PROJECTOR_TYPE_LFM2) {
+            const int scale_factor = model.hparams.proj_scale_factor;
+            const int n_embd = cur->ne[0];
+            const int seq    = cur->ne[1];
+            const int bsz    = 1; // batch size, always 1 for now since we don't support batching
+            const int height = std::sqrt(seq);
+            const int width  = std::sqrt(seq);
+            GGML_ASSERT(scale_factor != 0);
+            cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height, bsz);
+            cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
+            cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur),
+                                  n_embd * scale_factor * scale_factor,
+                                  height / scale_factor,
+                                  width / scale_factor,
+                                  bsz);
+            cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
+            cur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, cur),
+                                  n_embd * scale_factor * scale_factor,
+                                  seq / (scale_factor * scale_factor),
+                                  bsz);
+
+            cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
+            cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
+            cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
+
+            cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
+            cur = ggml_add(ctx0, cur, model.mm_1_b);
+            cur = ggml_gelu(ctx0, cur);
+            cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
+            cur = ggml_add(ctx0, cur, model.mm_2_b);
         } else {
             GGML_ABORT("SigLIP: Unsupported projector type");
         }
@@ -1966,6 +1997,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     switch (ctx->proj_type()) {
         case PROJECTOR_TYPE_GEMMA3:
         case PROJECTOR_TYPE_IDEFICS3:
+        case PROJECTOR_TYPE_LFM2:
            {
                res = graph.build_siglip();
            } break;
@@ -2230,6 +2262,7 @@ struct clip_model_loader {
                    }
                } break;
            case PROJECTOR_TYPE_IDEFICS3:
+            case PROJECTOR_TYPE_LFM2:
            case PROJECTOR_TYPE_INTERNVL:
                {
                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
@@ -2533,6 +2566,15 @@ struct clip_model_loader {
                {
                    model.projection = get_tensor(TN_MM_PROJECTOR);
                } break;
+            case PROJECTOR_TYPE_LFM2:
+                {
+                    model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM);
+                    model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B);
+                    model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
+                    model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
+                    model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
+                    model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
+                } break;
            case PROJECTOR_TYPE_PIXTRAL:
                {
                    model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
@@ -3591,6 +3633,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
                n_patches_sq = n_per_side_2d_pool * n_per_side_2d_pool;
            } break;
        case PROJECTOR_TYPE_IDEFICS3:
+        case PROJECTOR_TYPE_LFM2:
        case PROJECTOR_TYPE_INTERNVL:
            {
                // both W and H are divided by proj_scale_factor
@@ -4034,6 +4077,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
        case PROJECTOR_TYPE_INTERNVL:
        case PROJECTOR_TYPE_QWEN2A:
        case PROJECTOR_TYPE_ULTRAVOX:
+        case PROJECTOR_TYPE_LFM2:
        case PROJECTOR_TYPE_VOXTRAL:
            {
                // do nothing
@@ -4135,6 +4179,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
            return ctx->model.mm_model_proj->ne[1];
        case PROJECTOR_TYPE_QWEN2A:
            return ctx->model.mm_fc_w->ne[1];
+        case PROJECTOR_TYPE_LFM2:
+            return ctx->model.mm_2_w->ne[1];
        default:
            GGML_ABORT("Unknown projector type");
    }
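
For orientation, the new PROJECTOR_TYPE_LFM2 branch implements a pixel-unshuffle projector: the reshape/permute pairs fold each scale_factor x scale_factor block of SigLIP tokens into the channel dimension (so the token count drops by scale_factor squared, matching the IDEFICS3-style clip_n_output_tokens path), followed by a LayerNorm with weight and bias and a two-layer GELU MLP (mm_1, mm_2). The following PyTorch sketch reproduces that computation under illustrative names and shapes (lfm2_project, with fc1/fc2 standing in for mm_1/mm_2); ggml_gelu corresponds to the tanh-approximate GELU.

```python
import math
import torch
import torch.nn.functional as F

def lfm2_project(tokens,            # (seq, n_embd) SigLIP output, seq a square
                 norm_w, norm_b,    # (n_embd * s * s,) each
                 fc1_w, fc1_b,      # (n_ff, n_embd * s * s), (n_ff,)
                 fc2_w, fc2_b,      # (n_lm_embd, n_ff), (n_lm_embd,)
                 s: int = 2):
    seq, c = tokens.shape
    side = math.isqrt(seq)                            # height == width
    # pixel unshuffle: fold each s x s block of tokens into the channels,
    # mirroring the reshape_4d/permute pairs in the ggml graph above
    x = tokens.view(side // s, s, side // s, s, c)    # (y0, ys, x0, xs, c)
    x = x.permute(0, 2, 1, 3, 4).contiguous()         # (y0, x0, ys, xs, c)
    x = x.view(seq // (s * s), s * s * c)             # s^2 fewer tokens
    # LayerNorm with affine params: ggml_norm(eps=1e-5) then mul/add
    x = F.layer_norm(x, (s * s * c,), norm_w, norm_b, eps=1e-5)
    # mm_1 -> GELU -> mm_2 (ggml_gelu uses the tanh approximation)
    x = F.gelu(x @ fc1_w.T + fc1_b, approximate="tanh")
    return x @ fc2_w.T + fc2_b

# demo: 32x32 patch grid, n_embd=768, s=2 -> 256 tokens in LM embed space,
# consistent with clip_n_mmproj_embd() returning mm_2_w->ne[1]
out = lfm2_project(torch.randn(1024, 768),
                   torch.ones(768 * 4), torch.zeros(768 * 4),
                   torch.randn(2048, 768 * 4), torch.zeros(2048),
                   torch.randn(1536, 2048), torch.zeros(1536))
print(out.shape)  # torch.Size([256, 1536])
```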
