From dcc53b1c9ca92817f1fbc88705bcd17a09720955 Mon Sep 17 00:00:00 2001
From: City <125218114+city96@users.noreply.github.com>
Date: Sun, 11 May 2025 01:21:43 +0200
Subject: [PATCH 1/3] Support InternVL 3 38B and 78B mmproj

---
 gguf-py/gguf/constants.py      |  6 ++++++
 gguf-py/gguf/tensor_mapping.py |  8 ++++++++
 tools/mtmd/clip-impl.h         |  2 ++
 tools/mtmd/clip.cpp            | 15 +++++++++++++++
 4 files changed, 31 insertions(+)

diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index ae5ce71aef939..0e6226b900db9 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -483,7 +483,9 @@ class MODEL_TENSOR(IntEnum):
     V_ENC_EMBD_PATCH = auto()
     V_ENC_EMBD_POS = auto()
     V_ENC_ATTN_Q = auto()
+    V_ENC_ATTN_Q_NORM = auto()
     V_ENC_ATTN_K = auto()
+    V_ENC_ATTN_K_NORM = auto()
     V_ENC_ATTN_V = auto()
     V_ENC_INPUT_NORM = auto()
     V_ENC_OUTPUT = auto()
@@ -742,7 +744,9 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.V_ENC_EMBD_PATCH: "v.patch_embd",
     MODEL_TENSOR.V_ENC_EMBD_POS: "v.position_embd",
     MODEL_TENSOR.V_ENC_ATTN_Q: "v.blk.{bid}.attn_q",
+    MODEL_TENSOR.V_ENC_ATTN_Q_NORM: "v.blk.{bid}.attn_q_norm",
     MODEL_TENSOR.V_ENC_ATTN_K: "v.blk.{bid}.attn_k",
+    MODEL_TENSOR.V_ENC_ATTN_K_NORM: "v.blk.{bid}.attn_k_norm",
     MODEL_TENSOR.V_ENC_ATTN_V: "v.blk.{bid}.attn_v",
     MODEL_TENSOR.V_ENC_INPUT_NORM: "v.blk.{bid}.ln1",
     MODEL_TENSOR.V_ENC_OUTPUT: "v.blk.{bid}.attn_out",
@@ -782,7 +786,9 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.V_ENC_EMBD_PATCH,
     MODEL_TENSOR.V_ENC_EMBD_POS,
     MODEL_TENSOR.V_ENC_ATTN_Q,
+    MODEL_TENSOR.V_ENC_ATTN_Q_NORM,
     MODEL_TENSOR.V_ENC_ATTN_K,
+    MODEL_TENSOR.V_ENC_ATTN_K_NORM,
     MODEL_TENSOR.V_ENC_ATTN_V,
     MODEL_TENSOR.V_ENC_INPUT_NORM,
     MODEL_TENSOR.V_ENC_OUTPUT,
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index bf7ec325772ce..ecf21b2b44142 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -938,6 +938,10 @@ class TensorNameMap:
             "visual.blocks.{bid}.attn.q", # qwen2vl, generated
         ),
 
+        MODEL_TENSOR.V_ENC_ATTN_Q_NORM: (
+            "vision_tower.vision_model.encoder.layers.{bid}.attn.q_norm", # InternVL
+        ),
+
         MODEL_TENSOR.V_ENC_ATTN_K: (
             "vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj",
             "vpm.encoder.layers.{bid}.self_attn.k_proj",
@@ -946,6 +950,10 @@ class TensorNameMap:
             "visual.blocks.{bid}.attn.k", # qwen2vl, generated
         ),
 
+        MODEL_TENSOR.V_ENC_ATTN_K_NORM: (
+            "vision_tower.vision_model.encoder.layers.{bid}.attn.k_norm", # InternVL
+        ),
+
         MODEL_TENSOR.V_ENC_ATTN_V: (
             "vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj",
             "vpm.encoder.layers.{bid}.self_attn.v_proj",
diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h
index d7b788bf979c5..d780ce179e7d9 100644
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@@ -61,6 +61,8 @@
 #define TN_LN_2 "%s.blk.%d.ln2.%s" // layer norm
 #define TN_LS_1 "%s.blk.%d.ls1.%s" // layer scale
 #define TN_LS_2 "%s.blk.%d.ls2.%s" // layer scale
+#define TN_ATTN_Q_NORM "%s.blk.%d.attn_q_norm.%s"
+#define TN_ATTN_K_NORM "%s.blk.%d.attn_k_norm.%s"
 #define TN_LN_PRE "%s.pre_ln.%s"
 #define TN_LN_POST "%s.post_ln.%s"
 #define TN_LLAVA_PROJ "mm.%d.%s"
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 0ebe81b07c1ef..9ac94bf8d0911 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -205,6 +205,9 @@ struct clip_layer {
     ggml_tensor * o_w = nullptr;
     ggml_tensor * o_b = nullptr;
 
+    ggml_tensor * k_norm = nullptr;
+    ggml_tensor * q_norm = nullptr;
+
     // layernorm 1
     ggml_tensor * ln_1_w = nullptr;
     ggml_tensor * ln_1_b = nullptr;
@@ -1363,6 +1366,16 @@ struct clip_graph {
            Vcur = ggml_add(ctx0, Vcur, layer.v_b);
        }
 
+       if (layer.q_norm) {
+           Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
+           cb(Qcur, "Qcur_norm", il);
+       }
+
+       if (layer.k_norm) {
+           Kcur = build_norm(Kcur, layer.q_norm, NULL, norm_t, eps, il);
+           cb(Kcur, "Kcur_norm", il);
+       }
+
        Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
        Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
        Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
@@ -1992,6 +2005,8 @@ struct clip_model_loader {
        layer.ln_2_w = get_tensor(string_format(TN_LN_2, "v", il, "weight"), false);
        layer.ls_1_w = get_tensor(string_format(TN_LS_1, "v", il, "weight"), false); // no bias
        layer.ls_2_w = get_tensor(string_format(TN_LS_2, "v", il, "weight"), false); // no bias
+       layer.k_norm = get_tensor(string_format(TN_ATTN_Q_NORM, "v", il, "weight"), false);
+       layer.q_norm = get_tensor(string_format(TN_ATTN_K_NORM, "v", il, "weight"), false);
 
        layer.k_b = get_tensor(string_format(TN_ATTN_K, "v", il, "bias"), false);
        layer.q_b = get_tensor(string_format(TN_ATTN_Q, "v", il, "bias"), false);

From a505b92f961f81520e2c7a70a4fc91534c41e216 Mon Sep 17 00:00:00 2001
From: City <125218114+city96@users.noreply.github.com>
Date: Sun, 11 May 2025 01:33:51 +0200
Subject: [PATCH 2/3] Swap norms in clip.cpp

---
 tools/mtmd/clip-impl.h | 2 +-
 tools/mtmd/clip.cpp    | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h
index d780ce179e7d9..7729d6ac6ac19 100644
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@@ -61,8 +61,8 @@
 #define TN_LN_2 "%s.blk.%d.ln2.%s" // layer norm
 #define TN_LS_1 "%s.blk.%d.ls1.%s" // layer scale
 #define TN_LS_2 "%s.blk.%d.ls2.%s" // layer scale
-#define TN_ATTN_Q_NORM "%s.blk.%d.attn_q_norm.%s"
 #define TN_ATTN_K_NORM "%s.blk.%d.attn_k_norm.%s"
+#define TN_ATTN_Q_NORM "%s.blk.%d.attn_q_norm.%s"
 #define TN_LN_PRE "%s.pre_ln.%s"
 #define TN_LN_POST "%s.post_ln.%s"
 #define TN_LLAVA_PROJ "mm.%d.%s"
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 9ac94bf8d0911..73b482f607b7b 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -1372,7 +1372,7 @@ struct clip_graph {
        }
 
        if (layer.k_norm) {
-           Kcur = build_norm(Kcur, layer.q_norm, NULL, norm_t, eps, il);
+           Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
            cb(Kcur, "Kcur_norm", il);
        }
 
@@ -2005,8 +2005,8 @@ struct clip_model_loader {
        layer.ln_2_w = get_tensor(string_format(TN_LN_2, "v", il, "weight"), false);
        layer.ls_1_w = get_tensor(string_format(TN_LS_1, "v", il, "weight"), false); // no bias
        layer.ls_2_w = get_tensor(string_format(TN_LS_2, "v", il, "weight"), false); // no bias
-       layer.k_norm = get_tensor(string_format(TN_ATTN_Q_NORM, "v", il, "weight"), false);
-       layer.q_norm = get_tensor(string_format(TN_ATTN_K_NORM, "v", il, "weight"), false);
+       layer.k_norm = get_tensor(string_format(TN_ATTN_K_NORM, "v", il, "weight"), false);
+       layer.q_norm = get_tensor(string_format(TN_ATTN_Q_NORM, "v", il, "weight"), false);
 
        layer.k_b = get_tensor(string_format(TN_ATTN_K, "v", il, "bias"), false);
        layer.q_b = get_tensor(string_format(TN_ATTN_Q, "v", il, "bias"), false);

From 814734e1522f075045dc9ed0ad8c7cedd5eabfe4 Mon Sep 17 00:00:00 2001
From: City <125218114+city96@users.noreply.github.com>
Date: Sun, 11 May 2025 01:40:50 +0200
Subject: [PATCH 3/3] Group variables together

---
 tools/mtmd/clip-impl.h | 4 ++--
 tools/mtmd/clip.cpp    | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h
index 7729d6ac6ac19..23036ba72f1c1 100644
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@@ -53,6 +53,8 @@
 #define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
 #define TN_ATTN_V "%s.blk.%d.attn_v.%s"
 #define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s"
+#define TN_ATTN_K_NORM "%s.blk.%d.attn_k_norm.%s"
+#define TN_ATTN_Q_NORM "%s.blk.%d.attn_q_norm.%s"
 #define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s"
 #define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s"
 #define TN_FFN_UP "%s.blk.%d.ffn_up.%s"
@@ -61,8 +63,6 @@
 #define TN_LN_2 "%s.blk.%d.ln2.%s" // layer norm
 #define TN_LS_1 "%s.blk.%d.ls1.%s" // layer scale
 #define TN_LS_2 "%s.blk.%d.ls2.%s" // layer scale
-#define TN_ATTN_K_NORM "%s.blk.%d.attn_k_norm.%s"
-#define TN_ATTN_Q_NORM "%s.blk.%d.attn_q_norm.%s"
 #define TN_LN_PRE "%s.pre_ln.%s"
 #define TN_LN_POST "%s.post_ln.%s"
 #define TN_LLAVA_PROJ "mm.%d.%s"
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 73b482f607b7b..735dfe7f78029 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -2001,12 +2001,12 @@ struct clip_model_loader {
        layer.q_w = get_tensor(string_format(TN_ATTN_Q, "v", il, "weight"));
        layer.v_w = get_tensor(string_format(TN_ATTN_V, "v", il, "weight"));
        layer.o_w = get_tensor(string_format(TN_ATTN_OUTPUT, "v", il, "weight"));
+       layer.k_norm = get_tensor(string_format(TN_ATTN_K_NORM, "v", il, "weight"), false);
+       layer.q_norm = get_tensor(string_format(TN_ATTN_Q_NORM, "v", il, "weight"), false);
        layer.ln_1_w = get_tensor(string_format(TN_LN_1, "v", il, "weight"), false);
        layer.ln_2_w = get_tensor(string_format(TN_LN_2, "v", il, "weight"), false);
        layer.ls_1_w = get_tensor(string_format(TN_LS_1, "v", il, "weight"), false); // no bias
        layer.ls_2_w = get_tensor(string_format(TN_LS_2, "v", il, "weight"), false); // no bias
-       layer.k_norm = get_tensor(string_format(TN_ATTN_K_NORM, "v", il, "weight"), false);
-       layer.q_norm = get_tensor(string_format(TN_ATTN_Q_NORM, "v", il, "weight"), false);
 
        layer.k_b = get_tensor(string_format(TN_ATTN_K, "v", il, "bias"), false);
        layer.q_b = get_tensor(string_format(TN_ATTN_Q, "v", il, "bias"), false);
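
Illustrative note, not part of the patches above: the new q_norm / k_norm tensors wire up QK normalization in the InternVL vision blocks. The Q and K projections are normalized over the hidden dimension before the per-head reshape and the attention scores, which is exactly where the clip_graph hunk inserts build_norm. The NumPy sketch below shows that ordering under stated assumptions: the function and weight names are hypothetical, an RMS-style norm stands in for whatever norm type and eps the model selects in clip.cpp, and the attention math is plain scaled dot-product.

import numpy as np

def rms_norm(x, weight, eps=1e-6):
    # normalize over the last (hidden) dimension, then apply the learned scale
    return x / np.sqrt(np.mean(x * x, axis=-1, keepdims=True) + eps) * weight

def attn_with_qk_norm(x, wq, wk, wv, q_norm_w, k_norm_w, n_head):
    # x: [n_pos, n_embd]; wq/wk/wv: [n_embd, n_embd] (hypothetical shapes)
    n_pos, n_embd = x.shape
    d_head = n_embd // n_head

    q, k, v = x @ wq, x @ wk, x @ wv

    # QK norm is applied to the full hidden dimension, before splitting into
    # heads, mirroring where the patch calls build_norm on Qcur/Kcur
    q = rms_norm(q, q_norm_w)
    k = rms_norm(k, k_norm_w)

    # split into heads: [n_head, n_pos, d_head]
    q = q.reshape(n_pos, n_head, d_head).transpose(1, 0, 2)
    k = k.reshape(n_pos, n_head, d_head).transpose(1, 0, 2)
    v = v.reshape(n_pos, n_head, d_head).transpose(1, 0, 2)

    # scaled dot-product attention with a numerically stable softmax
    scores = q @ k.transpose(0, 2, 1) / np.sqrt(d_head)
    scores = np.exp(scores - scores.max(axis=-1, keepdims=True))
    weights = scores / scores.sum(axis=-1, keepdims=True)

    # merge heads back to [n_pos, n_embd]
    return (weights @ v).transpose(1, 0, 2).reshape(n_pos, n_embd)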