Skip to content

Commit e309f16

Browse files
committed
convert : internvl support
1 parent d891942 commit e309f16

File tree

3 files changed

+80
-1
lines changed

3 files changed

+80
-1
lines changed

convert_hf_to_gguf.py

Lines changed: 61 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -426,7 +426,11 @@ def load_hparams(dir_model: Path):
426426
logger.warning(f"Failed to load model config from {dir_model}: {e}")
427427
logger.warning("Trying to load config.json instead")
428428
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
429-
return json.load(f)
429+
config = json.load(f)
430+
if "llm_config" in config:
431+
# rename for InternVL
432+
config["text_config"] = config["llm_config"]
433+
return config
430434

431435
@classmethod
432436
def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
@@ -2606,6 +2610,11 @@ def set_gguf_parameters(self):
26062610
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
    """Remap checkpoint tensor names to the Qwen2ForCausalLM layout and drop vision tensors.

    NOTE: this is a generator (see the ``yield from`` below); the bare
    ``return []`` merely stops iteration early, so skipped tensors simply
    yield nothing.
    """
    if self.hf_arch == "Qwen2Model":
        name = f"model.{name}"  # map to Qwen2ForCausalLM tensors
    if "language_model." in name:
        name = name.replace("language_model.", "")  # for InternVL
    if name.startswith("mlp") or name.startswith("vision_model"):
        # skip visual tensors
        return []
    yield from super().modify_tensors(data_torch, name, bid)
26102619

26112620

@@ -2709,6 +2718,57 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
27092718
return [] # skip other tensors
27102719

27112720

2721+
@ModelBase.register("InternVisionModel")
class InternVisionModel(VisionModel):
    """Converter for InternVL's InternVisionModel vision encoder checkpoints."""

    def set_gguf_parameters(self):
        """Write InternVL-specific vision hyperparameters to the GGUF file.

        Raises:
            ValueError: if ``hidden_act`` is neither "silu" nor "gelu".
        """
        super().set_gguf_parameters()
        hparams = self.hparams
        self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.INTERNVL)
        # hidden_act
        if hparams["hidden_act"] == "silu":
            self.gguf_writer.add_vision_use_silu(True)
        elif hparams["hidden_act"] == "gelu":
            self.gguf_writer.add_vision_use_gelu(True)
        else:
            raise ValueError(f"Unsupported hidden_act: {hparams['hidden_act']}")

    def tensor_force_quant(self, name, new_name, bid, n_dims):
        """Pin embedding precision: patch embedding to F16, position embedding to F32."""
        del bid, name, n_dims  # unused
        if ".patch_embd." in new_name:
            return gguf.GGMLQuantizationType.F16
        if ".position_embd." in new_name:
            return gguf.GGMLQuantizationType.F32
        return False  # no forced quantization for the remaining tensors

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Rename InternVL vision tensors and split the fused QKV weight/bias.

        Returns a list of ``(gguf_name, tensor)`` pairs; tensors that do not
        belong to the vision tower or the mm projector are skipped.
        """
        del bid  # unused
        if name.startswith("vision_model") or name.startswith("mlp"):
            # process visual tensors
            # correct name
            if name.startswith("vision_model"):
                name = "vision_tower." + name
            if ".ls" in name and not name.endswith(".weight"):
                # layer-scale params are stored bare; the mapping expects a ".weight" suffix
                name += ".weight"
            # split QKV tensors if needed
            if ".qkv." in name:
                if data_torch.ndim == 2:  # weight
                    c3, _ = data_torch.shape
                else:  # bias
                    c3 = data_torch.shape[0]
                # validate with an explicit raise instead of `assert`, which
                # would be silently stripped when Python runs with -O
                if c3 % 3 != 0:
                    raise ValueError(f"Unexpected QKV tensor {name}: first dim {c3} is not divisible by 3")
                c = c3 // 3
                wq = data_torch[:c]
                wk = data_torch[c: c * 2]
                wv = data_torch[c * 2:]
                return [
                    (self.map_tensor_name(name.replace("attn.qkv", "self_attn.q_proj")), wq),
                    (self.map_tensor_name(name.replace("attn.qkv", "self_attn.k_proj")), wk),
                    (self.map_tensor_name(name.replace("attn.qkv", "self_attn.v_proj")), wv),
                ]
            return [(self.map_tensor_name(name), data_torch)]
        return []  # skip other tensors
2770+
2771+
27122772
@ModelBase.register("WavTokenizerDec")
27132773
class WavTokenizerDecModel(TextModel):
27142774
model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC

gguf-py/gguf/constants.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -491,6 +491,8 @@ class MODEL_TENSOR(IntEnum):
491491
V_ENC_FFN_UP = auto()
492492
V_ENC_FFN_GATE = auto()
493493
V_ENC_FFN_DOWN = auto()
494+
V_LAYER_SCALE_1 = auto()
495+
V_LAYER_SCALE_2 = auto()
494496
V_PRE_NORM = auto()
495497
V_POST_NORM = auto()
496498
V_MM_INP_NORM = auto()
@@ -748,6 +750,8 @@ class MODEL_TENSOR(IntEnum):
748750
MODEL_TENSOR.V_ENC_FFN_UP: "v.blk.{bid}.ffn_up",
749751
MODEL_TENSOR.V_ENC_FFN_GATE: "v.blk.{bid}.ffn_gate",
750752
MODEL_TENSOR.V_ENC_FFN_DOWN: "v.blk.{bid}.ffn_down",
753+
MODEL_TENSOR.V_LAYER_SCALE_1: "v.blk.{bid}.ls1",
754+
MODEL_TENSOR.V_LAYER_SCALE_2: "v.blk.{bid}.ls2",
751755
MODEL_TENSOR.V_PRE_NORM: "v.pre_ln",
752756
MODEL_TENSOR.V_POST_NORM: "v.post_ln",
753757
MODEL_TENSOR.V_MM_INP_PROJ: "mm.input_projection",
@@ -786,6 +790,8 @@ class MODEL_TENSOR(IntEnum):
786790
MODEL_TENSOR.V_ENC_FFN_UP,
787791
MODEL_TENSOR.V_ENC_FFN_GATE,
788792
MODEL_TENSOR.V_ENC_FFN_DOWN,
793+
MODEL_TENSOR.V_LAYER_SCALE_1,
794+
MODEL_TENSOR.V_LAYER_SCALE_2,
789795
MODEL_TENSOR.V_PRE_NORM,
790796
MODEL_TENSOR.V_POST_NORM,
791797
MODEL_TENSOR.V_MM_INP_PROJ,
@@ -2167,6 +2173,7 @@ class VisionProjectorType:
21672173
PIXTRAL = "pixtral"
21682174
QWEN2VL = "qwen2vl_merger"
21692175
QWEN25VL = "qwen2.5vl_merger"
2176+
INTERNVL = "internvl"
21702177

21712178

21722179
# Items here are (block size, type size)

gguf-py/gguf/tensor_mapping.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -905,6 +905,7 @@ class TensorNameMap:
905905

906906
MODEL_TENSOR.V_MMPROJ_MLP: (
907907
"model.mm_projector.mlp.mlp.{bid}",
908+
"mlp1.{bid}", # InternVL
908909
),
909910

910911
MODEL_TENSOR.V_MMPROJ_PEG: (
@@ -955,6 +956,7 @@ class TensorNameMap:
955956

956957
MODEL_TENSOR.V_ENC_INPUT_NORM: (
957958
"vision_tower.vision_model.encoder.layers.{bid}.layer_norm1",
959+
"vision_tower.vision_model.encoder.layers.{bid}.norm1", # InternVL
958960
"vpm.encoder.layers.{bid}.layer_norm1",
959961
"model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM
960962
"vision_tower.transformer.layers.{bid}.attention_norm", # pixtral
@@ -963,6 +965,7 @@ class TensorNameMap:
963965

964966
MODEL_TENSOR.V_ENC_OUTPUT: (
965967
"vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj",
968+
"vision_tower.vision_model.encoder.layers.{bid}.attn.proj", # InternVL
966969
"vpm.encoder.layers.{bid}.self_attn.out_proj",
967970
"model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM
968971
"vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral
@@ -971,6 +974,7 @@ class TensorNameMap:
971974

972975
MODEL_TENSOR.V_ENC_OUTPUT_NORM: (
973976
"vision_tower.vision_model.encoder.layers.{bid}.layer_norm2",
977+
"vision_tower.vision_model.encoder.layers.{bid}.norm2", # InternVL
974978
"vpm.encoder.layers.{bid}.layer_norm2",
975979
"model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM
976980
"vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral
@@ -1000,6 +1004,14 @@ class TensorNameMap:
10001004
"visual.blocks.{bid}.mlp.down_proj", # qwen2.5vl
10011005
),
10021006

1007+
MODEL_TENSOR.V_LAYER_SCALE_1: (
1008+
"vision_tower.vision_model.encoder.layers.{bid}.ls1", # InternVL
1009+
),
1010+
1011+
MODEL_TENSOR.V_LAYER_SCALE_2: (
1012+
"vision_tower.vision_model.encoder.layers.{bid}.ls2", # InternVL
1013+
),
1014+
10031015
MODEL_TENSOR.V_PRE_NORM: (
10041016
"vision_tower.vision_model.pre_layrnorm",
10051017
"vision_tower.ln_pre", # pixtral

0 commit comments

Comments
 (0)