Commit b065bf3

Merge pull request #21 from yairpatch/master
Adding support for Qwen3-VL by @yairpatch
2 parents: 2dd5641 + a314328

16 files changed: +586 -56 lines changed


convert_hf_to_gguf.py

Lines changed: 197 additions & 1 deletion
@@ -3538,6 +3538,144 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return super().modify_tensors(data_torch, name, bid)


+@ModelBase.register("Qwen3VLMoeForConditionalGeneration")
+class Qwen3VLMoeVisionModel(MmprojModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        assert self.has_vision_encoder
+        assert self.hparams_vision is not None
+
+        # Compute image_size if not present
+        if "image_size" not in self.hparams_vision:
+            # For Qwen3VLMoe, compute from num_position_embeddings
+            num_pos = self.hparams_vision.get("num_position_embeddings", 2304)
+            patch_size = self.hparams_vision.get("patch_size", 16)
+            # num_position_embeddings = (image_size / patch_size) ** 2
+            # So image_size = sqrt(num_position_embeddings) * patch_size
+            import math
+            image_size = int(math.sqrt(num_pos) * patch_size)
+            self.hparams_vision["image_size"] = image_size
+
+        # Rename config values for compatibility
+        self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_heads")
+        self.hparams_vision["num_hidden_layers"] = self.hparams_vision.get("depth")
+
+        self.deepstack_layers: list[int] = list(self.hparams_vision.get("deepstack_visual_indexes", []))
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN3VL)
+
+        if self.hparams_vision is not None:
+            merge_size = self.hparams_vision.get("spatial_merge_size")
+            if merge_size is not None:
+                self.gguf_writer.add_vision_spatial_merge_size(int(merge_size))
+
+            hidden_act = (self.hparams_vision.get("hidden_act") or "").lower()
+            if hidden_act:
+                if "gelu" in hidden_act:
+                    self.gguf_writer.add_vision_use_gelu(True)
+                elif hidden_act == "silu":
+                    self.gguf_writer.add_vision_use_silu(True)
+                else:
+                    raise ValueError(f"Unsupported hidden_act: {hidden_act}")
+
+        # Use text config's rms_norm_eps for vision attention layernorm eps (similar to qwen2vl)
+        rms_norm_eps = self.global_config.get("rms_norm_eps")
+        if rms_norm_eps is None:
+            # Try text_config
+            text_config = self.global_config.get("text_config", {})
+            rms_norm_eps = text_config.get("rms_norm_eps", 1e-6)
+        self.gguf_writer.add_vision_attention_layernorm_eps(rms_norm_eps)
+
+        if self.deepstack_layers:
+            self.gguf_writer.add_vision_deepstack_layers(self.deepstack_layers)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        if name.startswith("model.visual."):
+            name = name.replace("model.visual.", "visual.", 1)
+
+        if name.startswith("visual.deepstack_merger_list."):
+            prefix, rest = name.split(".", maxsplit=3)[2:]
+            idx = int(prefix)
+            target = rest
+
+            tensor_type: gguf.MODEL_TENSOR
+            if target.startswith("norm."):
+                tensor_type = gguf.MODEL_TENSOR.V_DS_NORM
+                suffix = target.split(".", 1)[1]
+            elif target.startswith("linear_fc1."):
+                tensor_type = gguf.MODEL_TENSOR.V_DS_FC1
+                suffix = target.split(".", 1)[1]
+            elif target.startswith("linear_fc2."):
+                tensor_type = gguf.MODEL_TENSOR.V_DS_FC2
+                suffix = target.split(".", 1)[1]
+            else:
+                raise ValueError(f"Unexpected deepstack tensor: {name}")
+
+            new_name = self.format_tensor_name(tensor_type, idx, suffix=f".{suffix}")
+            return [(new_name, data_torch)]
+
+        if name.startswith("visual.merger."):
+            suffix = name.split(".", 2)[2]
+            if suffix.startswith("linear_fc"):
+                fc_idx_str, tail = suffix.split(".", 1)
+                fc_num = int(fc_idx_str.replace("linear_fc", ""))
+                # Qwen3VLMoe has linear_fc1 and linear_fc2
+                # Map to indices 0 and 2 (matching Qwen2VL which uses indices 0 and 2)
+                if fc_num == 1:
+                    fc_idx = 0
+                elif fc_num == 2:
+                    fc_idx = 2
+                else:
+                    raise ValueError(f"unexpected fc index {fc_num} in {name}")
+                new_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, fc_idx, suffix=f".{tail}")
+            elif suffix.startswith("norm."):
+                new_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_POST_NORM, suffix=f".{suffix.split('.', 1)[1]}")
+            else:
+                raise ValueError(f"Unexpected merger tensor: {name}")
+            return [(new_name, data_torch)]
+
+        if name == "visual.patch_embed.proj.weight":
+            # split Conv3D into Conv2Ds along temporal dimension
+            c1, c2, kt, _, _ = data_torch.shape
+            del c1, c2
+            if kt != 2:
+                raise ValueError("Current implementation only supports temporal_patch_size of 2")
+            return [
+                (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight", data_torch[:, :, 0, ...]),
+                (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight.1", data_torch[:, :, 1, ...]),
+            ]
+
+        if name == "visual.patch_embed.proj.bias":
+            # Skip bias for Qwen3VL - the C++ code expects it to be null
+            return []
+
+        if name.startswith("visual."):
+            if ".qkv." in name:
+                if data_torch.ndim == 2:
+                    c3, _ = data_torch.shape
+                else:
+                    c3 = data_torch.shape[0]
+                if c3 % 3 != 0:
+                    raise ValueError(f"Unexpected QKV shape for {name}: {data_torch.shape}")
+                c = c3 // 3
+                wq = data_torch[:c]
+                wk = data_torch[c: c * 2]
+                wv = data_torch[c * 2:]
+                base = name.replace("qkv", "{placeholder}")
+                return [
+                    (self.map_tensor_name(base.format(placeholder="q")), wq),
+                    (self.map_tensor_name(base.format(placeholder="k")), wk),
+                    (self.map_tensor_name(base.format(placeholder="v")), wv),
+                ]
+
+            return [(self.map_tensor_name(name), data_torch)]
+
+        return []
+
 @ModelBase.register("InternVisionModel")
 class InternVisionModel(MmprojModel):
     def set_gguf_parameters(self):
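
A quick way to sanity-check the arithmetic in Qwen3VLMoeVisionModel above is to run the two conversions on toy values. This is a minimal sketch in plain Python; the hparam defaults (2304, 16) come from the diff, while the fused QKV width is a made-up illustration rather than a value from the PR:

    import math

    # image_size derivation: num_position_embeddings = (image_size / patch_size) ** 2
    num_pos = 2304                     # default used in the diff
    patch_size = 16                    # default used in the diff
    image_size = int(math.sqrt(num_pos) * patch_size)
    print(image_size)                  # sqrt(2304) = 48, so 48 * 16 = 768

    # fused QKV split: rows are stacked [Q | K | V], so the leading dim must divide by 3
    c3 = 3 * 1152                      # hypothetical fused width, not taken from the PR
    assert c3 % 3 == 0
    c = c3 // 3
    q_rows, k_rows, v_rows = (0, c), (c, 2 * c), (2 * c, 3 * c)
    print(q_rows, k_rows, v_rows)      # (0, 1152) (1152, 2304) (2304, 3456)
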
@@ -3678,7 +3816,43 @@ def set_gguf_parameters(self):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         # process the experts separately
         name = name.replace("language_model.", "") # InternVL
-        if name.startswith("mlp") or name.startswith("vision_model") or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector"):
+
+        # handle aggregated expert tensors
+        # GGUF stores dimensions reversed from PyTorch, so:
+        # PyTorch (A,B,C) -> GGUF writes [C,B,A] -> GGML reads ne={C,B,A}
+        # Input shapes from HF: (n_expert, n_ff_exp, n_embd) or (n_expert, n_embd, n_ff_exp)
+        # Expected GGML ne: {n_embd, n_ff_exp, n_expert} for gate/up, {n_ff_exp, n_embd, n_expert} for down
+        if name.endswith("mlp.experts.down_proj") or name.endswith("mlp.experts.down_proj.weight"):
+            mapped = f"{name}.weight" if not name.endswith(".weight") else name
+            # Input: (n_expert=128, n_ff_exp=768, n_embd=2048)
+            # Want GGML ne: {n_ff_exp, n_embd, n_expert} = {768, 2048, 128}
+            # Need PyTorch: (128, 2048, 768) [reversed of GGML]
+            # So: permute(0, 2, 1): (128, 768, 2048) -> (128, 2048, 768)
+            permuted = data_torch.permute(0, 2, 1).contiguous()
+            return [(self.map_tensor_name(mapped), permuted)]
+
+        if name.endswith("mlp.experts.gate_up_proj") or name.endswith("mlp.experts.gate_up_proj.weight"):
+            if data_torch.ndim < 3 or data_torch.shape[-1] % 2 != 0:
+                raise ValueError(f"Unexpected gate_up_proj shape for {name}: {tuple(data_torch.shape)}")
+            split_dim = data_torch.shape[-1] // 2
+            gate = data_torch[..., :split_dim].contiguous()
+            up = data_torch[..., split_dim:].contiguous()
+            # Input gate/up: (n_expert=128, n_embd=2048, n_ff_exp=768)
+            # Want GGML ne: {n_embd, n_ff_exp, n_expert} = {2048, 768, 128}
+            # Need PyTorch: (128, 768, 2048) [reversed of GGML]
+            # So: permute(0, 2, 1): (128, 2048, 768) -> (128, 768, 2048)
+            base_name = name.removesuffix(".weight")
+            base = base_name.rsplit('.', 1)[0]
+            mapped_gate = f"{base}.gate_proj.weight"
+            mapped_up = f"{base}.up_proj.weight"
+            perm_gate = gate.permute(0, 2, 1).contiguous()
+            perm_up = up.permute(0, 2, 1).contiguous()
+            return [
+                (self.map_tensor_name(mapped_gate), perm_gate),
+                (self.map_tensor_name(mapped_up), perm_up),
+            ]
+
+        if name.startswith("mlp") or name.startswith("vision_model") or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector") or name.startswith("model.visual"):
             # skip visual tensors
             return []
         if name.find("experts") != -1:
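
The permute(0, 2, 1) calls in the expert handling above are easiest to verify on dummy tensors. A small sketch, assuming torch is available and using the example sizes from the comments in the diff (n_expert=128, n_ff_exp=768, n_embd=2048):

    import torch

    n_expert, n_ff_exp, n_embd = 128, 768, 2048

    # down_proj arrives as (n_expert, n_ff_exp, n_embd); GGUF reverses dims on write,
    # so a PyTorch shape of (128, 2048, 768) becomes GGML ne = {768, 2048, 128}
    down = torch.zeros(n_expert, n_ff_exp, n_embd)
    print(tuple(down.permute(0, 2, 1).shape))      # (128, 2048, 768)

    # gate_up_proj arrives as (n_expert, n_embd, 2 * n_ff_exp); split the last dim in half
    gate_up = torch.zeros(n_expert, n_embd, 2 * n_ff_exp)
    gate, up = gate_up.chunk(2, dim=-1)            # same halves as the slicing in the diff
    print(tuple(gate.permute(0, 2, 1).shape))      # (128, 768, 2048) -> GGML ne = {2048, 768, 128}
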
@@ -3826,6 +4000,28 @@ def set_vocab(self):
         super().set_vocab()


+@ModelBase.register("Qwen3VLMoeForConditionalGeneration")
+class Qwen3VLMoeTextModel(Qwen3MoeModel):
+    model_arch = gguf.MODEL_ARCH.QWEN3VLMOE
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        # Handle MRoPE (Multi-axis Rotary Position Embedding) for Qwen3-VL
+        text_config = self.hparams.get("text_config", {})
+        rope_scaling = text_config.get("rope_scaling") or {}
+
+        if rope_scaling.get("mrope_section"):
+            # mrope_section contains [time, height, width] dimensions
+            mrope_section = rope_scaling["mrope_section"]
+            # Pad to 4 dimensions [time, height, width, extra]
+            while len(mrope_section) < 4:
+                mrope_section.append(0)
+            self.gguf_writer.add_rope_dimension_sections(mrope_section[:4])
+
+            logger.info(f"MRoPE sections: {mrope_section[:4]}")
+
+
 @ModelBase.register("GPT2LMHeadModel")
 class GPT2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.GPT2
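
The MRoPE handling above only pads mrope_section out to four entries before writing it. A one-line check with a hypothetical [time, height, width] split (the real values come from the checkpoint's rope_scaling config):

    mrope_section = [24, 20, 20]        # hypothetical split, not taken from the PR
    while len(mrope_section) < 4:       # same padding loop as Qwen3VLMoeTextModel
        mrope_section.append(0)
    print(mrope_section[:4])            # [24, 20, 20, 0], passed to add_rope_dimension_sections
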

ggml/src/ggml-cpu/ops.cpp

Lines changed: 1 addition & 0 deletions
@@ -5509,6 +5509,7 @@ static void ggml_mrope_cache_init(
         }

         float theta = theta_t;
+
         if (sector >= sections[0] && sector < sec_w) {
             theta = theta_h;
         }

gguf-py/gguf/constants.py

Lines changed: 33 additions & 0 deletions
@@ -109,6 +109,7 @@ class LLM:
         EXPERTS_PER_GROUP = "{arch}.experts_per_group"
         MOE_EVERY_N_LAYERS = "{arch}.moe_every_n_layers"
         NEXTN_PREDICT_LAYERS = "{arch}.nextn_predict_layers"
+        DEEPSTACK_LAYERS = "{arch}.deepstack_layers"
         POOLING_TYPE = "{arch}.pooling_type"
         LOGIT_SCALE = "{arch}.logit_scale"
         DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id"
@@ -275,6 +276,7 @@ class ClipVision:
         USE_GELU = "clip.use_gelu"
         USE_SILU = "clip.use_silu"
         N_WA_PATTERN = "clip.vision.n_wa_pattern" # used by qwen2.5vl
+        DEEPSTACK_LAYERS = "clip.vision.deepstack_layers"

         class Attention:
             HEAD_COUNT = "clip.vision.attention.head_count"
@@ -348,6 +350,7 @@ class MODEL_ARCH(IntEnum):
     QWEN2VL = auto()
     QWEN3 = auto()
     QWEN3MOE = auto()
+    QWEN3VLMOE = auto()
     PHI2 = auto()
     PHI3 = auto()
     PHIMOE = auto()
@@ -427,6 +430,7 @@ class VISION_PROJECTOR_TYPE(IntEnum):
     GLM_EDGE = auto()
     MERGER = auto()
     GEMMA3 = auto()
+    QWEN3VL = auto()


 class MODEL_TENSOR(IntEnum):
@@ -637,6 +641,9 @@ class MODEL_TENSOR(IntEnum):
     V_RESMPL_QUERY = auto() # minicpmv
     V_TOK_EMBD_IMG_BREAK = auto() # pixtral
     V_MM_PATCH_MERGER = auto() # mistral small 3.1
+    V_DS_NORM = auto() # qwen3vl deepstack norm
+    V_DS_FC1 = auto() # qwen3vl deepstack fc1
+    V_DS_FC2 = auto() # qwen3vl deepstack fc2
     # audio (mtmd)
     A_ENC_EMBD_POS = auto()
     A_ENC_CONV1D = auto()
@@ -692,6 +699,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.QWEN2VL: "qwen2vl",
     MODEL_ARCH.QWEN3: "qwen3",
     MODEL_ARCH.QWEN3MOE: "qwen3moe",
+    MODEL_ARCH.QWEN3VLMOE: "qwen3vlmoe",
     MODEL_ARCH.PHI2: "phi2",
     MODEL_ARCH.PHI3: "phi3",
     MODEL_ARCH.PHIMOE: "phimoe",
@@ -772,6 +780,7 @@ class MODEL_TENSOR(IntEnum):
     VISION_PROJECTOR_TYPE.GLM_EDGE: "adapter",
     VISION_PROJECTOR_TYPE.MERGER: "qwen2vl_merger",
     VISION_PROJECTOR_TYPE.GEMMA3: "gemma3",
+    VISION_PROJECTOR_TYPE.QWEN3VL: "qwen3vl_merger",
 }

 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -982,6 +991,9 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.V_RESMPL_QUERY: "resampler.query",
     MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK: "v.token_embd.img_break", # pixtral
     MODEL_TENSOR.V_MM_PATCH_MERGER: "mm.patch_merger", # mistral small 3.1
+    MODEL_TENSOR.V_DS_NORM: "v.deepstack.{bid}.norm",
+    MODEL_TENSOR.V_DS_FC1: "v.deepstack.{bid}.fc1",
+    MODEL_TENSOR.V_DS_FC2: "v.deepstack.{bid}.fc2",
     # audio (mtmd)
     MODEL_TENSOR.A_ENC_EMBD_POS: "a.position_embd",
     MODEL_TENSOR.A_ENC_CONV1D: "a.conv1d.{bid}",
@@ -1050,6 +1062,9 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.V_RESMPL_QUERY,
     MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK,
     MODEL_TENSOR.V_MM_PATCH_MERGER,
+    MODEL_TENSOR.V_DS_NORM,
+    MODEL_TENSOR.V_DS_FC1,
+    MODEL_TENSOR.V_DS_FC2,
     # audio
     MODEL_TENSOR.A_ENC_EMBD_POS,
     MODEL_TENSOR.A_ENC_CONV1D,
@@ -1491,6 +1506,23 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN_EXP,
         MODEL_TENSOR.FFN_UP_EXP,
     ],
+    MODEL_ARCH.QWEN3VLMOE: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+    ],
     MODEL_ARCH.PLAMO: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
@@ -3022,6 +3054,7 @@ class VisionProjectorType:
     LLAMA4 = "llama4"
     QWEN2VL = "qwen2vl_merger"
     QWEN25VL = "qwen2.5vl_merger"
+    QWEN3VL = "qwen3vl_merger"
     ULTRAVOX = "ultravox"
     INTERNVL = "internvl"
     QWEN2A = "qwen2a" # audio
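
The new V_DS_* entries in TENSOR_NAMES are ordinary format templates keyed by the deepstack block id. A small sketch of the final GGUF tensor names they expand to (the .weight/.bias suffix is appended by the conversion code above):

    templates = {
        "V_DS_NORM": "v.deepstack.{bid}.norm",
        "V_DS_FC1": "v.deepstack.{bid}.fc1",
        "V_DS_FC2": "v.deepstack.{bid}.fc2",
    }
    for key, tmpl in templates.items():
        print(key, "->", tmpl.format(bid=0) + ".weight")   # e.g. v.deepstack.0.fc1.weight
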

gguf-py/gguf/gguf_writer.py

Lines changed: 6 additions & 0 deletions
@@ -776,6 +776,9 @@ def add_moe_every_n_layers(self, value: int) -> None:
     def add_nextn_predict_layers(self, count: int) -> None:
         self.add_uint32(Keys.LLM.NEXTN_PREDICT_LAYERS.format(arch=self.arch), count)

+    def add_deepstack_layers(self, layers: Sequence[int]) -> None:
+        self.add_array(Keys.LLM.DEEPSTACK_LAYERS.format(arch=self.arch), layers)
+
     def add_swin_norm(self, value: bool) -> None:
         self.add_bool(Keys.LLM.SWIN_NORM.format(arch=self.arch), value)

@@ -1065,6 +1068,9 @@ def add_vision_projector_scale_factor(self, value: int) -> None:
     def add_vision_n_wa_pattern(self, value: int) -> None:
         self.add_uint32(Keys.ClipVision.N_WA_PATTERN, value)

+    def add_vision_deepstack_layers(self, layers: Sequence[int]) -> None:
+        self.add_array(Keys.ClipVision.DEEPSTACK_LAYERS, layers)
+
     # audio models

     def add_audio_projection_dim(self, value: int) -> None:
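
A hedged usage sketch of the two new writer helpers: the output path, the arch string, and the layer indexes below are placeholders, and a real conversion would also write the remaining metadata and tensor data before finalizing the file:

    from gguf import GGUFWriter

    writer = GGUFWriter("mmproj-example.gguf", "clip")   # placeholder path and arch
    writer.add_vision_deepstack_layers([5, 11, 17])      # writes clip.vision.deepstack_layers (indexes made up)
    # for a text model, the per-arch key would be used instead:
    # writer.add_deepstack_layers([5, 11, 17])           # writes "{arch}.deepstack_layers"
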

gguf-py/gguf/tensor_mapping.py

Lines changed: 15 additions & 0 deletions
@@ -1179,6 +1179,7 @@ class TensorNameMap:
             "model.vision_model.embeddings.position_embedding", # SmolVLM
             "vision_model.positional_embedding_vlm", # llama 4
             "vision_tower.patch_embed.pos_emb", # kimi-vl
+            "visual.pos_embed", # qwen3vlmoe
         ),

         MODEL_TENSOR.V_ENC_ATTN_Q: (
@@ -1275,6 +1276,7 @@ class TensorNameMap:
             "vision_encoder.transformer.layers.{bid}.feed_forward.w3", # pixtral
             "vision_model.model.layers.{bid}.mlp.fc1", # llama4
             "visual.blocks.{bid}.mlp.fc1", # qwen2vl
+            "visual.blocks.{bid}.mlp.linear_fc1", # qwen3vlmoe
             "visual.blocks.{bid}.mlp.up_proj", # qwen2.5vl
             "vision_tower.encoder.blocks.{bid}.mlp.fc0", # kimi-vl (fc0/fc1)
         ),
@@ -1294,6 +1296,7 @@ class TensorNameMap:
             "vision_encoder.transformer.layers.{bid}.feed_forward.w2", # pixtral
             "vision_model.model.layers.{bid}.mlp.fc2", # llama4
             "visual.blocks.{bid}.mlp.fc2", # qwen2vl
+            "visual.blocks.{bid}.mlp.linear_fc2", # qwen3vlmoe
             "visual.blocks.{bid}.mlp.down_proj", # qwen2.5vl
             "vision_tower.encoder.blocks.{bid}.mlp.fc1", # kimi-vl (fc0/fc1)
         ),
@@ -1391,6 +1394,18 @@ class TensorNameMap:
             "patch_merger.merging_layer", # mistral
         ),

+        MODEL_TENSOR.V_DS_NORM: (
+            "model.visual.deepstack_merger_list.{bid}.norm",
+        ),
+
+        MODEL_TENSOR.V_DS_FC1: (
+            "model.visual.deepstack_merger_list.{bid}.linear_fc1",
+        ),
+
+        MODEL_TENSOR.V_DS_FC2: (
+            "model.visual.deepstack_merger_list.{bid}.linear_fc2",
+        ),
+
         # audio (mtmd)

         MODEL_TENSOR.A_ENC_EMBD_POS: (
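
Each tuple in TensorNameMap is a set of HF-side aliases that resolve to one GGUF tensor, so the qwen3vlmoe names added above simply ride alongside the existing qwen2vl entries. A simplified illustration with plain string handling (not the gguf-py lookup API):

    # aliases from the fc1/up_proj tuple in the diff above
    ffn_up_aliases = (
        "visual.blocks.{bid}.mlp.fc1",         # qwen2vl
        "visual.blocks.{bid}.mlp.linear_fc1",  # qwen3vlmoe (added by this PR)
        "visual.blocks.{bid}.mlp.up_proj",     # qwen2.5vl
    )
    name = "visual.blocks.7.mlp.linear_fc1"
    print(any(a.format(bid=7) == name for a in ffn_up_aliases))   # True
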
