Skip to content

Commit d261223

Browse files
JJJYmmm, Thireus, yairpatch, LETS-BEE, ngxson
authored
model: add support for qwen3vl series (#16780)
* support qwen3vl series. Co-authored-by: Thireus ☠ <[email protected]> Co-authored-by: yairpatch <[email protected]> Co-authored-by: LETS-BEE <[email protected]> * bugfix: fix the arch check for qwen3vl-moe. * use build_ffn * optimize deepstack structure * optimize deepstack feature saving * Revert "optimize deepstack feature saving" for temporal fix This reverts commit f321b9f. * code clean * use fused qkv in clip * clean up / rm is_deepstack_layers for simplification * add test model * move test model to "big" section * fix imrope check * remove trailing whitespace * fix rope fail * metal : add imrope support * add imrope support for sycl * vulkan: add imrope w/o check * fix vulkan * webgpu: add imrope w/o check * Update gguf-py/gguf/tensor_mapping.py Co-authored-by: Sigbjørn Skjæret <[email protected]> * fix tensor mapping --------- Co-authored-by: Thireus ☠ <[email protected]> Co-authored-by: yairpatch <[email protected]> Co-authored-by: LETS-BEE <[email protected]> Co-authored-by: Xuan Son Nguyen <[email protected]> Co-authored-by: Georgi Gerganov <[email protected]> Co-authored-by: Sigbjørn Skjæret <[email protected]>
1 parent dcca0d3 commit d261223

File tree

28 files changed

+1125
-97
lines changed

28 files changed

+1125
-97
lines changed

convert_hf_to_gguf.py

Lines changed: 218 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3852,7 +3852,43 @@ def set_gguf_parameters(self):
38523852
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
38533853
# process the experts separately
38543854
name = name.replace("language_model.", "") # InternVL
3855-
if name.startswith("mlp") or name.startswith("vision_model") or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector"):
3855+
3856+
# handle aggregated expert tensors
3857+
# GGUF stores dimensions reversed from PyTorch, so:
3858+
# PyTorch (A,B,C) -> GGUF writes [C,B,A] -> GGML reads ne={C,B,A}
3859+
# Input shapes from HF: (n_expert, n_ff_exp, n_embd) or (n_expert, n_embd, n_ff_exp)
3860+
# Expected GGML ne: {n_embd, n_ff_exp, n_expert} for gate/up, {n_ff_exp, n_embd, n_expert} for down
3861+
if name.endswith("mlp.experts.down_proj") or name.endswith("mlp.experts.down_proj.weight"):
3862+
mapped = f"{name}.weight" if not name.endswith(".weight") else name
3863+
# Input: (n_expert=128, n_ff_exp=768, n_embd=2048)
3864+
# Want GGML ne: {n_ff_exp, n_embd, n_expert} = {768, 2048, 128}
3865+
# Need PyTorch: (128, 2048, 768) [reversed of GGML]
3866+
# So: permute(0, 2, 1): (128, 768, 2048) -> (128, 2048, 768)
3867+
permuted = data_torch.permute(0, 2, 1).contiguous()
3868+
return [(self.map_tensor_name(mapped), permuted)]
3869+
3870+
if name.endswith("mlp.experts.gate_up_proj") or name.endswith("mlp.experts.gate_up_proj.weight"):
3871+
if data_torch.ndim < 3 or data_torch.shape[-1] % 2 != 0:
3872+
raise ValueError(f"Unexpected gate_up_proj shape for {name}: {tuple(data_torch.shape)}")
3873+
split_dim = data_torch.shape[-1] // 2
3874+
gate = data_torch[..., :split_dim].contiguous()
3875+
up = data_torch[..., split_dim:].contiguous()
3876+
# Input gate/up: (n_expert=128, n_embd=2048, n_ff_exp=768)
3877+
# Want GGML ne: {n_embd, n_ff_exp, n_expert} = {2048, 768, 128}
3878+
# Need PyTorch: (128, 768, 2048) [reversed of GGML]
3879+
# So: permute(0, 2, 1): (128, 2048, 768) -> (128, 768, 2048)
3880+
base_name = name.removesuffix(".weight")
3881+
base = base_name.rsplit('.', 1)[0]
3882+
mapped_gate = f"{base}.gate_proj.weight"
3883+
mapped_up = f"{base}.up_proj.weight"
3884+
perm_gate = gate.permute(0, 2, 1).contiguous()
3885+
perm_up = up.permute(0, 2, 1).contiguous()
3886+
return [
3887+
(self.map_tensor_name(mapped_gate), perm_gate),
3888+
(self.map_tensor_name(mapped_up), perm_up),
3889+
]
3890+
3891+
if name.startswith("mlp") or name.startswith("vision_model") or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector") or name.startswith("model.visual"):
38563892
# skip visual tensors
38573893
return []
38583894
if name.find("experts") != -1:
@@ -4004,6 +4040,187 @@ def set_vocab(self):
40044040
super().set_vocab()
40054041

40064042

4043+
@ModelBase.register("Qwen3VLForConditionalGeneration", "Qwen3VLMoeForConditionalGeneration")
class Qwen3VLVisionModel(MmprojModel):
    """Converts the vision-side tensors of Qwen3-VL / Qwen3-VL-MoE checkpoints.

    Text-model tensors are dropped here; visual tensors are renamed
    (deepstack mergers, final merger, patch embedding) or passed through
    the generic tensor-name mapping.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        assert self.hparams_vision is not None
        vision_cfg = self.hparams_vision

        if "image_size" not in vision_cfg:
            # Derive the image size from the position-embedding count:
            #   num_position_embeddings = (image_size / patch_size) ** 2
            #   => image_size = sqrt(num_position_embeddings) * patch_size
            n_positions = vision_cfg.get("num_position_embeddings", 2304)
            patch = vision_cfg.get("patch_size", 16)
            vision_cfg["image_size"] = int(n_positions**0.5 * patch)

        # Expose the Qwen3-VL config keys under the names used elsewhere
        for alias, original_key in (("num_attention_heads", "num_heads"), ("num_hidden_layers", "depth")):
            vision_cfg[alias] = vision_cfg.get(original_key)

        # One flag per vision layer; True where the layer index is listed
        # in deepstack_visual_indexes
        self.is_deepstack_layers = [False] * int(vision_cfg["num_hidden_layers"] or 0)
        for layer_idx in vision_cfg.get("deepstack_visual_indexes", []):
            self.is_deepstack_layers[layer_idx] = True

    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        writer = self.gguf_writer
        writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN3VL)
        writer.add_vision_use_gelu(True)

        merge_size = None if self.hparams_vision is None else self.hparams_vision.get("spatial_merge_size")
        if merge_size is not None:
            writer.add_vision_spatial_merge_size(int(merge_size))

        # The vision attention layernorm eps is taken from the text config's rms_norm_eps
        text_cfg = self.global_config.get("text_config", {})
        writer.add_vision_attention_layernorm_eps(text_cfg.get("rms_norm_eps", 1e-6))

        if self.is_deepstack_layers:
            writer.add_vision_is_deepstack_layers(self.is_deepstack_layers)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        assert self.hparams_vision is not None

        # Text model tensors belong to the text model file, not the mmproj
        if name.startswith(("model.language_model.", "lm_head.")):
            return []

        if name.startswith("model.visual."):
            # "model.visual.xxx" -> "visual.xxx"
            name = name.removeprefix("model.")

        if name.startswith("visual.deepstack_merger_list."):
            merger_pos, target = name.split(".", maxsplit=3)[2:]
            # merger_pos indexes the merger list; translate it to the absolute clip layer index
            layer_idx = self.hparams_vision.get("deepstack_visual_indexes", [])[int(merger_pos)]

            deepstack_types = {
                "norm": gguf.MODEL_TENSOR.V_DS_NORM,
                "linear_fc1": gguf.MODEL_TENSOR.V_DS_FC1,
                "linear_fc2": gguf.MODEL_TENSOR.V_DS_FC2,
            }
            module, dot, suffix = target.partition(".")
            if not dot or module not in deepstack_types:
                raise ValueError(f"Unexpected deepstack tensor: {name}")

            new_name = self.format_tensor_name(deepstack_types[module], layer_idx, suffix=f".{suffix}")
            return [(new_name, data_torch)]

        if name.startswith("visual.merger."):
            suffix = name.split(".", 2)[2]
            if suffix.startswith("linear_fc"):
                fc_idx_str, tail = suffix.split(".", 1)
                fc_num = int(fc_idx_str.replace("linear_fc", ""))
                # linear_fc1 / linear_fc2 map to projector indices 0 / 2 (same slots Qwen2VL uses)
                fc_idx = {1: 0, 2: 2}.get(fc_num)
                if fc_idx is None:
                    raise ValueError(f"unexpected fc index {fc_num} in {name}")
                new_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, fc_idx, suffix=f".{tail}")
            elif suffix.startswith("norm."):
                norm_tail = suffix.split(".", 1)[1]
                new_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_POST_NORM, suffix=f".{norm_tail}")
            else:
                raise ValueError(f"Unexpected merger tensor: {name}")
            return [(new_name, data_torch)]

        if name == "visual.patch_embed.proj.weight":
            # The Conv3D kernel is split into two Conv2D kernels along the temporal axis
            _, _, kt, _, _ = data_torch.shape
            if kt != 2:
                raise ValueError("Current implementation only supports temporal_patch_size of 2")
            patch_embd = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH]
            return [
                (patch_embd + tensor_suffix, data_torch[:, :, t, ...])
                for t, tensor_suffix in enumerate((".weight", ".weight.1"))
            ]

        if name == "visual.patch_embed.proj.bias":
            # The bias is kept - it's used by the C++ code
            patch_embd = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH]
            return [(patch_embd + ".bias", data_torch)]

        if name.startswith("visual."):
            return [(self.map_tensor_name(name), data_torch)]

        # Anything else is handled by the parent class
        return super().modify_tensors(data_torch, name, bid)
4154+
4155+
4156+
@ModelBase.register("Qwen3VLForConditionalGeneration")
class Qwen3VLTextModel(Qwen3Model):
    """Converts the text-model tensors of dense Qwen3-VL checkpoints.

    Adds the MRoPE section sizes and the number of deepstack layers to the
    GGUF metadata and drops vision tensors, which go in the mmproj file.
    """

    model_arch = gguf.MODEL_ARCH.QWEN3VL

    def set_gguf_parameters(self):
        super().set_gguf_parameters()

        # Handle MRoPE (multimodal rotary position embedding) for Qwen3-VL
        text_config = self.hparams.get("text_config", {})
        # rope_scaling is deprecated in V5, use rope_parameters instead
        rope_scaling = text_config.get("rope_scaling") or text_config.get("rope_parameters") or {}

        if rope_scaling.get("mrope_section"):
            # mrope_section contains [time, height, width] dimensions.
            # Work on a copy: padding the original list in place would
            # silently modify the loaded config held in self.hparams.
            mrope_section = list(rope_scaling["mrope_section"])
            # Pad/truncate to exactly 4 dimensions [time, height, width, extra]
            mrope_section = (mrope_section + [0] * 4)[:4]
            self.gguf_writer.add_rope_dimension_sections(mrope_section)

            logger.info(f"MRoPE sections: {mrope_section}")

        vision_config = self.hparams.get("vision_config", {})
        deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", []))
        self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # Skip vision tensors - they go in the mmproj file
        if name.startswith("model.visual."):
            return []

        return super().modify_tensors(data_torch, name, bid)
4188+
4189+
4190+
@ModelBase.register("Qwen3VLMoeForConditionalGeneration")
class Qwen3VLMoeTextModel(Qwen3MoeModel):
    """Converts the text-model tensors of Qwen3-VL-MoE checkpoints.

    Adds the MRoPE section sizes and the number of deepstack layers to the
    GGUF metadata and drops vision tensors, which go in the mmproj file.
    """

    model_arch = gguf.MODEL_ARCH.QWEN3VLMOE

    def set_gguf_parameters(self):
        super().set_gguf_parameters()

        # Handle MRoPE (multimodal rotary position embedding) for Qwen3-VL
        text_config = self.hparams.get("text_config", {})
        # rope_scaling is deprecated in V5, use rope_parameters instead
        rope_scaling = text_config.get("rope_scaling") or text_config.get("rope_parameters") or {}

        if rope_scaling.get("mrope_section"):
            # mrope_section contains [time, height, width] dimensions.
            # Work on a copy: padding the original list in place would
            # silently modify the loaded config held in self.hparams.
            mrope_section = list(rope_scaling["mrope_section"])
            # Pad/truncate to exactly 4 dimensions [time, height, width, extra]
            mrope_section = (mrope_section + [0] * 4)[:4]
            self.gguf_writer.add_rope_dimension_sections(mrope_section)

            logger.info(f"MRoPE sections: {mrope_section}")

        vision_config = self.hparams.get("vision_config", {})
        deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", []))
        self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # Skip vision tensors - they go in the mmproj file
        if name.startswith("model.visual."):
            return []

        return super().modify_tensors(data_torch, name, bid)
4222+
4223+
40074224
@ModelBase.register("GPT2LMHeadModel")
40084225
class GPT2Model(TextModel):
40094226
model_arch = gguf.MODEL_ARCH.GPT2

ggml/include/ggml.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,7 @@
242242
#define GGML_ROPE_TYPE_NEOX 2
243243
#define GGML_ROPE_TYPE_MROPE 8
244244
#define GGML_ROPE_TYPE_VISION 24
245+
#define GGML_ROPE_TYPE_IMROPE 40 // binary: 101000
245246

246247
#define GGML_MROPE_SECTIONS 4
247248

ggml/src/ggml-cpu/ops.cpp

Lines changed: 25 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5474,7 +5474,7 @@ static void ggml_rope_cache_init(
54745474
}
54755475

54765476
static void ggml_mrope_cache_init(
5477-
float theta_base_t, float theta_base_h, float theta_base_w, float theta_base_e, int sections[4], bool indep_sects,
5477+
float theta_base_t, float theta_base_h, float theta_base_w, float theta_base_e, int sections[4], bool is_imrope, bool indep_sects,
54785478
float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
54795479
float * cache, float sin_sign, float theta_scale) {
54805480
// ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
@@ -5509,14 +5509,26 @@ static void ggml_mrope_cache_init(
55095509
}
55105510

55115511
float theta = theta_t;
5512-
if (sector >= sections[0] && sector < sec_w) {
5513-
theta = theta_h;
5514-
}
5515-
else if (sector >= sec_w && sector < sec_w + sections[2]) {
5516-
theta = theta_w;
5517-
}
5518-
else if (sector >= sec_w + sections[2]) {
5519-
theta = theta_e;
5512+
if (is_imrope) { // qwen3vl apply interleaved mrope
5513+
if (sector % 3 == 1 && sector < 3 * sections[1]) {
5514+
theta = theta_h;
5515+
} else if (sector % 3 == 2 && sector < 3 * sections[2]) {
5516+
theta = theta_w;
5517+
} else if (sector % 3 == 0 && sector < 3 * sections[0]) {
5518+
theta = theta_t;
5519+
} else {
5520+
theta = theta_e;
5521+
}
5522+
} else {
5523+
if (sector >= sections[0] && sector < sec_w) {
5524+
theta = theta_h;
5525+
}
5526+
else if (sector >= sec_w && sector < sec_w + sections[2]) {
5527+
theta = theta_w;
5528+
}
5529+
else if (sector >= sec_w + sections[2]) {
5530+
theta = theta_e;
5531+
}
55205532
}
55215533

55225534
rope_yarn(
@@ -5589,6 +5601,7 @@ static void ggml_compute_forward_rope_f32(
55895601

55905602
const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
55915603
const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE; // ggml_rope_multi, multimodal rotary position embedding
5604+
const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE; // qwen3vl apply interleaved mrope
55925605
const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
55935606

55945607
if (is_mrope) {
@@ -5627,7 +5640,7 @@ static void ggml_compute_forward_rope_f32(
56275640
const int64_t p_w = pos[i2 + ne2 * 2];
56285641
const int64_t p_e = pos[i2 + ne2 * 3];
56295642
ggml_mrope_cache_init(
5630-
p_t, p_h, p_w, p_e, sections, is_vision,
5643+
p_t, p_h, p_w, p_e, sections, is_imrope, is_vision,
56315644
freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
56325645
}
56335646

@@ -5775,6 +5788,7 @@ static void ggml_compute_forward_rope_f16(
57755788

57765789
const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
57775790
const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
5791+
const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE;
57785792
const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
57795793

57805794
if (is_mrope) {
@@ -5813,7 +5827,7 @@ static void ggml_compute_forward_rope_f16(
58135827
const int64_t p_w = pos[i2 + ne2 * 2];
58145828
const int64_t p_e = pos[i2 + ne2 * 3];
58155829
ggml_mrope_cache_init(
5816-
p_t, p_h, p_w, p_e, sections, is_vision,
5830+
p_t, p_h, p_w, p_e, sections, is_imrope, is_vision,
58175831
freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
58185832
}
58195833

0 commit comments

Comments
 (0)