Commit b065bf3

Merge pull request #21 from yairpatch/master
Adding support for Qwen3-VL by @yairpatch
2 parents: 2dd5641 + a314328

16 files changed: +586 -56 lines changed


convert_hf_to_gguf.py

Lines changed: 197 additions & 1 deletion
@@ -3538,6 +3538,144 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return super().modify_tensors(data_torch, name, bid)


+@ModelBase.register("Qwen3VLMoeForConditionalGeneration")
+class Qwen3VLMoeVisionModel(MmprojModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        assert self.has_vision_encoder
+        assert self.hparams_vision is not None
+
+        # Compute image_size if not present
+        if "image_size" not in self.hparams_vision:
+            # For Qwen3VLMoe, compute from num_position_embeddings
+            num_pos = self.hparams_vision.get("num_position_embeddings", 2304)
+            patch_size = self.hparams_vision.get("patch_size", 16)
+            # num_position_embeddings = (image_size / patch_size) ** 2
+            # So image_size = sqrt(num_position_embeddings) * patch_size
+            import math
+            image_size = int(math.sqrt(num_pos) * patch_size)
+            self.hparams_vision["image_size"] = image_size
+
+        # Rename config values for compatibility
+        self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_heads")
+        self.hparams_vision["num_hidden_layers"] = self.hparams_vision.get("depth")
+
+        self.deepstack_layers: list[int] = list(self.hparams_vision.get("deepstack_visual_indexes", []))
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN3VL)
+
+        if self.hparams_vision is not None:
+            merge_size = self.hparams_vision.get("spatial_merge_size")
+            if merge_size is not None:
+                self.gguf_writer.add_vision_spatial_merge_size(int(merge_size))
+
+            hidden_act = (self.hparams_vision.get("hidden_act") or "").lower()
+            if hidden_act:
+                if "gelu" in hidden_act:
+                    self.gguf_writer.add_vision_use_gelu(True)
+                elif hidden_act == "silu":
+                    self.gguf_writer.add_vision_use_silu(True)
+                else:
+                    raise ValueError(f"Unsupported hidden_act: {hidden_act}")
+
+        # Use text config's rms_norm_eps for vision attention layernorm eps (similar to qwen2vl)
+        rms_norm_eps = self.global_config.get("rms_norm_eps")
+        if rms_norm_eps is None:
+            # Try text_config
+            text_config = self.global_config.get("text_config", {})
+            rms_norm_eps = text_config.get("rms_norm_eps", 1e-6)
+        self.gguf_writer.add_vision_attention_layernorm_eps(rms_norm_eps)
+
+        if self.deepstack_layers:
+            self.gguf_writer.add_vision_deepstack_layers(self.deepstack_layers)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        if name.startswith("model.visual."):
+            name = name.replace("model.visual.", "visual.", 1)
+
+        if name.startswith("visual.deepstack_merger_list."):
+            prefix, rest = name.split(".", maxsplit=3)[2:]
+            idx = int(prefix)
+            target = rest
+
+            tensor_type: gguf.MODEL_TENSOR
+            if target.startswith("norm."):
+                tensor_type = gguf.MODEL_TENSOR.V_DS_NORM
+                suffix = target.split(".", 1)[1]
+            elif target.startswith("linear_fc1."):
+                tensor_type = gguf.MODEL_TENSOR.V_DS_FC1
+                suffix = target.split(".", 1)[1]
+            elif target.startswith("linear_fc2."):
+                tensor_type = gguf.MODEL_TENSOR.V_DS_FC2
+                suffix = target.split(".", 1)[1]
+            else:
+                raise ValueError(f"Unexpected deepstack tensor: {name}")
+
+            new_name = self.format_tensor_name(tensor_type, idx, suffix=f".{suffix}")
+            return [(new_name, data_torch)]
+
+        if name.startswith("visual.merger."):
+            suffix = name.split(".", 2)[2]
+            if suffix.startswith("linear_fc"):
+                fc_idx_str, tail = suffix.split(".", 1)
+                fc_num = int(fc_idx_str.replace("linear_fc", ""))
+                # Qwen3VLMoe has linear_fc1 and linear_fc2
+                # Map to indices 0 and 2 (matching Qwen2VL which uses indices 0 and 2)
+                if fc_num == 1:
+                    fc_idx = 0
+                elif fc_num == 2:
+                    fc_idx = 2
+                else:
+                    raise ValueError(f"unexpected fc index {fc_num} in {name}")
+                new_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, fc_idx, suffix=f".{tail}")
+            elif suffix.startswith("norm."):
+                new_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_POST_NORM, suffix=f".{suffix.split('.', 1)[1]}")
+            else:
+                raise ValueError(f"Unexpected merger tensor: {name}")
+            return [(new_name, data_torch)]
+
+        if name == "visual.patch_embed.proj.weight":
+            # split Conv3D into Conv2Ds along temporal dimension
+            c1, c2, kt, _, _ = data_torch.shape
+            del c1, c2
+            if kt != 2:
+                raise ValueError("Current implementation only supports temporal_patch_size of 2")
+            return [
+                (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight", data_torch[:, :, 0, ...]),
+                (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight.1", data_torch[:, :, 1, ...]),
+            ]
+
+        if name == "visual.patch_embed.proj.bias":
+            # Skip bias for Qwen3VL - the C++ code expects it to be null
+            return []
+
+        if name.startswith("visual."):
+            if ".qkv." in name:
+                if data_torch.ndim == 2:
+                    c3, _ = data_torch.shape
+                else:
+                    c3 = data_torch.shape[0]
+                if c3 % 3 != 0:
+                    raise ValueError(f"Unexpected QKV shape for {name}: {data_torch.shape}")
+                c = c3 // 3
+                wq = data_torch[:c]
+                wk = data_torch[c: c * 2]
+                wv = data_torch[c * 2:]
+                base = name.replace("qkv", "{placeholder}")
+                return [
+                    (self.map_tensor_name(base.format(placeholder="q")), wq),
+                    (self.map_tensor_name(base.format(placeholder="k")), wk),
+                    (self.map_tensor_name(base.format(placeholder="v")), wv),
+                ]
+
+            return [(self.map_tensor_name(name), data_torch)]
+
+        return []
+
 @ModelBase.register("InternVisionModel")
 class InternVisionModel(MmprojModel):
     def set_gguf_parameters(self):
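
A quick way to sanity-check the arithmetic in Qwen3VLMoeVisionModel above is to run the two conversions on toy values. This is a minimal sketch in plain Python; the hparam defaults (2304, 16) come from the diff, while the fused QKV width is a made-up illustration rather than a value from the PR:

    import math

    # image_size derivation: num_position_embeddings = (image_size / patch_size) ** 2
    num_pos = 2304                     # default used in the diff
    patch_size = 16                    # default used in the diff
    image_size = int(math.sqrt(num_pos) * patch_size)
    print(image_size)                  # sqrt(2304) = 48, so 48 * 16 = 768

    # fused QKV split: rows are stacked [Q | K | V], so the leading dim must divide by 3
    c3 = 3 * 1152                      # hypothetical fused width, not taken from the PR
    assert c3 % 3 == 0
    c = c3 // 3
    q_rows, k_rows, v_rows = (0, c), (c, 2 * c), (2 * c, 3 * c)
    print(q_rows, k_rows, v_rows)      # (0, 1152) (1152, 2304) (2304, 3456)
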
@@ -3678,7 +3816,43 @@ def set_gguf_parameters(self):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         # process the experts separately
         name = name.replace("language_model.", "") # InternVL
-        if name.startswith("mlp") or name.startswith("vision_model") or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector"):
+
+        # handle aggregated expert tensors
+        # GGUF stores dimensions reversed from PyTorch, so:
+        # PyTorch (A,B,C) -> GGUF writes [C,B,A] -> GGML reads ne={C,B,A}
+        # Input shapes from HF: (n_expert, n_ff_exp, n_embd) or (n_expert, n_embd, n_ff_exp)
+        # Expected GGML ne: {n_embd, n_ff_exp, n_expert} for gate/up, {n_ff_exp, n_embd, n_expert} for down
+        if name.endswith("mlp.experts.down_proj") or name.endswith("mlp.experts.down_proj.weight"):
+            mapped = f"{name}.weight" if not name.endswith(".weight") else name
+            # Input: (n_expert=128, n_ff_exp=768, n_embd=2048)
+            # Want GGML ne: {n_ff_exp, n_embd, n_expert} = {768, 2048, 128}
+            # Need PyTorch: (128, 2048, 768) [reversed of GGML]
+            # So: permute(0, 2, 1): (128, 768, 2048) -> (128, 2048, 768)
+            permuted = data_torch.permute(0, 2, 1).contiguous()
+            return [(self.map_tensor_name(mapped), permuted)]
+
+        if name.endswith("mlp.experts.gate_up_proj") or name.endswith("mlp.experts.gate_up_proj.weight"):
+            if data_torch.ndim < 3 or data_torch.shape[-1] % 2 != 0:
+                raise ValueError(f"Unexpected gate_up_proj shape for {name}: {tuple(data_torch.shape)}")
+            split_dim = data_torch.shape[-1] // 2
+            gate = data_torch[..., :split_dim].contiguous()
+            up = data_torch[..., split_dim:].contiguous()
+            # Input gate/up: (n_expert=128, n_embd=2048, n_ff_exp=768)
+            # Want GGML ne: {n_embd, n_ff_exp, n_expert} = {2048, 768, 128}
+            # Need PyTorch: (128, 768, 2048) [reversed of GGML]
+            # So: permute(0, 2, 1): (128, 2048, 768) -> (128, 768, 2048)
+            base_name = name.removesuffix(".weight")
+            base = base_name.rsplit('.', 1)[0]
+            mapped_gate = f"{base}.gate_proj.weight"
+            mapped_up = f"{base}.up_proj.weight"
+            perm_gate = gate.permute(0, 2, 1).contiguous()
+            perm_up = up.permute(0, 2, 1).contiguous()
+            return [
+                (self.map_tensor_name(mapped_gate), perm_gate),
+                (self.map_tensor_name(mapped_up), perm_up),
+            ]
+
+        if name.startswith("mlp") or name.startswith("vision_model") or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector") or name.startswith("model.visual"):
             # skip visual tensors
             return []
         if name.find("experts") != -1:
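
The permute(0, 2, 1) calls in the expert handling above are easiest to verify on dummy tensors. A small sketch, assuming torch is available and using the example sizes from the comments in the diff (n_expert=128, n_ff_exp=768, n_embd=2048):

    import torch

    n_expert, n_ff_exp, n_embd = 128, 768, 2048

    # down_proj arrives as (n_expert, n_ff_exp, n_embd); GGUF reverses dims on write,
    # so a PyTorch shape of (128, 2048, 768) becomes GGML ne = {768, 2048, 128}
    down = torch.zeros(n_expert, n_ff_exp, n_embd)
    print(tuple(down.permute(0, 2, 1).shape))      # (128, 2048, 768)

    # gate_up_proj arrives as (n_expert, n_embd, 2 * n_ff_exp); split the last dim in half
    gate_up = torch.zeros(n_expert, n_embd, 2 * n_ff_exp)
    gate, up = gate_up.chunk(2, dim=-1)            # same halves as the slicing in the diff
    print(tuple(gate.permute(0, 2, 1).shape))      # (128, 768, 2048) -> GGML ne = {2048, 768, 128}
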
@@ -3826,6 +4000,28 @@ def set_vocab(self):
         super().set_vocab()


+@ModelBase.register("Qwen3VLMoeForConditionalGeneration")
+class Qwen3VLMoeTextModel(Qwen3MoeModel):
+    model_arch = gguf.MODEL_ARCH.QWEN3VLMOE
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        # Handle MRoPE (Multi-axis Rotary Position Embedding) for Qwen3-VL
+        text_config = self.hparams.get("text_config", {})
+        rope_scaling = text_config.get("rope_scaling") or {}
+
+        if rope_scaling.get("mrope_section"):
+            # mrope_section contains [time, height, width] dimensions
+            mrope_section = rope_scaling["mrope_section"]
+            # Pad to 4 dimensions [time, height, width, extra]
+            while len(mrope_section) < 4:
+                mrope_section.append(0)
+            self.gguf_writer.add_rope_dimension_sections(mrope_section[:4])
+
+            logger.info(f"MRoPE sections: {mrope_section[:4]}")
+
+
 @ModelBase.register("GPT2LMHeadModel")
 class GPT2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.GPT2
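
The MRoPE handling above only pads mrope_section out to four entries before writing it. A one-line check with a hypothetical [time, height, width] split (the real values come from the checkpoint's rope_scaling config):

    mrope_section = [24, 20, 20]        # hypothetical split, not taken from the PR
    while len(mrope_section) < 4:       # same padding loop as Qwen3VLMoeTextModel
        mrope_section.append(0)
    print(mrope_section[:4])            # [24, 20, 20, 0], passed to add_rope_dimension_sections
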

ggml/src/ggml-cpu/ops.cpp

Lines changed: 1 addition & 0 deletions
@@ -5509,6 +5509,7 @@ static void ggml_mrope_cache_init(
         }

         float theta = theta_t;
+
         if (sector >= sections[0] && sector < sec_w) {
             theta = theta_h;
         }

gguf-py/gguf/constants.py

Lines changed: 33 additions & 0 deletions
@@ -109,6 +109,7 @@ class LLM:
         EXPERTS_PER_GROUP = "{arch}.experts_per_group"
         MOE_EVERY_N_LAYERS = "{arch}.moe_every_n_layers"
         NEXTN_PREDICT_LAYERS = "{arch}.nextn_predict_layers"
+        DEEPSTACK_LAYERS = "{arch}.deepstack_layers"
         POOLING_TYPE = "{arch}.pooling_type"
         LOGIT_SCALE = "{arch}.logit_scale"
         DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id"
@@ -275,6 +276,7 @@ class ClipVision:
         USE_GELU = "clip.use_gelu"
         USE_SILU = "clip.use_silu"
         N_WA_PATTERN = "clip.vision.n_wa_pattern" # used by qwen2.5vl
+        DEEPSTACK_LAYERS = "clip.vision.deepstack_layers"

         class Attention:
             HEAD_COUNT = "clip.vision.attention.head_count"
@@ -348,6 +350,7 @@ class MODEL_ARCH(IntEnum):
     QWEN2VL = auto()
     QWEN3 = auto()
     QWEN3MOE = auto()
+    QWEN3VLMOE = auto()
     PHI2 = auto()
     PHI3 = auto()
     PHIMOE = auto()
@@ -427,6 +430,7 @@ class VISION_PROJECTOR_TYPE(IntEnum):
     GLM_EDGE = auto()
     MERGER = auto()
     GEMMA3 = auto()
+    QWEN3VL = auto()


 class MODEL_TENSOR(IntEnum):
@@ -637,6 +641,9 @@ class MODEL_TENSOR(IntEnum):
     V_RESMPL_QUERY = auto() # minicpmv
     V_TOK_EMBD_IMG_BREAK = auto() # pixtral
     V_MM_PATCH_MERGER = auto() # mistral small 3.1
+    V_DS_NORM = auto() # qwen3vl deepstack norm
+    V_DS_FC1 = auto() # qwen3vl deepstack fc1
+    V_DS_FC2 = auto() # qwen3vl deepstack fc2
     # audio (mtmd)
     A_ENC_EMBD_POS = auto()
     A_ENC_CONV1D = auto()
@@ -692,6 +699,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.QWEN2VL: "qwen2vl",
     MODEL_ARCH.QWEN3: "qwen3",
     MODEL_ARCH.QWEN3MOE: "qwen3moe",
+    MODEL_ARCH.QWEN3VLMOE: "qwen3vlmoe",
     MODEL_ARCH.PHI2: "phi2",
     MODEL_ARCH.PHI3: "phi3",
     MODEL_ARCH.PHIMOE: "phimoe",
@@ -772,6 +780,7 @@ class MODEL_TENSOR(IntEnum):
     VISION_PROJECTOR_TYPE.GLM_EDGE: "adapter",
     VISION_PROJECTOR_TYPE.MERGER: "qwen2vl_merger",
     VISION_PROJECTOR_TYPE.GEMMA3: "gemma3",
+    VISION_PROJECTOR_TYPE.QWEN3VL: "qwen3vl_merger",
 }

 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -982,6 +991,9 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.V_RESMPL_QUERY: "resampler.query",
     MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK: "v.token_embd.img_break", # pixtral
     MODEL_TENSOR.V_MM_PATCH_MERGER: "mm.patch_merger", # mistral small 3.1
+    MODEL_TENSOR.V_DS_NORM: "v.deepstack.{bid}.norm",
+    MODEL_TENSOR.V_DS_FC1: "v.deepstack.{bid}.fc1",
+    MODEL_TENSOR.V_DS_FC2: "v.deepstack.{bid}.fc2",
     # audio (mtmd)
     MODEL_TENSOR.A_ENC_EMBD_POS: "a.position_embd",
     MODEL_TENSOR.A_ENC_CONV1D: "a.conv1d.{bid}",
@@ -1050,6 +1062,9 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.V_RESMPL_QUERY,
     MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK,
     MODEL_TENSOR.V_MM_PATCH_MERGER,
+    MODEL_TENSOR.V_DS_NORM,
+    MODEL_TENSOR.V_DS_FC1,
+    MODEL_TENSOR.V_DS_FC2,
     # audio
     MODEL_TENSOR.A_ENC_EMBD_POS,
     MODEL_TENSOR.A_ENC_CONV1D,
@@ -1491,6 +1506,23 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN_EXP,
         MODEL_TENSOR.FFN_UP_EXP,
     ],
+    MODEL_ARCH.QWEN3VLMOE: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+    ],
     MODEL_ARCH.PLAMO: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
@@ -3022,6 +3054,7 @@ class VisionProjectorType:
     LLAMA4 = "llama4"
     QWEN2VL = "qwen2vl_merger"
     QWEN25VL = "qwen2.5vl_merger"
+    QWEN3VL = "qwen3vl_merger"
     ULTRAVOX = "ultravox"
     INTERNVL = "internvl"
     QWEN2A = "qwen2a" # audio
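
The new V_DS_* entries in TENSOR_NAMES are ordinary format templates keyed by the deepstack block id. A small sketch of the final GGUF tensor names they expand to (the .weight/.bias suffix is appended by the conversion code above):

    templates = {
        "V_DS_NORM": "v.deepstack.{bid}.norm",
        "V_DS_FC1": "v.deepstack.{bid}.fc1",
        "V_DS_FC2": "v.deepstack.{bid}.fc2",
    }
    for key, tmpl in templates.items():
        print(key, "->", tmpl.format(bid=0) + ".weight")   # e.g. v.deepstack.0.fc1.weight
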

gguf-py/gguf/gguf_writer.py

Lines changed: 6 additions & 0 deletions
@@ -776,6 +776,9 @@ def add_moe_every_n_layers(self, value: int) -> None:
     def add_nextn_predict_layers(self, count: int) -> None:
         self.add_uint32(Keys.LLM.NEXTN_PREDICT_LAYERS.format(arch=self.arch), count)

+    def add_deepstack_layers(self, layers: Sequence[int]) -> None:
+        self.add_array(Keys.LLM.DEEPSTACK_LAYERS.format(arch=self.arch), layers)
+
     def add_swin_norm(self, value: bool) -> None:
         self.add_bool(Keys.LLM.SWIN_NORM.format(arch=self.arch), value)

@@ -1065,6 +1068,9 @@ def add_vision_projector_scale_factor(self, value: int) -> None:
     def add_vision_n_wa_pattern(self, value: int) -> None:
         self.add_uint32(Keys.ClipVision.N_WA_PATTERN, value)

+    def add_vision_deepstack_layers(self, layers: Sequence[int]) -> None:
+        self.add_array(Keys.ClipVision.DEEPSTACK_LAYERS, layers)
+
     # audio models

     def add_audio_projection_dim(self, value: int) -> None:
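
A hedged usage sketch of the two new writer helpers: the output path, the arch string, and the layer indexes below are placeholders, and a real conversion would also write the remaining metadata and tensor data before finalizing the file:

    from gguf import GGUFWriter

    writer = GGUFWriter("mmproj-example.gguf", "clip")   # placeholder path and arch
    writer.add_vision_deepstack_layers([5, 11, 17])      # writes clip.vision.deepstack_layers (indexes made up)
    # for a text model, the per-arch key would be used instead:
    # writer.add_deepstack_layers([5, 11, 17])           # writes "{arch}.deepstack_layers"
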

gguf-py/gguf/tensor_mapping.py

Lines changed: 15 additions & 0 deletions
@@ -1179,6 +1179,7 @@ class TensorNameMap:
             "model.vision_model.embeddings.position_embedding", # SmolVLM
             "vision_model.positional_embedding_vlm", # llama 4
             "vision_tower.patch_embed.pos_emb", # kimi-vl
+            "visual.pos_embed", # qwen3vlmoe
         ),

         MODEL_TENSOR.V_ENC_ATTN_Q: (
@@ -1275,6 +1276,7 @@ class TensorNameMap:
             "vision_encoder.transformer.layers.{bid}.feed_forward.w3", # pixtral
             "vision_model.model.layers.{bid}.mlp.fc1", # llama4
             "visual.blocks.{bid}.mlp.fc1", # qwen2vl
+            "visual.blocks.{bid}.mlp.linear_fc1", # qwen3vlmoe
             "visual.blocks.{bid}.mlp.up_proj", # qwen2.5vl
             "vision_tower.encoder.blocks.{bid}.mlp.fc0", # kimi-vl (fc0/fc1)
         ),
@@ -1294,6 +1296,7 @@ class TensorNameMap:
             "vision_encoder.transformer.layers.{bid}.feed_forward.w2", # pixtral
             "vision_model.model.layers.{bid}.mlp.fc2", # llama4
             "visual.blocks.{bid}.mlp.fc2", # qwen2vl
+            "visual.blocks.{bid}.mlp.linear_fc2", # qwen3vlmoe
             "visual.blocks.{bid}.mlp.down_proj", # qwen2.5vl
             "vision_tower.encoder.blocks.{bid}.mlp.fc1", # kimi-vl (fc0/fc1)
         ),
@@ -1391,6 +1394,18 @@ class TensorNameMap:
             "patch_merger.merging_layer", # mistral
         ),

+        MODEL_TENSOR.V_DS_NORM: (
+            "model.visual.deepstack_merger_list.{bid}.norm",
+        ),
+
+        MODEL_TENSOR.V_DS_FC1: (
+            "model.visual.deepstack_merger_list.{bid}.linear_fc1",
+        ),
+
+        MODEL_TENSOR.V_DS_FC2: (
+            "model.visual.deepstack_merger_list.{bid}.linear_fc2",
+        ),
+
         # audio (mtmd)

         MODEL_TENSOR.A_ENC_EMBD_POS: (
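
Each tuple in TensorNameMap is a set of HF-side aliases that resolve to one GGUF tensor, so the qwen3vlmoe names added above simply ride alongside the existing qwen2vl entries. A simplified illustration with plain string handling (not the gguf-py lookup API):

    # aliases from the fc1/up_proj tuple in the diff above
    ffn_up_aliases = (
        "visual.blocks.{bid}.mlp.fc1",         # qwen2vl
        "visual.blocks.{bid}.mlp.linear_fc1",  # qwen3vlmoe (added by this PR)
        "visual.blocks.{bid}.mlp.up_proj",     # qwen2.5vl
    )
    name = "visual.blocks.7.mlp.linear_fc1"
    print(any(a.format(bid=7) == name for a in ffn_up_aliases))   # True
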
