diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 51f0d1ab152..05d8e992846 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -4250,9 +4250,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         yield from super().modify_tensors(data_torch, name, bid)
 
 
-@ModelBase.register("Qwen2_5OmniModel")
-class Qwen25OmniModel(Qwen2VLVisionModel):
-    has_vision_encoder = True
+class Qwen25AudioModel(MmprojModel):
     has_audio_encoder = True
 
     def __init__(self, *args, **kwargs):
@@ -4268,12 +4266,6 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["num_mel_bins"])
         self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams_audio.get("layer_norm_eps", 1e-5))
 
-    def get_vision_config(self) -> dict[str, Any] | None:
-        return self.global_config["thinker_config"].get("vision_config")
-
-    def get_audio_config(self) -> dict[str, Any] | None:
-        return self.global_config["thinker_config"].get("audio_config")
-
     def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
         # SinusoidsPositionEmbedding
         assert self.hparams_audio is not None
@@ -4303,8 +4295,33 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             if "audio_bos_eos_token" in name:
                 # this tensor is left unused in transformers code
                 # https://github.com/huggingface/transformers/blob/6e3063422c4b1c014aa60c32b9254fd2902f0f28/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py#L1809
-                return
-            yield from super().modify_tensors(data_torch, name, bid)
+                return []
+            return [(self.map_tensor_name(name), data_torch)]
+
+        return [] # skip other tensors
+
+
+@ModelBase.register("Qwen2_5OmniModel")
+class Qwen25OmniModel(Qwen2VLVisionModel, Qwen25AudioModel):
+    has_audio_encoder = True
+    has_vision_encoder = True
+
+    def get_vision_config(self) -> dict[str, Any] | None:
+        return self.global_config["thinker_config"].get("vision_config")
+
+    def get_audio_config(self) -> dict[str, Any] | None:
+        return self.global_config["thinker_config"].get("audio_config")
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25O)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if "visual." in name:
+            yield from Qwen2VLVisionModel.modify_tensors(self, data_torch, name, bid)
+        elif "audio_tower." in name:
+            yield from Qwen25AudioModel.modify_tensors(self, data_torch, name, bid)
+        return [] # skip other tensors
 
 
 @ModelBase.register("InternVisionModel")
@@ -4808,7 +4825,10 @@ def set_gguf_parameters(self):
 class Qwen3VLVisionModel(MmprojModel):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        assert self.hparams_vision is not None
+        if self.hparams_vision is None:
+            logger.info("No vision config found, skipping vision tensor processing")
+            return
+
         # Compute image_size if not present
         if "image_size" not in self.hparams_vision:
             # For Qwen3VL/Qwen3VLMoe, compute from num_position_embeddings
@@ -4829,7 +4849,9 @@ def __init__(self, *args, **kwargs):
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
-        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN3VL)
+        # in case mixed modalities, the arch will be handled by subclass
+        if not self.has_audio_encoder:
+            self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN3VL)
         self.gguf_writer.add_vision_use_gelu(True)
 
         if self.hparams_vision is not None:
@@ -4917,11 +4939,64 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             return
 
         if name.startswith("visual."):
-            yield from super().modify_tensors(data_torch, name, bid)
-            return
+            yield (self.map_tensor_name(name), data_torch)
+        return [] # skip other tensors
-
-        # Fall back to parent class for other tensors
-        yield from super().modify_tensors(data_torch, name, bid)
+
+
+@ModelBase.register("Qwen3OmniMoeForConditionalGeneration")
+class Qwen3OmniMmprojModel(Qwen3VLVisionModel, Qwen25AudioModel):
+    has_audio_encoder = True
+    has_vision_encoder = True
+
+    def get_vision_config(self) -> dict[str, Any] | None:
+        if self.has_vision_encoder:
+            return self.global_config["thinker_config"].get("vision_config")
+        else:
+            return None
+
+    def get_audio_config(self) -> dict[str, Any] | None:
+        if self.has_audio_encoder:
+            return self.global_config["thinker_config"].get("audio_config")
+        else:
+            return None
+
+    def set_gguf_parameters(self):
+        if self.has_vision_encoder:
+            Qwen3VLVisionModel.set_gguf_parameters(self)
+            self.gguf_writer.add_clip_vision_projector_type(gguf.VisionProjectorType.QWEN3VL)
+        if self.has_audio_encoder:
+            Qwen25AudioModel.set_gguf_parameters(self)
+            self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.QWEN3A)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if "visual." in name:
+            if not self.has_vision_encoder:
+                raise ValueError(f"Model does not have vision encoder, but found tensor {name}")
+            # need to transform vision tensor naming, so that modify_tensors() logic can be used correctly
+            name = name.replace("thinker.visual.", "model.visual.")
+            if ".merger_list." in name:
+                name = name.replace(".merger_list.", ".deepstack_merger_list.")
+                name = name.replace(".ln_q", ".norm")
+                name = name.replace(".mlp.0", ".linear_fc1")
+                name = name.replace(".mlp.2", ".linear_fc2")
+            elif ".merger." in name:
+                name = name.replace(".ln_q", ".norm")
+                name = name.replace(".mlp.0", ".linear_fc1")
+                name = name.replace(".mlp.2", ".linear_fc2")
+            yield from Qwen3VLVisionModel.modify_tensors(self, data_torch, name, bid)
+        elif "audio_tower." in name:
+            if not self.has_audio_encoder:
+                raise ValueError(f"Model does not have audio encoder, but found tensor {name}")
+            if "conv2d" in name and name.endswith(".bias"):
+                # transform conv2d bias [n_embd] --> [1, 1, n_embd]
+                data_torch = data_torch.unsqueeze(-1).unsqueeze(-1)
+            yield from Qwen25AudioModel.modify_tensors(self, data_torch, name, bid)
+
+
+@ModelBase.register("Qwen3ASRForConditionalGeneration")
+class Qwen3ASRMmprojModel(Qwen3OmniMmprojModel):
+    has_audio_encoder = True
+    has_vision_encoder = False
 
 
 @ModelBase.register("Glm4vForConditionalGeneration", "Glm4vMoeForConditionalGeneration", "GlmOcrForConditionalGeneration")
@@ -4955,9 +5030,10 @@ class Qwen3VLTextModel(Qwen3Model):
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
-
-        # Handle MRoPE (Multi-axis Rotary Position Embedding) for Qwen3-VL
-        vision_config = self.hparams.get("vision_config", {})
+        if "thinker_config" in self.hparams:
+            vision_config = self.hparams["thinker_config"].get("vision_config", {})
+        else:
+            vision_config = self.hparams.get("vision_config", {})
         deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", []))
         self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num)
 
@@ -4969,20 +5045,66 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         yield from super().modify_tensors(data_torch, name, bid)
 
 
-@ModelBase.register("Qwen3VLMoeForConditionalGeneration")
+@ModelBase.register("Qwen3ASRForConditionalGeneration")
+class Qwen3ASRTextModel(Qwen3VLTextModel):
+    model_arch = gguf.MODEL_ARCH.QWEN3VL
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_num_deepstack_layers(0)
+
+    def set_vocab(self):
+        super().set_vocab()
+        # fix chat template, use correct chatml format
+        self.gguf_writer.add_chat_template("{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}")
+        # correct BOS/EOS tokens
+        with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
+            tokenizer_config = json.load(f)
+        added_tokens = tokenizer_config.get("added_tokens_decoder", {})
+        for token_id, data in added_tokens.items():
+            if data.get("content") == "<|im_end|>":
+                self.gguf_writer.add_bos_token_id(int(token_id))
+                self.gguf_writer.add_eos_token_id(int(token_id))
+
+    def modify_tensors(self, data_torch, name, bid):
+        # qwen3-omni
+        name = name.replace("thinker.", "")
+
+        # Skip vision and audio tensors - they go in the mmproj file
+        if "visual." in name or "audio_tower." in name \
+            or "talker." in name or "code2wav." in name:
+            return
+
+        yield from super().modify_tensors(data_torch, name, bid)
+
+
+@ModelBase.register("Qwen3VLMoeForConditionalGeneration", "Qwen3OmniMoeForConditionalGeneration")
 class Qwen3VLMoeTextModel(Qwen3MoeModel):
     model_arch = gguf.MODEL_ARCH.QWEN3VLMOE
 
+    def set_vocab(self):
+        super().set_vocab()
+        # correct BOS/EOS tokens
+        with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
+            tokenizer_config = json.load(f)
+        added_tokens = tokenizer_config.get("added_tokens_decoder", {})
+        for token_id, data in added_tokens.items():
+            if data.get("content") == "<|im_end|>":
+                self.gguf_writer.add_bos_token_id(int(token_id))
+                self.gguf_writer.add_eos_token_id(int(token_id))
+
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
-        vision_config = self.hparams.get("vision_config", {})
-        deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", []))
-        self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num)
+        self.gguf_writer.add_num_deepstack_layers(0)
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         # Skip vision tensors - they go in the mmproj file
-        if name.startswith("model.visual."):
-            return
+        if "visual." in name or "audio_tower." in name \
+            or "talker." in name or "code2wav." in name:
+            return []
+
+        # qwen3-omni
+        name = name.replace("thinker.", "")
 
         # Qwen3VL has transposed packed tensors, so we treat it differently from general Qwen2MoE packed tensors
         if name.endswith("mlp.experts.down_proj") or name.endswith("mlp.experts.down_proj.weight"):
@@ -5016,6 +5138,26 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         yield from super().modify_tensors(data_torch, name, bid)
 
 
+@ModelBase.register("Qwen3OmniMoeForConditionalGeneration")
+class Qwen3OmniMoeTextModel(Qwen3VLMoeTextModel):
+    model_arch = gguf.MODEL_ARCH.QWEN3VLMOE
+
+    def set_vocab(self):
+        super().set_vocab()
+        # correct BOS/EOS tokens
+        with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
+            tokenizer_config = json.load(f)
+        added_tokens = tokenizer_config.get("added_tokens_decoder", {})
+        for token_id, data in added_tokens.items():
+            if data.get("content") == "<|im_end|>":
+                self.gguf_writer.add_bos_token_id(int(token_id))
+                self.gguf_writer.add_eos_token_id(int(token_id))
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_num_deepstack_layers(0)
+
+
 class _LinearAttentionVReorderBase(Qwen3NextModel):
     model_arch = gguf.MODEL_ARCH.QWEN3NEXT # overridden by subclasses
     """reorders V heads from grouped to tiled order for ggml broadcast
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index b35c976e8f2..47b80b3625c 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -783,6 +783,8 @@ class MODEL_TENSOR(IntEnum):
     A_ENC_EMBD_TO_LOGITS = auto() # lfm2
     A_ENC_CONV1D = auto()
     A_ENC_CONV1D_NORM = auto() # gemma3n
+    A_ENC_CONV2D = auto()
+    A_ENC_CONV_OUT = auto()
     A_PRE_NORM = auto()
     A_POST_NORM = auto()
     A_ENC_LAYER_PRE_NORM = auto() # gemma3n
@@ -1244,6 +1246,8 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.A_ENC_EMBD_NORM: "a.position_embd_norm",
     MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS: "a.embd_to_logits",
     MODEL_TENSOR.A_ENC_CONV1D: "a.conv1d.{bid}",
+    MODEL_TENSOR.A_ENC_CONV2D: "a.conv2d.{bid}",
+    MODEL_TENSOR.A_ENC_CONV_OUT: "a.conv_out",
     MODEL_TENSOR.A_ENC_CONV1D_NORM: "a.conv1d.{bid}.norm",
     MODEL_TENSOR.A_PRE_NORM: "a.pre_ln",
     MODEL_TENSOR.A_POST_NORM: "a.post_ln",
@@ -1376,6 +1380,8 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.A_ENC_EMBD_NORM,
         MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS,
         MODEL_TENSOR.A_ENC_CONV1D,
+        MODEL_TENSOR.A_ENC_CONV2D,
+        MODEL_TENSOR.A_ENC_CONV_OUT,
         MODEL_TENSOR.A_ENC_CONV1D_NORM,
         MODEL_TENSOR.A_PRE_NORM,
         MODEL_TENSOR.A_POST_NORM,
@@ -4020,6 +4026,7 @@ class VisionProjectorType:
     ULTRAVOX = "ultravox"
     INTERNVL = "internvl"
     QWEN2A = "qwen2a" # audio
+    QWEN3A = "qwen3a" # audio
     GLMA = "glma" # audio
     QWEN25O = "qwen2.5o" # omni
     VOXTRAL = "voxtral"
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index df70577dbc2..0b0f427f8dd 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -1788,6 +1788,14 @@ class TensorNameMap:
             "model.audio_tower.subsample_conv_projection.conv_{bid}.norm", # gemma3n
         ),
 
+        MODEL_TENSOR.A_ENC_CONV2D: (
+            "audio_tower.conv2d{bid}", # qwen3omni
+        ),
+
+        MODEL_TENSOR.A_ENC_CONV_OUT: (
+            "audio_tower.conv_out", # qwen3omni
+        ),
+
         MODEL_TENSOR.A_PRE_NORM: (),
 
         MODEL_TENSOR.A_POST_NORM: (
@@ -1912,7 +1920,8 @@ class TensorNameMap:
 
         MODEL_TENSOR.A_MMPROJ: (
             "audio.multi_modal_projector.linear_{bid}", # ultravox
-            "audio_adapter.model.{bid}" # lfm2
+            "audio_adapter.model.{bid}", # lfm2
+            "audio_tower.proj{bid}", # qwen3omni
         ),
 
         MODEL_TENSOR.A_MMPROJ_FC: (
diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt
index b3cf15f9ecc..e1ef12403cf 100644
--- a/tools/mtmd/CMakeLists.txt
+++ b/tools/mtmd/CMakeLists.txt
@@ -29,6 +29,7 @@ add_library(mtmd
     models/pixtral.cpp
     models/qwen2vl.cpp
     models/qwen3vl.cpp
+    models/qwen3a.cpp
     models/siglip.cpp
     models/whisper-enc.cpp
    models/deepseekocr.cpp
diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h
index 011d76bcf68..71465d0a16d 100644
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@@ -132,6 +132,8 @@
 
 // ultravox
 #define TN_CONV1D "a.conv1d.%d.%s"
+#define TN_CONV2D "a.conv2d.%d.%s"
+#define TN_CONV_OUT "a.conv_out.%s"
 #define TN_MM_AUDIO_MLP "mm.a.mlp.%d.%s"
 #define TN_MM_AUDIO_FC "mm.a.fc.%s" // fully connected layer
 #define TN_MM_NORM_PRE "mm.a.norm_pre.%s"
@@ -241,6 +243,7 @@ enum projector_type {
     PROJECTOR_TYPE_INTERNVL,
     PROJECTOR_TYPE_LLAMA4,
     PROJECTOR_TYPE_QWEN2A,
+    PROJECTOR_TYPE_QWEN3A,
     PROJECTOR_TYPE_GLMA,
     PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx
     PROJECTOR_TYPE_VOXTRAL,
@@ -279,6 +282,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_INTERNVL, "internvl"},
     { PROJECTOR_TYPE_LLAMA4, "llama4"},
     { PROJECTOR_TYPE_QWEN2A, "qwen2a"},
+    { PROJECTOR_TYPE_QWEN3A, "qwen3a"},
     { PROJECTOR_TYPE_GLMA, "glma"},
     { PROJECTOR_TYPE_QWEN25O, "qwen2.5o"},
     { PROJECTOR_TYPE_VOXTRAL, "voxtral"},
diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h
index a73e9ba38b2..295f672eecd 100644
--- a/tools/mtmd/clip-model.h
+++ b/tools/mtmd/clip-model.h
@@ -401,10 +401,20 @@ struct clip_model {
     ggml_tensor * conv1d_1_b = nullptr;
     ggml_tensor * conv1d_2_w = nullptr;
     ggml_tensor * conv1d_2_b = nullptr;
+    ggml_tensor * conv_out_w = nullptr;
+    ggml_tensor * conv_out_b = nullptr;
     ggml_tensor * mm_norm_pre_w = nullptr;
     ggml_tensor * mm_norm_pre_b = nullptr;
     ggml_tensor * mm_norm_mid_w = nullptr;
 
+    // qwen3a
+    ggml_tensor * conv2d_1_w = nullptr;
+    ggml_tensor * conv2d_1_b = nullptr;
+    ggml_tensor * conv2d_2_w = nullptr;
+    ggml_tensor * conv2d_2_b = nullptr;
+    ggml_tensor * conv2d_3_w = nullptr;
+    ggml_tensor * conv2d_3_b = nullptr;
+
     // cogvlm
     ggml_tensor * mm_post_fc_norm_w = nullptr;
     ggml_tensor * mm_post_fc_norm_b = nullptr;
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 2947fcf9a3d..12a298e9a8b 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -882,6 +882,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             {
                 builder = std::make_unique(ctx, img);
             } break;
+        case PROJECTOR_TYPE_QWEN3A:
+            {
+                builder = std::make_unique<clip_graph_qwen3a>(ctx, img);
+            } break;
         case PROJECTOR_TYPE_YOUTUVL:
             {
                 builder = std::make_unique(ctx, img);
@@ -1315,6 +1319,7 @@ struct clip_model_loader {
                 } break;
             case PROJECTOR_TYPE_ULTRAVOX:
             case PROJECTOR_TYPE_QWEN2A:
+            case PROJECTOR_TYPE_QWEN3A:
             case PROJECTOR_TYPE_GLMA:
             case PROJECTOR_TYPE_VOXTRAL:
             case PROJECTOR_TYPE_MUSIC_FLAMINGO:
@@ -1871,6 +1876,20 @@ struct clip_model_loader {
                     model.mm_fc_w = get_tensor(string_format(TN_MM_AUDIO_FC, "weight"));
                     model.mm_fc_b = get_tensor(string_format(TN_MM_AUDIO_FC, "bias"));
                 } break;
+            case PROJECTOR_TYPE_QWEN3A:
+                {
+                    model.conv2d_1_w = get_tensor(string_format(TN_CONV2D, 1, "weight"));
+                    model.conv2d_1_b = get_tensor(string_format(TN_CONV2D, 1, "bias"));
+                    model.conv2d_2_w = get_tensor(string_format(TN_CONV2D, 2, "weight"));
+                    model.conv2d_2_b = get_tensor(string_format(TN_CONV2D, 2, "bias"));
+                    model.conv2d_3_w = get_tensor(string_format(TN_CONV2D, 3, "weight"));
+                    model.conv2d_3_b = get_tensor(string_format(TN_CONV2D, 3, "bias"));
+                    model.conv_out_w = get_tensor(string_format(TN_CONV_OUT, "weight")); // no bias
+                    model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
+                    model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias"));
+                    model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
+                    model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias"));
+                } break;
             case PROJECTOR_TYPE_VOXTRAL:
                 {
                     model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
@@ -2648,6 +2667,15 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
                     n_patches /= 2;
                 }
             } break;
+        case PROJECTOR_TYPE_QWEN3A:
+            {
+                // 3x stride-2 conv2d: each step is floor((n-1)/2)+1
+                int n = img->nx;
+                n = (n - 1) / 2 + 1;
+                n = (n - 1) / 2 + 1;
+                n = (n - 1) / 2 + 1;
+                n_patches = n;
+            } break;
         case PROJECTOR_TYPE_GLMA:
             {
                 n_patches = img->nx;
@@ -3061,6 +3089,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         case PROJECTOR_TYPE_INTERNVL:
         case PROJECTOR_TYPE_NEMOTRON_V2_VL:
         case PROJECTOR_TYPE_QWEN2A:
+        case PROJECTOR_TYPE_QWEN3A:
         case PROJECTOR_TYPE_GLMA:
         case PROJECTOR_TYPE_ULTRAVOX:
         case PROJECTOR_TYPE_LFM2:
@@ -3231,8 +3260,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
             return ctx->model.mm_model_proj->ne[1];
         case PROJECTOR_TYPE_QWEN2A:
             return ctx->model.mm_fc_w->ne[1];
-        case PROJECTOR_TYPE_GLMA:
+        case PROJECTOR_TYPE_QWEN3A:
             return ctx->model.mm_2_w->ne[1];
+        case PROJECTOR_TYPE_GLMA:
         case PROJECTOR_TYPE_LFM2:
         case PROJECTOR_TYPE_KIMIVL:
         case PROJECTOR_TYPE_PADDLEOCR:
@@ -3280,6 +3310,7 @@ bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
     switch (ctx->proj_type()) {
         case PROJECTOR_TYPE_ULTRAVOX:
         case PROJECTOR_TYPE_QWEN2A:
+        case PROJECTOR_TYPE_QWEN3A:
        case PROJECTOR_TYPE_GLMA:
         case PROJECTOR_TYPE_VOXTRAL:
         case PROJECTOR_TYPE_MUSIC_FLAMINGO:
diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h
index 5705d7f21e1..2a972c5165c 100644
--- a/tools/mtmd/models/models.h
+++ b/tools/mtmd/models/models.h
@@ -125,6 +125,11 @@ struct clip_graph_mobilenetv5 : clip_graph {
                             const mobilenetv5_block & block);
 };
 
+struct clip_graph_qwen3a : clip_graph {
+    clip_graph_qwen3a(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+    ggml_cgraph * build() override;
+};
+
 struct clip_graph_kimik25 : clip_graph {
     clip_graph_kimik25(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
     ggml_cgraph * build() override;
diff --git a/tools/mtmd/models/qwen3a.cpp b/tools/mtmd/models/qwen3a.cpp
new file mode 100644
index 00000000000..1384e5155ee
--- /dev/null
+++ b/tools/mtmd/models/qwen3a.cpp
@@ -0,0 +1,68 @@
+#include "models.h"
+
+ggml_cgraph * clip_graph_qwen3a::build() {
+    ggml_tensor * inp = build_inp_raw(1);
+
+    // conv2d block
+    // TODO: do we need to split by chunks of n_window each like on transformers impl?
+    {
+        inp = ggml_conv_2d(ctx0, model.conv2d_1_w, inp, 2, 2, 1, 1, 1, 1);
+        inp = ggml_add(ctx0, inp, model.conv2d_1_b);
+        inp = ggml_gelu_erf(ctx0, inp);
+
+        inp = ggml_conv_2d(ctx0, model.conv2d_2_w, inp, 2, 2, 1, 1, 1, 1);
+        inp = ggml_add(ctx0, inp, model.conv2d_2_b);
+        inp = ggml_gelu_erf(ctx0, inp);
+
+        inp = ggml_conv_2d(ctx0, model.conv2d_3_w, inp, 2, 2, 1, 1, 1, 1);
+        inp = ggml_add(ctx0, inp, model.conv2d_3_b);
+        inp = ggml_gelu_erf(ctx0, inp);
+
+        // inp [n_pos, n_mels/8, channels, 1] (W, H, C, N)
+        cb(inp, "after_conv_blocks", -1);
+
+        const int64_t n_pos_after_conv = inp->ne[0];
+        const int64_t n_mel_after_conv = inp->ne[1]; // 128/8 = 16
+
+        inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 3, 1));
+        inp = ggml_reshape_2d(ctx0, inp, n_pos_after_conv, n_mel_after_conv * inp->ne[3]); // [n_pos, 7680]
+        inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp)); // [7680, n_pos]
+
+        // project to n_embd
+        inp = ggml_mul_mat(ctx0, model.conv_out_w, inp);
+        if (model.conv_out_b) {
+            inp = ggml_add(ctx0, inp, model.conv_out_b);
+        }
+        cb(inp, "after_conv_out", -1);
+    }
+
+    auto n_pos = inp->ne[1];
+
+    ggml_tensor * pos_embd_selected = ggml_view_2d(
+        ctx0, model.position_embeddings,
+        model.position_embeddings->ne[0], n_pos,
+        model.position_embeddings->nb[1], 0
+    );
+    ggml_tensor * cur = build_vit(
+        inp, n_pos,
+        NORM_TYPE_NORMAL,
+        hparams.ffn_op,
+        pos_embd_selected,
+        nullptr);
+
+    cb(cur, "after_transformer", -1);
+
+    // projector
+    cur = build_ffn(cur,
+        model.mm_1_w, model.mm_1_b,
+        nullptr, nullptr,
+        model.mm_2_w, model.mm_2_b,
+        FFN_GELU_ERF,
+        -1);
+
+    cb(cur, "projected", -1);
+
+    ggml_build_forward_expand(gf, cur);
+
+    return gf;
+}
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index 9c400ce1045..8219591ed2e 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -418,6 +418,7 @@ struct mtmd_context {
     // set preprocessor
     switch (proj) {
         case PROJECTOR_TYPE_QWEN2A:
+        case PROJECTOR_TYPE_QWEN3A:
         case PROJECTOR_TYPE_QWEN25O:
             {
                 // <|audio_bos|> ... (embeddings) ... <|audio_eos|>
@@ -981,6 +982,10 @@ bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
 }
 
 bool mtmd_decode_use_mrope(mtmd_context * ctx) {
+    if (ctx->ctx_v == nullptr && ctx->proj_type_a() == PROJECTOR_TYPE_QWEN3A) {
+        // qwen3-asr
+        return true;
+    }
     switch (ctx->proj_type_v()) {
         case PROJECTOR_TYPE_QWEN2VL:
         case PROJECTOR_TYPE_QWEN25VL: