Commit dc33046

Author: Quentin Fuxa
mtmd: add Qwen3-ASR audio support (conv2d encoder + projector)
Add support for the Qwen3-ASR-1.7B model (Qwen3ASRForConditionalGeneration):

- New QWEN3A projector type for audio-only ASR models
- Conv2d encoder (3 layers, stride=2 each, 8x time downsampling)
- Whisper-like transformer encoder (24 layers)
- MLP projector: Linear(1024, 1024) -> GELU -> Linear(1024, 2048)
- Conversion tested: both the mmproj and decoder GGUF files work
- Basic inference tested: the model loads, encodes audio, and generates output

Based on PR ggml-org#19441 by ngxson (WIP qwen3 audio), adapted for the Qwen3-ASR-only architecture (no vision, no deepstack). Our attention extraction API (llama_set_attn_heads / llama_get_attn_ith) is untouched.
1 parent e8734ac commit dc33046

File tree: 10 files changed, +231 -1 lines

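For orientation before the per-file diffs: the encoder turns a mel spectrogram into one embedding per 8 mel frames. A minimal sketch of the per-layer downsampling arithmetic (pure Python; the 100 frames/s mel rate is the usual Whisper-style convention, assumed here rather than stated in the commit):

def conv_out_len(n: int, k: int = 3, s: int = 2, p: int = 1) -> int:
    # standard conv output length: floor((n + 2*p - k) / s) + 1
    return (n + 2 * p - k) // s + 1

n_frames = 1000  # ~10 s of audio at an assumed 100 mel frames/s
for _ in range(3):  # three conv2d layers, stride 2 each
    n_frames = conv_out_len(n_frames)
print(n_frames)  # 125, i.e. one audio token per ~80 ms

# the 128 mel bins shrink the same way: 128 -> 64 -> 32 -> 16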

convert_hf_to_gguf.py

Lines changed: 78 additions & 0 deletions

@@ -4038,6 +4038,59 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         yield from super().modify_tensors(data_torch, name, bid)
 
 
+
+@ModelBase.register("Qwen3ASRForConditionalGeneration")
+class Qwen3ASRAudioModel(MmprojModel):
+    has_vision_encoder = False
+    has_audio_encoder = True
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        assert self.hparams_audio is not None
+        self.hparams_audio["hidden_size"] = self.hparams_audio["d_model"]
+        self.hparams_audio["intermediate_size"] = self.hparams_audio["encoder_ffn_dim"]
+        self.hparams_audio["num_attention_heads"] = self.hparams_audio["encoder_attention_heads"]
+
+    def get_audio_config(self) -> dict[str, Any] | None:
+        return self.global_config.get("thinker_config", {}).get("audio_config")
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN3A)
+        assert self.hparams_audio is not None
+        self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["num_mel_bins"])
+        self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams_audio.get("layer_norm_eps", 1e-5))
+
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        # SinusoidsPositionEmbedding (same as Qwen2.5 Omni)
+        assert self.hparams_audio is not None
+        max_timescale = 10000
+        length = self.hparams_audio.get("max_source_positions", 1500)
+        channels = self.hparams_audio["hidden_size"]
+        log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
+        inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2).float())
+        scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
+        pos_embd = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1).to(dtype=torch.float32)
+        yield ("audio_tower.embed_positions.weight", pos_embd)
+
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        if ".conv" in name and ".weight" in name:
+            return gguf.GGMLQuantizationType.F16
+        return super().tensor_force_quant(name, new_name, bid, n_dims)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name.startswith("thinker."):
+            name = name.replace("thinker.", "")
+
+        if name.startswith("audio_tower."):
+            # conv2d bias needs unsqueeze for ggml conv2d
+            if "conv2d" in name and name.endswith(".bias"):
+                data_torch = data_torch.unsqueeze(-1).unsqueeze(-1)
+            return [(self.map_tensor_name(name), data_torch)]
+
+        return []  # skip text model tensors
+
+
 @ModelBase.register("Qwen2_5OmniModel")
 class Qwen25OmniModel(Qwen2VLVisionModel):
     has_vision_encoder = True
@@ -4698,6 +4751,31 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         yield from super().modify_tensors(data_torch, name, bid)
 
 
+@ModelBase.register("Qwen3ASRForConditionalGeneration")
+class Qwen3ASRTextModel(Qwen3Model):
+    model_arch = gguf.MODEL_ARCH.QWEN3
+
+    def set_gguf_parameters(self):
+        # Override to get text_config from thinker_config
+        if "thinker_config" in self.hparams:
+            text_config = self.hparams["thinker_config"].get("text_config", {})
+            # Merge text_config into hparams so the parent class can use them
+            for k, v in text_config.items():
+                if k not in self.hparams:
+                    self.hparams[k] = v
+        super().set_gguf_parameters()
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # Skip audio tensors - they go in the mmproj file
+        if "audio_tower" in name:
+            return []
+
+        # Strip thinker prefix
+        name = name.replace("thinker.", "")
+
+        yield from super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("Qwen3VLForConditionalGeneration")
 class Qwen3VLTextModel(Qwen3Model):
     model_arch = gguf.MODEL_ARCH.QWEN3VL
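
Both converter classes above register for the same HF architecture but write different GGUF files: Qwen3ASRAudioModel emits the mmproj (audio tower + projector) while Qwen3ASRTextModel emits the text decoder. The nested config they expect looks roughly like this, a sketch inferred from the accessors above with illustrative values rather than ones read from the actual checkpoint:

config = {
    "architectures": ["Qwen3ASRForConditionalGeneration"],
    "thinker_config": {
        "audio_config": {   # consumed by Qwen3ASRAudioModel (mmproj)
            "d_model": 1024,
            "encoder_layers": 24,
            "encoder_ffn_dim": 4096,        # illustrative
            "encoder_attention_heads": 16,  # illustrative
            "num_mel_bins": 128,
            "max_source_positions": 1500,
        },
        "text_config": {    # merged into hparams by Qwen3ASRTextModel
            "hidden_size": 2048,
            "num_hidden_layers": 28,        # illustrative
        },
    },
}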

gguf-py/gguf/constants.py

Lines changed: 7 additions & 0 deletions

@@ -752,6 +752,8 @@ class MODEL_TENSOR(IntEnum):
     A_ENC_EMBD_TO_LOGITS = auto() # lfm2
     A_ENC_CONV1D = auto()
     A_ENC_CONV1D_NORM = auto() # gemma3n
+    A_ENC_CONV2D = auto() # qwen3asr
+    A_ENC_CONV_OUT = auto() # qwen3asr
     A_PRE_NORM = auto()
     A_POST_NORM = auto()
     A_ENC_LAYER_PRE_NORM = auto() # gemma3n
@@ -1193,6 +1195,8 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.A_ENC_EMBD_NORM: "a.position_embd_norm",
     MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS: "a.embd_to_logits",
     MODEL_TENSOR.A_ENC_CONV1D: "a.conv1d.{bid}",
+    MODEL_TENSOR.A_ENC_CONV2D: "a.conv2d.{bid}",
+    MODEL_TENSOR.A_ENC_CONV_OUT: "a.conv_out",
     MODEL_TENSOR.A_ENC_CONV1D_NORM: "a.conv1d.{bid}.norm",
     MODEL_TENSOR.A_PRE_NORM: "a.pre_ln",
     MODEL_TENSOR.A_POST_NORM: "a.post_ln",
@@ -1310,6 +1314,8 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.A_ENC_EMBD_NORM,
         MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS,
         MODEL_TENSOR.A_ENC_CONV1D,
+        MODEL_TENSOR.A_ENC_CONV2D,
+        MODEL_TENSOR.A_ENC_CONV_OUT,
         MODEL_TENSOR.A_ENC_CONV1D_NORM,
         MODEL_TENSOR.A_PRE_NORM,
         MODEL_TENSOR.A_POST_NORM,
@@ -3878,6 +3884,7 @@ class VisionProjectorType:
     ULTRAVOX = "ultravox"
     INTERNVL = "internvl"
     QWEN2A = "qwen2a" # audio
+    QWEN3A = "qwen3a" # audio
     GLMA = "glma" # audio
     QWEN25O = "qwen2.5o" # omni
     VOXTRAL = "voxtral"

gguf-py/gguf/tensor_mapping.py

Lines changed: 10 additions & 1 deletion

@@ -1702,6 +1702,14 @@ class TensorNameMap:
        "model.audio_tower.subsample_conv_projection.conv_{bid}.conv", # gemma3n
    ),
 
+   MODEL_TENSOR.A_ENC_CONV2D: (
+       "audio_tower.conv2d{bid}", # qwen3asr
+   ),
+
+   MODEL_TENSOR.A_ENC_CONV_OUT: (
+       "audio_tower.conv_out", # qwen3asr
+   ),
+
    MODEL_TENSOR.A_ENC_CONV1D_NORM: (
        "model.audio_tower.subsample_conv_projection.conv_{bid}.norm", # gemma3n
    ),
@@ -1830,7 +1838,8 @@ class TensorNameMap:
 
    MODEL_TENSOR.A_MMPROJ: (
        "audio.multi_modal_projector.linear_{bid}", # ultravox
-       "audio_adapter.model.{bid}" # lfm2
+       "audio_adapter.model.{bid}", # lfm2
+       "audio_tower.proj{bid}", # qwen3asr
    ),
 
    MODEL_TENSOR.A_MMPROJ_FC: (
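
A quick way to confirm these mappings took effect is to list the audio tensors in the converted mmproj file. A minimal sketch using gguf-py's GGUFReader (the file name is a placeholder):

from gguf import GGUFReader

reader = GGUFReader("mmproj-qwen3-asr.gguf")  # placeholder path
for t in reader.tensors:
    if t.name.startswith("a.conv") or t.name.startswith("mm.a."):
        print(f"{t.name:24s} {list(t.shape)}")

Per the mappings above, this should show a.conv2d.{1,2,3} (weight and bias), a.conv_out, and the mm.a.mlp.{1,2} projector tensors.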

tools/mtmd/CMakeLists.txt

Lines changed: 1 addition & 0 deletions

@@ -28,6 +28,7 @@ add_library(mtmd
     models/pixtral.cpp
     models/qwen2vl.cpp
     models/qwen3vl.cpp
+    models/qwen3a.cpp
     models/siglip.cpp
     models/whisper-enc.cpp
     models/mobilenetv5.cpp

tools/mtmd/clip-impl.h

Lines changed: 4 additions & 0 deletions

@@ -128,6 +128,8 @@
 
 // ultravox
 #define TN_CONV1D "a.conv1d.%d.%s"
+#define TN_CONV2D "a.conv2d.%d.%s"
+#define TN_CONV_OUT "a.conv_out.%s"
 #define TN_MM_AUDIO_MLP "mm.a.mlp.%d.%s"
 #define TN_MM_AUDIO_FC "mm.a.fc.%s" // fully connected layer
 #define TN_MM_NORM_PRE "mm.a.norm_pre.%s"
@@ -223,6 +225,7 @@ enum projector_type {
     PROJECTOR_TYPE_INTERNVL,
     PROJECTOR_TYPE_LLAMA4,
     PROJECTOR_TYPE_QWEN2A,
+    PROJECTOR_TYPE_QWEN3A,
     PROJECTOR_TYPE_GLMA,
     PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx
     PROJECTOR_TYPE_VOXTRAL,
@@ -259,6 +262,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_INTERNVL,  "internvl"},
     { PROJECTOR_TYPE_LLAMA4,    "llama4"},
     { PROJECTOR_TYPE_QWEN2A,    "qwen2a"},
+    { PROJECTOR_TYPE_QWEN3A,    "qwen3a"},
     { PROJECTOR_TYPE_GLMA,      "glma"},
     { PROJECTOR_TYPE_QWEN25O,   "qwen2.5o"},
     { PROJECTOR_TYPE_VOXTRAL,   "voxtral"},

tools/mtmd/clip-model.h

Lines changed: 10 additions & 0 deletions

@@ -356,7 +356,17 @@ struct clip_model {
     ggml_tensor * conv1d_1_b = nullptr;
     ggml_tensor * conv1d_2_w = nullptr;
     ggml_tensor * conv1d_2_b = nullptr;
+    ggml_tensor * conv_out_w = nullptr;
+    ggml_tensor * conv_out_b = nullptr;
     ggml_tensor * mm_norm_pre_w = nullptr;
+
+    // qwen3a (conv2d audio encoder)
+    ggml_tensor * conv2d_1_w = nullptr;
+    ggml_tensor * conv2d_1_b = nullptr;
+    ggml_tensor * conv2d_2_w = nullptr;
+    ggml_tensor * conv2d_2_b = nullptr;
+    ggml_tensor * conv2d_3_w = nullptr;
+    ggml_tensor * conv2d_3_b = nullptr;
     ggml_tensor * mm_norm_pre_b = nullptr;
     ggml_tensor * mm_norm_mid_w = nullptr;
 
tools/mtmd/clip.cpp

Lines changed: 30 additions & 0 deletions

@@ -837,6 +837,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             {
                 builder = std::make_unique<clip_graph_whisper_enc>(ctx, img);
             } break;
+        case PROJECTOR_TYPE_QWEN3A:
+            {
+                builder = std::make_unique<clip_graph_qwen3a>(ctx, img);
+            } break;
         case PROJECTOR_TYPE_KIMIVL:
             {
                 builder = std::make_unique<clip_graph_kimivl>(ctx, img);
@@ -1242,6 +1246,7 @@ struct clip_model_loader {
                 } break;
             case PROJECTOR_TYPE_ULTRAVOX:
             case PROJECTOR_TYPE_QWEN2A:
+            case PROJECTOR_TYPE_QWEN3A:
             case PROJECTOR_TYPE_GLMA:
             case PROJECTOR_TYPE_VOXTRAL:
             case PROJECTOR_TYPE_MUSIC_FLAMINGO:
@@ -1767,6 +1772,20 @@ struct clip_model_loader {
                     model.mm_fc_w = get_tensor(string_format(TN_MM_AUDIO_FC, "weight"));
                     model.mm_fc_b = get_tensor(string_format(TN_MM_AUDIO_FC, "bias"));
                 } break;
+            case PROJECTOR_TYPE_QWEN3A:
+                {
+                    model.conv2d_1_w = get_tensor(string_format(TN_CONV2D, 1, "weight"));
+                    model.conv2d_1_b = get_tensor(string_format(TN_CONV2D, 1, "bias"));
+                    model.conv2d_2_w = get_tensor(string_format(TN_CONV2D, 2, "weight"));
+                    model.conv2d_2_b = get_tensor(string_format(TN_CONV2D, 2, "bias"));
+                    model.conv2d_3_w = get_tensor(string_format(TN_CONV2D, 3, "weight"));
+                    model.conv2d_3_b = get_tensor(string_format(TN_CONV2D, 3, "bias"));
+                    model.conv_out_w = get_tensor(string_format(TN_CONV_OUT, "weight"));
+                    model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
+                    model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias"));
+                    model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
+                    model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias"));
+                } break;
             case PROJECTOR_TYPE_VOXTRAL:
                 {
                     model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
@@ -3501,6 +3520,13 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img)
                     n_patches /= 2;
                 }
             } break;
+        case PROJECTOR_TYPE_QWEN3A:
+            {
+                // Qwen3-ASR: 3 conv2d layers, each stride=2 (8x downsampling in time)
+                // mel-bin axis: 128 -> floor((128+2*1-3)/2)+1 = 64, then 32, then 16
+                // time axis: n_frames/8
+                n_patches = img->nx / 8;
+            } break;
         case PROJECTOR_TYPE_GLMA:
             {
                 n_patches = img->nx;
@@ -3878,6 +3904,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             case PROJECTOR_TYPE_INTERNVL:
             case PROJECTOR_TYPE_NEMOTRON_V2_VL:
             case PROJECTOR_TYPE_QWEN2A:
+            case PROJECTOR_TYPE_QWEN3A:
             case PROJECTOR_TYPE_GLMA:
             case PROJECTOR_TYPE_ULTRAVOX:
             case PROJECTOR_TYPE_LFM2:
@@ -4046,6 +4073,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
             return ctx->model.mm_model_proj->ne[1];
         case PROJECTOR_TYPE_QWEN2A:
             return ctx->model.mm_fc_w->ne[1];
+        case PROJECTOR_TYPE_QWEN3A:
+            return ctx->model.mm_2_w->ne[1];
         case PROJECTOR_TYPE_GLMA:
             return ctx->model.mm_2_w->ne[1];
         case PROJECTOR_TYPE_LFM2:
@@ -4093,6 +4122,7 @@ bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
     switch (ctx->proj_type()) {
         case PROJECTOR_TYPE_ULTRAVOX:
         case PROJECTOR_TYPE_QWEN2A:
+        case PROJECTOR_TYPE_QWEN3A:
         case PROJECTOR_TYPE_GLMA:
         case PROJECTOR_TYPE_VOXTRAL:
         case PROJECTOR_TYPE_MUSIC_FLAMINGO:
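
A worked example for the clip_n_output_tokens case above: with 8x time downsampling, the token count is simply the mel frame count divided by 8. A small sketch (assuming the usual 100 mel frames per second, a rate this diff does not state):

def n_audio_tokens(seconds: float, frames_per_s: int = 100) -> int:
    # mirrors n_patches = img->nx / 8 in clip.cpp (integer division)
    return int(seconds * frames_per_s) // 8

print(n_audio_tokens(30.0))  # 375 tokens for a 30-second clip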

tools/mtmd/models/models.h

Lines changed: 5 additions & 0 deletions

@@ -126,3 +126,8 @@ struct clip_graph_kimik25 : clip_graph {
 
     ggml_tensor * resize_position_embeddings_3d(uint32_t interpolation_mode);
 };
+
+struct clip_graph_qwen3a : clip_graph {
+    clip_graph_qwen3a(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+    ggml_cgraph * build() override;
+};

tools/mtmd/models/qwen3a.cpp

Lines changed: 80 additions & 0 deletions

@@ -0,0 +1,80 @@
+#include "models.h"
+
+ggml_cgraph * clip_graph_qwen3a::build() {
+    // Qwen3-ASR audio encoder
+    // Input: mel spectrogram [n_mel_bins, n_frames] = [128, n_frames]
+    // Conv2d block: 3 layers, stride=2 each -> 8x time downsampling
+    // Transformer: whisper-like encoder
+    // Projector: Linear(d_model, d_model) -> GELU -> Linear(d_model, output_dim)
+
+    ggml_tensor * inp = build_inp_raw(1);
+
+    // conv2d block
+    {
+        // Conv2d(1, 480, 3, stride=2, padding=1) + GELU
+        inp = ggml_conv_2d(ctx0, model.conv2d_1_w, inp, 2, 2, 1, 1, 1, 1);
+        inp = ggml_add(ctx0, inp, model.conv2d_1_b);
+        inp = ggml_gelu_erf(ctx0, inp);
+
+        // Conv2d(480, 480, 3, stride=2, padding=1) + GELU
+        inp = ggml_conv_2d(ctx0, model.conv2d_2_w, inp, 2, 2, 1, 1, 1, 1);
+        inp = ggml_add(ctx0, inp, model.conv2d_2_b);
+        inp = ggml_gelu_erf(ctx0, inp);
+
+        // Conv2d(480, 480, 3, stride=2, padding=1) + GELU
+        inp = ggml_conv_2d(ctx0, model.conv2d_3_w, inp, 2, 2, 1, 1, 1, 1);
+        inp = ggml_add(ctx0, inp, model.conv2d_3_b);
+        inp = ggml_gelu_erf(ctx0, inp);
+
+        // inp shape: [channels=480, freq_bins, time_out, 1]
+        // We need to reshape to [channels * freq_bins, time_out] then project
+        cb(inp, "after_conv_blocks", -1);
+
+        // Permute: [channels, freq, time] -> [time, freq, channels]
+        inp = ggml_permute(ctx0, inp, 2, 1, 0, 3);
+        inp = ggml_cont(ctx0, inp);
+
+        // Flatten freq * channels: [time, freq * channels]
+        inp = ggml_reshape_2d(ctx0, inp, inp->ne[0] * inp->ne[1], inp->ne[2]);
+
+        // Linear projection to d_model (conv_out: no bias)
+        inp = ggml_mul_mat(ctx0, model.conv_out_w, inp);
+        if (model.conv_out_b) {
+            inp = ggml_add(ctx0, inp, model.conv_out_b);
+        }
+        cb(inp, "after_conv_out", -1);
+    }
+
+    auto n_pos = inp->ne[1];
+
+    // Add positional embeddings
+    ggml_tensor * pos_embd_selected = ggml_view_2d(
+        ctx0, model.position_embeddings,
+        model.position_embeddings->ne[0], n_pos,
+        model.position_embeddings->nb[1], 0
+    );
+
+    // Whisper-like transformer encoder
+    ggml_tensor * cur = build_vit(
+        inp, n_pos,
+        NORM_TYPE_NORMAL,
+        hparams.ffn_op,
+        pos_embd_selected,
+        nullptr);
+
+    cb(cur, "after_transformer", -1);
+
+    // Projector: proj1 -> GELU -> proj2
+    cur = build_ffn(cur,
+        model.mm_1_w, model.mm_1_b,
+        nullptr, nullptr,
+        model.mm_2_w, model.mm_2_b,
+        FFN_GELU_ERF,
+        -1);
+
+    cb(cur, "projected", -1);
+
+    ggml_build_forward_expand(gf, cur);
+
+    return gf;
+}
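
For readers who think in PyTorch, here is a minimal sketch of the conv front-end that clip_graph_qwen3a::build() implements, up to the transformer input. The 480-channel width, kernel/stride/padding, and erf GELU come from the code above; d_model = 1024 comes from the commit message; the flatten order of the (freq, channel) axes is assumed to match the ggml permute/reshape. The module name is hypothetical, not the HF implementation:

import torch
import torch.nn as nn

class Qwen3ASRConvFrontend(nn.Module):  # hypothetical reference module
    def __init__(self, n_mel: int = 128, channels: int = 480, d_model: int = 1024):
        super().__init__()
        self.conv1 = nn.Conv2d(1, channels, 3, stride=2, padding=1)
        self.conv2 = nn.Conv2d(channels, channels, 3, stride=2, padding=1)
        self.conv3 = nn.Conv2d(channels, channels, 3, stride=2, padding=1)
        self.act = nn.GELU()  # exact (erf) GELU, matching ggml_gelu_erf
        self.conv_out = nn.Linear(channels * (n_mel // 8), d_model, bias=False)

    def forward(self, mel: torch.Tensor) -> torch.Tensor:
        # mel: [batch, 1, n_mel=128, n_frames]
        x = self.act(self.conv1(mel))
        x = self.act(self.conv2(x))
        x = self.act(self.conv3(x))           # [batch, 480, 16, n_frames/8]
        x = x.permute(0, 3, 2, 1).flatten(2)  # [batch, n_frames/8, 16*480]
        return self.conv_out(x)               # [batch, n_frames/8, 1024]

mel = torch.randn(1, 1, 128, 800)  # 800 mel frames
print(Qwen3ASRConvFrontend()(mel).shape)  # torch.Size([1, 100, 1024])

From here the ggml graph adds the sinusoidal position embeddings, runs the 24-layer whisper-like encoder (build_vit), and applies the mm.a.mlp projector.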

tools/mtmd/mtmd.cpp

Lines changed: 6 additions & 0 deletions

@@ -342,6 +342,7 @@ struct mtmd_context {
         // set preprocessor
         switch (proj) {
             case PROJECTOR_TYPE_QWEN2A:
+            case PROJECTOR_TYPE_QWEN3A:
             case PROJECTOR_TYPE_QWEN25O:
             case PROJECTOR_TYPE_ULTRAVOX:
             case PROJECTOR_TYPE_VOXTRAL:
@@ -365,6 +366,11 @@
             aud_beg = "<|audio_bos|>";
             aud_end = "<|audio_eos|>";
 
+        } else if (proj == PROJECTOR_TYPE_QWEN3A) {
+            // <|audio_start|> ... (embeddings) ... <|audio_end|>
+            aud_beg = "<|audio_start|>";
+            aud_end = "<|audio_end|>";
+
         } else if (proj == PROJECTOR_TYPE_ULTRAVOX) {
             // [BEGIN_AUDIO] ... (embeddings) ...
             aud_beg = "[BEGIN_AUDIO]";
