@@ -4038,6 +4038,59 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
40384038 yield from super().modify_tensors(data_torch, name, bid)
40394039
40404040
4041+
@ModelBase.register("Qwen3ASRForConditionalGeneration")
class Qwen3ASRAudioModel(MmprojModel):
    """Audio (mmproj) encoder converter for Qwen3-ASR.

    Extracts the audio tower of the "thinker" sub-model and writes it as an
    mmproj GGUF; text-model tensors are skipped here (they are handled by the
    text-model converter registered for the same architecture).
    """
    has_vision_encoder = False  # audio-only multimodal projector
    has_audio_encoder = True

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        assert self.hparams_audio is not None
        # Map the Whisper-style HF config keys onto the generic names that
        # the MmprojModel base class expects.
        self.hparams_audio["hidden_size"] = self.hparams_audio["d_model"]
        self.hparams_audio["intermediate_size"] = self.hparams_audio["encoder_ffn_dim"]
        self.hparams_audio["num_attention_heads"] = self.hparams_audio["encoder_attention_heads"]

    def get_audio_config(self) -> dict[str, Any] | None:
        # Qwen3-ASR nests the audio encoder config under thinker_config.
        return self.global_config.get("thinker_config", {}).get("audio_config")

    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN3A)
        assert self.hparams_audio is not None
        self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["num_mel_bins"])
        # layer_norm_eps is optional in the HF config; 1e-5 is the usual default.
        self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams_audio.get("layer_norm_eps", 1e-5))

    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
        # SinusoidsPositionEmbedding (same as Qwen2.5 Omni): a fixed,
        # non-learned position table that is absent from the HF checkpoint,
        # so it is synthesized here and emitted as an extra tensor.
        assert self.hparams_audio is not None
        max_timescale = 10000
        length = self.hparams_audio.get("max_source_positions", 1500)
        channels = self.hparams_audio["hidden_size"]
        log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
        inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2).float())
        scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
        pos_embd = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1).to(dtype=torch.float32)
        yield ("audio_tower.embed_positions.weight", pos_embd)

    def tensor_force_quant(self, name, new_name, bid, n_dims):
        # Conv weights are quantization-sensitive; keep them at F16.
        if ".conv" in name and ".weight" in name:
            return gguf.GGMLQuantizationType.F16
        return super().tensor_force_quant(name, new_name, bid, n_dims)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # Strip only a LEADING "thinker." prefix. removeprefix (not replace)
        # guarantees an interior occurrence of the substring is never mangled.
        name = name.removeprefix("thinker.")

        if name.startswith("audio_tower."):
            # conv2d bias needs unsqueeze for ggml conv2d
            if "conv2d" in name and name.endswith(".bias"):
                data_torch = data_torch.unsqueeze(-1).unsqueeze(-1)
            return [(self.map_tensor_name(name), data_torch)]

        return []  # skip text model tensors
4093+
40414094@ModelBase.register("Qwen2_5OmniModel")
40424095class Qwen25OmniModel(Qwen2VLVisionModel):
40434096 has_vision_encoder = True
@@ -4698,6 +4751,31 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
46984751 yield from super().modify_tensors(data_torch, name, bid)
46994752
47004753
@ModelBase.register("Qwen3ASRForConditionalGeneration")
class Qwen3ASRTextModel(Qwen3Model):
    """Text-model converter for Qwen3-ASR.

    The language model is a plain Qwen3 whose config is nested under
    thinker_config; audio-tower tensors are excluded (they go into the
    mmproj file produced by the audio converter).
    """
    model_arch = gguf.MODEL_ARCH.QWEN3

    def set_gguf_parameters(self):
        # Hoist text_config out of thinker_config so the Qwen3Model parent
        # reads the right hyperparameters. setdefault keeps any key that is
        # already present at the top level.
        if "thinker_config" in self.hparams:
            for key, value in self.hparams["thinker_config"].get("text_config", {}).items():
                self.hparams.setdefault(key, value)
        super().set_gguf_parameters()

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # Skip audio tensors - they go in the mmproj file.
        # Bare `return` (not `return []`): this is a generator function, so a
        # returned value would only become a dead StopIteration.value.
        if "audio_tower" in name:
            return

        # Strip only a LEADING "thinker." prefix; str.replace would also
        # corrupt a name containing "thinker." elsewhere.
        name = name.removeprefix("thinker.")

        yield from super().modify_tensors(data_torch, name, bid)
4778+
47014779@ModelBase.register("Qwen3VLForConditionalGeneration")
47024780class Qwen3VLTextModel(Qwen3Model):
47034781 model_arch = gguf.MODEL_ARCH.QWEN3VL
0 commit comments