30 changes: 22 additions & 8 deletions convert_hf_to_gguf.py
@@ -2643,7 +2643,7 @@ def set_gguf_parameters(self):
self.gguf_writer.add_file_type(self.ftype)


@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM")
@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM", "Qwen2AudioForConditionalGeneration")
class Qwen2Model(TextModel):
model_arch = gguf.MODEL_ARCH.QWEN2

@@ -2667,8 +2667,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
name = f"model.{name}" # map to Qwen2ForCausalLM tensors
if "language_model." in name:
name = name.replace("language_model.", "") # for InternVL
if name.startswith("mlp") or name.startswith("vision_model"):
# skip visual tensors
if name.startswith("mlp") or name.startswith("multi_modal_projector") \
or name.startswith("vision_model") or name.startswith("audio_tower"):
# skip vision and audio tensors
return []
yield from super().modify_tensors(data_torch, name, bid)
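For intuition, here is a minimal sketch of the prefix filter above (tensor names are hypothetical examples, not taken from a real checkpoint): when a Qwen2AudioForConditionalGeneration checkpoint is converted as a text-only model, every vision and audio tensor is dropped before the usual Qwen2 mapping runs.

```python
# Illustrative only; the real check runs inside modify_tensors.
SKIP_PREFIXES = ("mlp", "multi_modal_projector", "vision_model", "audio_tower")

for name in ("model.layers.0.self_attn.q_proj.weight",   # hypothetical text tensor
             "audio_tower.layers.0.fc1.weight"):         # hypothetical audio tensor
    action = "skip" if name.startswith(SKIP_PREFIXES) else "keep"
    print(f"{name}: {action}")
```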

@@ -5993,11 +5994,11 @@ class UltravoxModel(TextModel):

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
raise NotImplementedError("Ultravox does not have text decoder. Please use --mmproj argument")
raise NotImplementedError("Ultravox does not have text decoder. Instead, it uses Llama or other models for text. If you want to get the audio encoder, please use --mmproj argument")


@ModelBase.register("UltravoxModel")
class UltravoxAudioModel(MmprojModel):
@ModelBase.register("Qwen2AudioForConditionalGeneration")
class WhisperEncoderModel(MmprojModel):
has_vision_encoder = False # no vision encoder
has_audio_encoder = True

@@ -6009,10 +6010,9 @@ def __init__(self, *args, **kwargs):

def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.ULTRAVOX)
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN2A)
self.gguf_writer.add_audio_num_mel_bins(self.hparams["num_mel_bins"])
self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5))
self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"])

def tensor_force_quant(self, name, new_name, bid, n_dims):
del bid, new_name, n_dims # unused
@@ -6023,6 +6023,10 @@ def tensor_force_quant(self, name, new_name, bid, n_dims):
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused

if name.startswith("language_model."):
# skip language model tensors
return []

# prevent clash naming with vision tensors
if name.startswith("multi_modal_projector"):
name = "audio." + name
@@ -6033,6 +6037,16 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter

return [(self.map_tensor_name(name), data_torch)]


@ModelBase.register("UltravoxModel")
class UltravoxWhisperEncoderModel(WhisperEncoderModel):
has_vision_encoder = False # no vision encoder
has_audio_encoder = True

def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"])

###### CONVERSION LOGIC ######


4 changes: 4 additions & 0 deletions docs/multimodal.md
@@ -89,4 +89,8 @@ NOTE: some models may require large context window, for example: `-c 8192`
# Ultravox 0.5
(tool_name) -hf ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF
(tool_name) -hf ggml-org/ultravox-v0_5-llama-3_1-8b-GGUF

# Qwen2-Audio and SeaLLM-Audio
# note: no pre-quantized GGUF for this model, as the results are very poor
# ref: https://github.com/ggml-org/llama.cpp/pull/13760
```
4 changes: 4 additions & 0 deletions gguf-py/gguf/constants.py
@@ -546,6 +546,7 @@ class MODEL_TENSOR(IntEnum):
A_ENC_FFN_GATE = auto()
A_ENC_FFN_DOWN = auto()
A_MMPROJ = auto()
A_MMPROJ_FC = auto()
A_MM_NORM_PRE = auto()
A_MM_NORM_MID = auto()

@@ -825,6 +826,7 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.A_ENC_FFN_GATE: "a.blk.{bid}.ffn_gate",
MODEL_TENSOR.A_ENC_FFN_DOWN: "a.blk.{bid}.ffn_down",
MODEL_TENSOR.A_MMPROJ: "mm.a.mlp.{bid}",
MODEL_TENSOR.A_MMPROJ_FC: "mm.a.fc",
MODEL_TENSOR.A_MM_NORM_PRE: "mm.a.norm_pre",
MODEL_TENSOR.A_MM_NORM_MID: "mm.a.norm_mid",
}
@@ -885,6 +887,7 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.A_ENC_FFN_GATE,
MODEL_TENSOR.A_ENC_FFN_DOWN,
MODEL_TENSOR.A_MMPROJ,
MODEL_TENSOR.A_MMPROJ_FC,
MODEL_TENSOR.A_MM_NORM_PRE,
MODEL_TENSOR.A_MM_NORM_MID,
],
@@ -2256,6 +2259,7 @@ class VisionProjectorType:
QWEN25VL = "qwen2.5vl_merger"
ULTRAVOX = "ultravox"
INTERNVL = "internvl"
QWEN2A = "qwen2a" # audio


# Items here are (block size, type size)
4 changes: 4 additions & 0 deletions gguf-py/gguf/tensor_mapping.py
@@ -1165,6 +1165,10 @@ class TensorNameMap:
"audio.multi_modal_projector.linear_{bid}", # ultravox
),

MODEL_TENSOR.A_MMPROJ_FC: (
"audio.multi_modal_projector.linear", # qwen2audio
),

MODEL_TENSOR.A_MM_NORM_PRE: (
"audio.multi_modal_projector.ln_pre", # ultravox
),
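To see how these entries meet the converter change above, here is a simplified sketch of the lookup (the real TensorNameMap handles many more patterns and suffixes): modify_tensors prefixes the HF projector tensor with "audio.", and this table then resolves it to the GGUF base name "mm.a.fc".

```python
# Simplified sketch; names follow the mappings shown above.
hf_name = "multi_modal_projector.linear.weight"   # name in the HF checkpoint
prefixed = "audio." + hf_name                     # prefix applied in modify_tensors
base, suffix = prefixed.rsplit(".", 1)            # split off "weight"/"bias"
A_MMPROJ_FC = {"audio.multi_modal_projector.linear": "mm.a.fc"}
print(A_MMPROJ_FC[base] + "." + suffix)           # -> mm.a.fc.weight
```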
3 changes: 3 additions & 0 deletions tools/mtmd/clip-impl.h
@@ -107,6 +107,7 @@
// ultravox
#define TN_CONV1D "a.conv1d.%d.%s"
#define TN_MM_AUDIO_MLP "mm.a.mlp.%d.%s"
#define TN_MM_AUDIO_FC "mm.a.fc.%s" // fully connected layer
#define TN_MM_NORM_PRE "mm.a.norm_pre.%s"
#define TN_MM_NORM_MID "mm.a.norm_mid.%s"

@@ -128,6 +129,7 @@ enum projector_type {
PROJECTOR_TYPE_ULTRAVOX,
PROJECTOR_TYPE_INTERNVL,
PROJECTOR_TYPE_LLAMA4,
PROJECTOR_TYPE_QWEN2A,
PROJECTOR_TYPE_UNKNOWN,
};

@@ -145,6 +147,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_ULTRAVOX, "ultravox"},
{ PROJECTOR_TYPE_INTERNVL, "internvl"},
{ PROJECTOR_TYPE_LLAMA4, "llama4"},
{ PROJECTOR_TYPE_QWEN2A, "qwen2a"},
};

static projector_type clip_projector_type_from_string(const std::string & str) {
120 changes: 83 additions & 37 deletions tools/mtmd/clip.cpp
@@ -254,7 +254,9 @@ struct clip_vision_model {
ggml_tensor * post_ln_w;
ggml_tensor * post_ln_b;

ggml_tensor * projection;
ggml_tensor * projection; // TODO: rename it to fc (fully connected layer)
ggml_tensor * mm_fc_w;
ggml_tensor * mm_fc_b;

// LLaVA projection
ggml_tensor * mm_input_norm_w = nullptr;
@@ -1471,48 +1473,58 @@ struct clip_graph {

cb(cur, "after_transformer", -1);

// StackAudioFrames
// https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py
{
int64_t stride = n_embd * hparams.proj_stack_factor;
int64_t padded_len = GGML_PAD(ggml_nelements(cur), stride);
int64_t pad = padded_len - ggml_nelements(cur);
if (pad > 0) {
cur = ggml_view_1d(ctx0, cur, ggml_nelements(cur), 0);
cur = ggml_pad(ctx0, cur, pad, 0, 0, 0);
if (ctx->proj_type == PROJECTOR_TYPE_ULTRAVOX) {
// StackAudioFrames
// https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py
{
int64_t stride = n_embd * hparams.proj_stack_factor;
int64_t padded_len = GGML_PAD(ggml_nelements(cur), stride);
int64_t pad = padded_len - ggml_nelements(cur);
if (pad > 0) {
cur = ggml_view_1d(ctx0, cur, ggml_nelements(cur), 0);
cur = ggml_pad(ctx0, cur, pad, 0, 0, 0);
}
cur = ggml_view_2d(ctx0, cur, stride, padded_len / stride,
ggml_row_size(cur->type, stride), 0);
}
cur = ggml_view_2d(ctx0, cur, stride, padded_len / stride,
ggml_row_size(cur->type, stride), 0);
}

cb(cur, "after_stacked", -1);
cb(cur, "after_stacked", -1);

// UltravoxProjector
{
// pre-norm
cur = ggml_rms_norm(ctx0, cur, 1e-6);
cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
// UltravoxProjector
{
// pre-norm
cur = ggml_rms_norm(ctx0, cur, 1e-6);
cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);

// ffn in
cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
// ffn in
cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);

// swiglu
{
int64_t split_point = cur->ne[0] / 2;
ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
// swiglu
{
int64_t split_point = cur->ne[0] / 2;
ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));

// see SwiGLU in ultravox_model.py, the second half passed through is silu, not the first half
x1 = ggml_silu(ctx0, x1);
cur = ggml_mul(ctx0, x0, x1);
}

// mid-norm
cur = ggml_rms_norm(ctx0, cur, 1e-6);
cur = ggml_mul(ctx0, cur, model.mm_norm_mid_w);

// see SwiGLU in ultravox_model.py, the second half passed through is silu, not the first half
x1 = ggml_silu(ctx0, x1);
cur = ggml_mul(ctx0, x0, x1);
// ffn out
cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
}

// mid-norm
cur = ggml_rms_norm(ctx0, cur, 1e-6);
cur = ggml_mul(ctx0, cur, model.mm_norm_mid_w);
} else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2A) {
// projector
cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur);
cur = ggml_add(ctx0, cur, model.mm_fc_b);

// ffn out
cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
} else {
GGML_ABORT("%s: unknown projector type", __func__);
}

cb(cur, "projected", -1);
@@ -1655,6 +1667,17 @@ struct clip_graph {
inpL = cur;
}

// TODO @ngxson : find a way to move this outside
if (ctx->proj_type == PROJECTOR_TYPE_QWEN2A) {
ggml_tensor * cur = inpL;
cur = ggml_transpose(ctx0, cur);
cur = ggml_cast(ctx0, cur, GGML_TYPE_F32);
Review comment (Member):
Any reason to prefer ggml_cast here over ggml_cont?

Reply (ngxson, Collaborator/Author, May 25, 2025):
I had a problem with ggml_compute_forward_pool_1d and, when looking into the source code, I mistakenly thought that it only supports F32. Changed to ggml_cont in e53a0dc.

Note: I got this error without cast or cont; maybe we should assert that the input is contiguous:

==83831==ERROR: AddressSanitizer: BUS on unknown address (pc 0x000104b899ec bp 0x00016bbc1010 sp 0x00016bbc0fd0 T0)
==83831==The signal is caused by a WRITE memory access.
==83831==Hint: this fault was caused by a dereference of a high value address (see register values below).  Disassemble the provided pc to learn which register was used.
    #0 0x104b899ec in ggml_compute_forward_pool_1d ops.cpp:6395
    #1 0x104ae9920 in ggml_graph_compute_thread ggml-cpu.c:2847
    #2 0x104ae7e78 in ggml_graph_compute ggml-cpu.c:3138
    #3 0x104aef350 in ggml_backend_cpu_graph_compute(ggml_backend*, ggml_cgraph*) ggml-cpu.cpp:172
    #4 0x10542513c in ggml_backend_sched_graph_compute_async ggml-backend.cpp:1594
    #5 0x105424500 in ggml_backend_sched_graph_compute ggml-backend.cpp:1578
cur = ggml_pool_1d(ctx0, cur, GGML_OP_POOL_AVG, 2, 2, 0);
cur = ggml_transpose(ctx0, cur);
cur = ggml_cast(ctx0, cur, GGML_TYPE_F32);
inpL = cur;
}

// post-layernorm
if (model.post_ln_w) {
inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, -1);
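The net effect of this transpose/pool/transpose sequence is pairwise averaging along the time axis (ggml_pool_1d pools along the first dimension, hence the transposes, and the cast/cont makes the transposed view contiguous, which is what the AddressSanitizer report in the review thread above was about). A small numpy sketch under an assumed [n_embd, n_frames] layout:

```python
import numpy as np

def avg_pool_frames(x: np.ndarray) -> np.ndarray:
    # x: [n_embd, n_frames]; like nn.AvgPool1d(2, stride=2),
    # a trailing odd frame is dropped
    n_embd, n_frames = x.shape
    n_out = n_frames // 2
    return x[:, :n_out * 2].reshape(n_embd, n_out, 2).mean(axis=-1)

x = np.arange(12, dtype=np.float32).reshape(2, 6)
print(avg_pool_frames(x))   # averages frame pairs (0,1), (2,3), (4,5)
```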
@@ -1952,6 +1975,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
res = graph.build_llama4();
} break;
case PROJECTOR_TYPE_ULTRAVOX:
case PROJECTOR_TYPE_QWEN2A:
{
res = graph.build_whisper_enc();
} break;
@@ -2186,8 +2210,10 @@ struct clip_model_loader {
};
} break;
case PROJECTOR_TYPE_ULTRAVOX:
case PROJECTOR_TYPE_QWEN2A:
{
get_u32(KEY_A_PROJ_STACK_FACTOR, hparams.proj_stack_factor);
bool require_stack = ctx_clip.proj_type == PROJECTOR_TYPE_ULTRAVOX;
get_u32(KEY_A_PROJ_STACK_FACTOR, hparams.proj_stack_factor, require_stack);
if (hparams.n_mel_bins != 128) {
throw std::runtime_error(string_format("%s: only 128 mel bins are supported for ultravox\n", __func__));
}
@@ -2266,7 +2292,7 @@ struct clip_model_loader {
return cur;
};

auto & vision_model = ctx_clip.vision_model;
auto & vision_model = ctx_clip.vision_model; // TODO: rename this to just "model"

vision_model.class_embedding = get_tensor(TN_CLASS_EMBD, false);

@@ -2463,6 +2489,15 @@ struct clip_model_loader {
vision_model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight"));
vision_model.mm_norm_mid_w = get_tensor(string_format(TN_MM_NORM_MID, "weight"));
} break;
case PROJECTOR_TYPE_QWEN2A:
{
vision_model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
vision_model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
vision_model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
vision_model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
vision_model.mm_fc_w = get_tensor(string_format(TN_MM_AUDIO_FC, "weight"));
vision_model.mm_fc_b = get_tensor(string_format(TN_MM_AUDIO_FC, "bias"));
} break;
case PROJECTOR_TYPE_INTERNVL:
{
vision_model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
@@ -3450,6 +3485,10 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
const int proj_stack_factor = ctx->vision_model.hparams.proj_stack_factor;
const int n_len = CLIP_ALIGN(img->nx, proj_stack_factor);
n_patches = n_len / proj_stack_factor / 2;
} else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2A) {
Review comment (Member):
nit: this should become a switch (no need to change in this PR)
// divide by 2 because of whisper
// another divide by 2 because of nn.AvgPool1d(2, stride=2)
n_patches = img->nx / 4;
}

return n_patches;
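As a sanity check, the token counts implied by both branches can be reproduced in a few lines (illustrative only; stack_factor = 8 is assumed here, matching the usual Ultravox configuration):

```python
def n_audio_tokens(n_frames: int, proj: str, stack_factor: int = 8) -> int:
    if proj == "ultravox":
        n_len = -(-n_frames // stack_factor) * stack_factor   # CLIP_ALIGN: round up
        return n_len // stack_factor // 2
    if proj == "qwen2a":
        return n_frames // 4   # /2 from whisper, /2 from nn.AvgPool1d(2, 2)
    raise ValueError(proj)

print(n_audio_tokens(3000, "ultravox"))  # 187
print(n_audio_tokens(3000, "qwen2a"))    # 750
```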
@@ -3850,6 +3889,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
case PROJECTOR_TYPE_GEMMA3:
case PROJECTOR_TYPE_IDEFICS3:
case PROJECTOR_TYPE_INTERNVL:
case PROJECTOR_TYPE_QWEN2A:
case PROJECTOR_TYPE_ULTRAVOX:
{
// do nothing
@@ -3910,7 +3950,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
const int n_tokens_out = embeddings->ne[1];
const int expected_n_tokens_out = clip_n_output_tokens(ctx, imgs.entries[0].get());
if (n_tokens_out != expected_n_tokens_out) {
LOG_ERR("%s: expected %d tokens, got %d\n", __func__, expected_n_tokens_out, n_tokens_out);
LOG_ERR("%s: expected output %d tokens, got %d\n", __func__, expected_n_tokens_out, n_tokens_out);
GGML_ABORT("Invalid number of output tokens");
}

@@ -3955,6 +3995,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
return ctx->vision_model.mm_3_w->ne[1];
case PROJECTOR_TYPE_LLAMA4:
return ctx->vision_model.mm_model_proj->ne[1];
case PROJECTOR_TYPE_QWEN2A:
return ctx->vision_model.mm_fc_w->ne[1];
default:
GGML_ABORT("Unknown projector type");
}
@@ -3991,6 +4033,10 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
return ctx->vision_model.hparams.has_audio;
}

bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
return ctx->proj_type == PROJECTOR_TYPE_ULTRAVOX || ctx->proj_type == PROJECTOR_TYPE_QWEN2A;
}

bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
clip_image_f32 clip_img;
clip_img.buf.resize(h * w * 3);
3 changes: 3 additions & 0 deletions tools/mtmd/clip.h
@@ -4,6 +4,8 @@
#include <stddef.h>
#include <stdint.h>

// !!! Internal header, to be used by mtmd only !!!

struct clip_ctx;

struct clip_image_size {
@@ -99,3 +101,4 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel

bool clip_has_vision_encoder(const struct clip_ctx * ctx);
bool clip_has_audio_encoder(const struct clip_ctx * ctx);
bool clip_has_whisper_encoder(const struct clip_ctx * ctx);