Skip to content
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
4fa0c27
convert ok, load ok
ngxson May 4, 2025
8b73116
warmup ok
ngxson May 4, 2025
4ac7940
test
ngxson May 4, 2025
4282465
still does not work?
ngxson May 4, 2025
45cdb7f
fix padding
ngxson May 4, 2025
f3605b9
temporary give up
ngxson May 4, 2025
1804fa2
Merge branch 'master' into xsn/mtmd_ultravox
ngxson May 18, 2025
bc708b4
fix merge conflict
ngxson May 18, 2025
de20afd
build_ultravox()
ngxson May 18, 2025
bbe4940
rm test
ngxson May 19, 2025
4d44460
Merge branch 'master' into xsn/mtmd_ultravox
ngxson May 19, 2025
8d7d75a
fix merge conflict
ngxson May 19, 2025
dce799d
add necessary mtmd APIs
ngxson May 19, 2025
f151854
first working version (only 4s of audio)
ngxson May 19, 2025
9a0dcb6
will this monster compile?
ngxson May 19, 2025
1a90395
fix compile
ngxson May 19, 2025
4a8c092
please compile
ngxson May 19, 2025
6f23ad1
fPIC
ngxson May 19, 2025
cf38b47
fix windows
ngxson May 19, 2025
cf4f5d2
various fixes
ngxson May 19, 2025
3bbb26b
clean up audio_helpers
ngxson May 20, 2025
3ce96d7
fix conversion
ngxson May 20, 2025
cf9613f
add some debug stuff
ngxson May 20, 2025
23d0d7f
long audio input ok
ngxson May 21, 2025
7033aa1
adapt the api
ngxson May 21, 2025
e7c8a2e
Merge branch 'master' into xsn/mtmd_ultravox
ngxson May 21, 2025
111c820
add --audio arg
ngxson May 21, 2025
e6416b0
final touch UX
ngxson May 21, 2025
36a1abb
add miniaudio to readme
ngxson May 21, 2025
544f4f1
fix typo
ngxson May 21, 2025
7602ee4
Merge branch 'master' into xsn/mtmd_ultravox
ngxson May 22, 2025
9afb3af
refactor kv metadata
ngxson May 22, 2025
107790a
mtmd_default_marker()
ngxson May 22, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 52 additions & 3 deletions convert_hf_to_gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -1129,15 +1129,22 @@ def __init__(self, *args, **kwargs):
# get n_embd of the text model
if "text_config" not in self.hparams:
self.hparams["text_config"] = {}
# TODO @ngxson : separate VisionModel and AudioModel
if "audio_config" not in self.hparams:
self.hparams["audio_config"] = {}
text_config = {**self.hparams, **self.hparams["text_config"]}
self.n_embd_text = text_config.get("hidden_size", text_config.get("n_embd", 0))
assert self.n_embd_text > 0, "n_embd not found in hparams"

if "vision_config" not in self.hparams:
raise ValueError("vision_config not found in hparams")
# move vision config to the top level, while preserving the original hparams in global_config
self.global_config = self.hparams
self.hparams = self.hparams["vision_config"]

if "vision_config" in self.hparams:
self.hparams = self.hparams["vision_config"]
elif "audio_config" in self.hparams:
self.hparams = self.hparams["audio_config"]
else:
raise ValueError("vision_config / audio_config not found in hparams")

self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"])
self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.CLIP_VISION, self.block_count)
Expand Down Expand Up @@ -5959,6 +5966,48 @@ def _reverse_hf_permute(data_torch, n_heads, hidden_dim):
return data_torch


# Text-model entry registered for the "UltravoxModel" architecture.
# It exists only to reject text conversion: the constructor always raises,
# directing the user to the --mmproj argument instead.
@ModelBase.register("UltravoxModel")
class UltravoxModel(TextModel):
# Placeholder architecture (marked "dummy" by the author); construction always raises below.
model_arch = gguf.MODEL_ARCH.LLAMA # dummy
def __init__(self, *args, **kwargs):
# The base-class initialiser runs first, so any error it raises surfaces before the one below.
super().__init__(*args, **kwargs)
raise NotImplementedError("Ultravox does not have text decoder. Please use --mmproj argument")


@ModelBase.register("UltravoxModel")
class UltravoxAudioModel(VisionModel):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.hparams["image_size"] = self.hparams["num_mel_bins"]
self.hparams["patch_size"] = self.hparams["num_mel_bins"]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are the image_size and patch_size used in the audio encoder?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is unused; I left it in from my first draft so that the warmup works. But yes, I should remove this.

self.hparams["hidden_size"] = self.hparams["d_model"]
self.hparams["intermediate_size"] = self.hparams["d_model"]
self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"]
self.preprocessor_config["image_mean"] = [0, 0, 0]
self.preprocessor_config["image_std"] = [0, 0, 0]

# Write the audio-specific metadata after the parent class has written its own.
def set_gguf_parameters(self):
super().set_gguf_parameters()
# Flag the file as containing an audio encoder and select the Ultravox projector type.
self.gguf_writer.add_bool(gguf.Keys.ClipVision.HAS_AUDIO_ENC, True)
self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.ULTRAVOX)
# Layer-norm epsilon falls back to 1e-5 when "layer_norm_eps" is absent from hparams.
self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5))
# "stack_factor" is read from global_config with no fallback, so it must be present
# (presumably global_config is the original top-level config dict -- confirm in the base class).
self.gguf_writer.add_uint32(gguf.Keys.ClipVision.Projector.STACK_FACTOR, self.global_config["stack_factor"])

# Per-tensor quantization override, decided from the source tensor name alone.
# Names containing both ".conv" and ".weight" are forced to F16; everything else returns False
# (presumably "no override" -- confirm against the base-class contract).
def tensor_force_quant(self, name, new_name, bid, n_dims):
del bid, new_name, n_dims # unused
# substring match, so any tensor whose name contains both fragments is affected
if ".conv" in name and ".weight" in name:
return gguf.GGMLQuantizationType.F16
return False

# Map one source tensor to its GGUF name, reshaping the conv biases on the way.
# Always returns a single-element list of (mapped name, tensor).
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused

if "conv1.bias" in name or "conv2.bias" in name:
# append a trailing singleton dimension to the conv1 / conv2 bias (this is an unsqueeze, not a transpose)
# NOTE(review): presumably needed so the bias matches the layout the runtime expects -- confirm
data_torch = data_torch.unsqueeze(-1)

return [(self.map_tensor_name(name), data_torch)]

###### CONVERSION LOGIC ######


Expand Down
54 changes: 54 additions & 0 deletions gguf-py/gguf/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,7 @@ class Adapter:
LORA_ALPHA = "adapter.lora.alpha"

class ClipVision:
HAS_AUDIO_ENC = "clip.has_audio_encoder"
PROJECTOR_TYPE = "clip.projector_type"
HAS_VISION_ENCODER = "clip.has_vision_encoder"
HAS_LLAVA_PROJECTOR = "clip.has_llava_projector"
Expand All @@ -242,6 +243,7 @@ class Attention:

class Projector:
SCALE_FACTOR = "clip.vision.projector.scale_factor"
STACK_FACTOR = "clip.audio.projector.stack_factor"

#
# recommended mapping of model tensor names for storage in gguf
Expand Down Expand Up @@ -514,6 +516,23 @@ class MODEL_TENSOR(IntEnum):
V_RESMPL_QUERY = auto() # minicpmv
V_TOK_EMBD_IMG_BREAK = auto() # pixtral
V_MM_PATCH_MERGER = auto() # mistral small 3.1
# audio (mtmd)
A_ENC_EMBD_POS = auto()
A_ENC_CONV1D = auto()
A_PRE_NORM = auto()
A_POST_NORM = auto()
A_ENC_ATTN_Q = auto()
A_ENC_ATTN_K = auto()
A_ENC_ATTN_V = auto()
A_ENC_INPUT_NORM = auto()
A_ENC_OUTPUT = auto()
A_ENC_OUTPUT_NORM = auto()
A_ENC_FFN_UP = auto()
A_ENC_FFN_GATE = auto()
A_ENC_FFN_DOWN = auto()
A_MMPROJ = auto()
A_MM_NORM_PRE = auto()
A_MM_NORM_MID = auto()


MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
Expand Down Expand Up @@ -776,6 +795,23 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.V_RESMPL_QUERY: "resampler.query",
MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK: "v.token_embd.img_break", # pixtral
MODEL_TENSOR.V_MM_PATCH_MERGER: "mm.patch_merger", # mistral small 3.1
# audio (mtmd)
MODEL_TENSOR.A_ENC_EMBD_POS: "a.position_embd",
MODEL_TENSOR.A_ENC_CONV1D: "a.conv1d.{bid}",
MODEL_TENSOR.A_PRE_NORM: "a.pre_ln",
MODEL_TENSOR.A_POST_NORM: "a.post_ln",
MODEL_TENSOR.A_ENC_ATTN_Q: "a.blk.{bid}.attn_q",
MODEL_TENSOR.A_ENC_ATTN_K: "a.blk.{bid}.attn_k",
MODEL_TENSOR.A_ENC_ATTN_V: "a.blk.{bid}.attn_v",
MODEL_TENSOR.A_ENC_INPUT_NORM: "a.blk.{bid}.ln1",
MODEL_TENSOR.A_ENC_OUTPUT: "a.blk.{bid}.attn_out",
MODEL_TENSOR.A_ENC_OUTPUT_NORM: "a.blk.{bid}.ln2",
MODEL_TENSOR.A_ENC_FFN_UP: "a.blk.{bid}.ffn_up",
MODEL_TENSOR.A_ENC_FFN_GATE: "a.blk.{bid}.ffn_gate",
MODEL_TENSOR.A_ENC_FFN_DOWN: "a.blk.{bid}.ffn_down",
MODEL_TENSOR.A_MMPROJ: "mm.a.mlp.{bid}",
MODEL_TENSOR.A_MM_NORM_PRE: "mm.a.norm_pre",
MODEL_TENSOR.A_MM_NORM_MID: "mm.a.norm_mid",
}

MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
Expand Down Expand Up @@ -819,6 +855,23 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.V_RESMPL_QUERY,
MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK,
MODEL_TENSOR.V_MM_PATCH_MERGER,
# audio
MODEL_TENSOR.A_ENC_EMBD_POS,
MODEL_TENSOR.A_ENC_CONV1D,
MODEL_TENSOR.A_PRE_NORM,
MODEL_TENSOR.A_POST_NORM,
MODEL_TENSOR.A_ENC_ATTN_Q,
MODEL_TENSOR.A_ENC_ATTN_K,
MODEL_TENSOR.A_ENC_ATTN_V,
MODEL_TENSOR.A_ENC_INPUT_NORM,
MODEL_TENSOR.A_ENC_OUTPUT,
MODEL_TENSOR.A_ENC_OUTPUT_NORM,
MODEL_TENSOR.A_ENC_FFN_UP,
MODEL_TENSOR.A_ENC_FFN_GATE,
MODEL_TENSOR.A_ENC_FFN_DOWN,
MODEL_TENSOR.A_MMPROJ,
MODEL_TENSOR.A_MM_NORM_PRE,
MODEL_TENSOR.A_MM_NORM_MID,
],
MODEL_ARCH.LLAMA: [
MODEL_TENSOR.TOKEN_EMBD,
Expand Down Expand Up @@ -2186,6 +2239,7 @@ class VisionProjectorType:
LLAMA4 = "llama4"
QWEN2VL = "qwen2vl_merger"
QWEN25VL = "qwen2.5vl_merger"
ULTRAVOX = "ultravox"
INTERNVL = "internvl"


Expand Down
62 changes: 62 additions & 0 deletions gguf-py/gguf/tensor_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -1110,6 +1110,68 @@ class TensorNameMap:
MODEL_TENSOR.V_MM_PATCH_MERGER: (
"multi_modal_projector.patch_merger.merging_layer", # mistral small 3.1
),

# audio (mtmd)

MODEL_TENSOR.A_ENC_EMBD_POS: (
"audio_tower.embed_positions", # ultravox
),

MODEL_TENSOR.A_ENC_CONV1D: (
"audio_tower.conv{bid}", # ultravox
),

MODEL_TENSOR.A_PRE_NORM: (),

MODEL_TENSOR.A_POST_NORM: (
"audio_tower.layer_norm", # ultravox
),

MODEL_TENSOR.A_ENC_ATTN_Q: (
"audio_tower.layers.{bid}.self_attn.q_proj", # ultravox
),

MODEL_TENSOR.A_ENC_ATTN_K: (
"audio_tower.layers.{bid}.self_attn.k_proj", # ultravox
),

MODEL_TENSOR.A_ENC_ATTN_V: (
"audio_tower.layers.{bid}.self_attn.v_proj", # ultravox
),

MODEL_TENSOR.A_ENC_INPUT_NORM: (
"audio_tower.layers.{bid}.self_attn_layer_norm", # ultravox
),

MODEL_TENSOR.A_ENC_OUTPUT: (
"audio_tower.layers.{bid}.self_attn.out_proj", # ultravox
),

MODEL_TENSOR.A_ENC_OUTPUT_NORM: (
"audio_tower.layers.{bid}.final_layer_norm", # ultravox
),

MODEL_TENSOR.A_ENC_FFN_UP: (
"audio_tower.layers.{bid}.fc1", # ultravox
),

MODEL_TENSOR.A_ENC_FFN_GATE: (),

MODEL_TENSOR.A_ENC_FFN_DOWN: (
"audio_tower.layers.{bid}.fc2", # ultravox
),

MODEL_TENSOR.A_MMPROJ: (
"multi_modal_projector.linear_{bid}", # ultravox
),

MODEL_TENSOR.A_MM_NORM_PRE: (
"multi_modal_projector.ln_pre", # ultravox
),

MODEL_TENSOR.A_MM_NORM_MID: (
"multi_modal_projector.ln_mid", # ultravox
),
}

# architecture-specific block mappings
Expand Down
11 changes: 9 additions & 2 deletions tools/mtmd/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# mtmd

# compile mtmd-audio separately to avoid long compile times
# (built as its own static library, then linked into mtmd / mtmd_shared further down)
# NOTE(review): mtmd_shared is a SHARED library; confirm position-independent code is enabled for this static target too
add_library(mtmd_audio STATIC mtmd-audio.cpp mtmd-audio.h)
target_link_libraries(mtmd_audio PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
# C++17 is requested for this target only
target_compile_features(mtmd_audio PRIVATE cxx_std_17)
# the current directory is added as an include path; PRIVATE, so it is not propagated to consumers
target_include_directories(mtmd_audio PRIVATE .)

add_library(mtmd OBJECT
mtmd.cpp
mtmd-helper.cpp
Expand All @@ -9,7 +15,7 @@ add_library(mtmd OBJECT
clip-impl.h
)

target_link_libraries(mtmd PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(mtmd PRIVATE ggml llama mtmd_audio ${CMAKE_THREAD_LIBS_INIT})

target_include_directories(mtmd PUBLIC .)
target_include_directories(mtmd PRIVATE ../..)
Expand All @@ -22,12 +28,13 @@ if (BUILD_SHARED_LIBS)
set_target_properties(mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(mtmd PRIVATE LLAMA_SHARED LLAMA_BUILD)
add_library(mtmd_shared SHARED $<TARGET_OBJECTS:mtmd>)
target_link_libraries(mtmd_shared PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(mtmd_shared PRIVATE ggml llama mtmd_audio ${CMAKE_THREAD_LIBS_INIT})
install(TARGETS mtmd_shared LIBRARY)
endif()

if (NOT MSVC)
target_compile_options(mtmd PRIVATE -Wno-cast-qual) # stb_image.h
target_compile_options(mtmd_audio PRIVATE -Wno-cast-qual) # miniaudio.h
endif()

if(TARGET BUILD_INFO)
Expand Down
22 changes: 20 additions & 2 deletions tools/mtmd/clip-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#define KEY_FTYPE "general.file_type"
#define KEY_NAME "general.name"
#define KEY_DESCRIPTION "general.description"
#define KEY_HAS_AUDIO_ENC "clip.has_audio_encoder"
#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
#define KEY_USE_GELU "clip.use_gelu"
#define KEY_USE_SILU "clip.use_silu"
Expand All @@ -40,12 +41,14 @@
#define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern"
#define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size"

#define KEY_PROJ_STACK_FACTOR "clip.audio.projector.stack_factor"


//
// tensor name constants
//

#define TN_POS_EMBD "v.position_embd.weight"
#define TN_POS_EMBD "%s.position_embd.weight"
#define TN_CLASS_EMBD "v.class_embd"
#define TN_PATCH_EMBD "v.patch_embd.weight" // not renamed with a ".0" postfix, for backward compat
#define TN_PATCH_EMBD_1 "v.patch_embd.weight.1"
Expand Down Expand Up @@ -95,6 +98,12 @@
#define TN_GLM_ADAPTER_GATE "adapter.linear.gate.%s"
#define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s"

// ultravox
// printf-style tensor-name templates for the audio encoder and its projector:
// %d takes an integer index and %s a string suffix (presumably "weight" / "bias" -- confirm at the call sites).
// The prefixes mirror the audio tensor names declared in gguf-py/gguf/constants.py
// ("a.conv1d.{bid}", "mm.a.mlp.{bid}", "mm.a.norm_pre", "mm.a.norm_mid").
#define TN_CONV1D "a.conv1d.%d.%s"
#define TN_MM_AUDIO_MLP "mm.a.mlp.%d.%s"
#define TN_MM_NORM_PRE "mm.a.norm_pre.%s"
#define TN_MM_NORM_MID "mm.a.norm_mid.%s"

// Round x up to the nearest multiple of n; for non-negative integers an
// already-aligned x is returned unchanged. n must be non-zero.
// Both arguments are expanded more than once, so pass side-effect-free expressions.
#define CLIP_ALIGN(x, n) \
    (((x) + (n) - 1) / (n) * (n))

Expand All @@ -110,6 +119,7 @@ enum projector_type {
PROJECTOR_TYPE_IDEFICS3,
PROJECTOR_TYPE_PIXTRAL,
PROJECTOR_TYPE_QWEN25VL,
PROJECTOR_TYPE_ULTRAVOX,
PROJECTOR_TYPE_INTERNVL,
PROJECTOR_TYPE_LLAMA4,
PROJECTOR_TYPE_UNKNOWN,
Expand All @@ -126,6 +136,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_GEMMA3, "gemma3"},
{ PROJECTOR_TYPE_IDEFICS3, "idefics3"},
{ PROJECTOR_TYPE_PIXTRAL, "pixtral"},
{ PROJECTOR_TYPE_ULTRAVOX, "ultravox"},
{ PROJECTOR_TYPE_INTERNVL, "internvl"},
{ PROJECTOR_TYPE_LLAMA4, "llama4"},
};
Expand All @@ -149,6 +160,7 @@ struct clip_image_u8 {

// RGB float32 image (NHWC)
// Memory layout: RGBRGBRGB...
// For audio, only one channel is used, buf.size() == nx*ny
struct clip_image_f32 {
int nx;
int ny;
Expand Down Expand Up @@ -242,14 +254,20 @@ struct clip_image_u8_batch {

struct clip_image_f32_batch {
std::vector<clip_image_f32_ptr> entries;
bool is_audio = false;

// for llava-uhd style models, we need to know the grid size
// note: entries.size() == grid_x * grid_y + 1 (one overview image)
int grid_x = 0;
int grid_y = 0;

clip_image_f32_batch clone() const {
clip_image_f32_batch new_batch;
clip_image_f32_batch new_batch{
/* entries */ {},
/* is_audio */ is_audio,
/* grid_x */ grid_x,
/* grid_y */ grid_y,
};
new_batch.entries.reserve(entries.size());
for (const auto & entry : entries) {
new_batch.entries.emplace_back(new clip_image_f32(*entry));
Expand Down
Loading
Loading