Skip to content

Commit 4fa0c27

Browse files
committed
convert ok, load ok
1 parent 8ae5ebc commit 4fa0c27

File tree

5 files changed

+400
-26
lines changed

5 files changed

+400
-26
lines changed

convert_hf_to_gguf.py

Lines changed: 43 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1121,15 +1121,22 @@ def __init__(self, *args, **kwargs):
11211121
# get n_embd of the text model
11221122
if "text_config" not in self.hparams:
11231123
self.hparams["text_config"] = {}
1124+
# TODO @ngxson : separate VisionModel and AudioModel
1125+
if "audio_config" not in self.hparams:
1126+
self.hparams["audio_config"] = {}
11241127
text_config = {**self.hparams, **self.hparams["text_config"]}
11251128
self.n_embd_text = text_config.get("hidden_size", text_config.get("n_embd", 0))
11261129
assert self.n_embd_text > 0, "n_embd not found in hparams"
11271130

1128-
if "vision_config" not in self.hparams:
1129-
raise ValueError("vision_config not found in hparams")
11301131
# move vision config to the top level, while preserving the original hparams in global_config
11311132
self.global_config = self.hparams
1132-
self.hparams = self.hparams["vision_config"]
1133+
1134+
if "vision_config" in self.hparams:
1135+
self.hparams = self.hparams["vision_config"]
1136+
elif "audio_config" in self.hparams:
1137+
self.hparams = self.hparams["audio_config"]
1138+
else:
1139+
raise ValueError("vision_config / audio_config not found in hparams")
11331140

11341141
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"])
11351142
self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.CLIP_VISION, self.block_count)
@@ -5808,6 +5815,39 @@ def _reverse_hf_permute(data_torch, n_heads, hidden_dim):
58085815
return data_torch
58095816

58105817

5818+
@ModelBase.register("UltravoxModel")
class UltravoxModel(TextModel):
    """Guard registration for the Ultravox text side.

    Ultravox has no text decoder of its own, so converting it through the
    text-model path is always an error; construction aborts immediately and
    directs the user to the multimodal-projector (--mmproj) path instead.
    """

    # Placeholder architecture; conversion never proceeds past __init__.
    model_arch = gguf.MODEL_ARCH.LLAMA  # dummy

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        msg = "Ultravox does not have text decoder. Please use --mmproj argument"
        raise NotImplementedError(msg)
5824+
5825+
5826+
@ModelBase.register("UltravoxModel")
class UltravoxAudioModel(VisionModel):
    """Converter for the Ultravox audio encoder, reusing the CLIP-vision path.

    The audio tower exposes Whisper-style hyperparameter names (``d_model``,
    ``encoder_attention_heads``), which are remapped onto the vision-model
    names that the shared conversion machinery reads.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        hp = self.hparams
        # There is no image input: zero out the vision geometry fields the
        # VisionModel base class expects to be present.
        hp["image_size"] = 0
        hp["patch_size"] = 0
        # Remap audio-encoder hparams onto the vision-model key names.
        # NOTE(review): intermediate_size is set to d_model, not an FFN dim —
        # presumably matches how the loader consumes it; confirm upstream.
        hp["hidden_size"] = hp["d_model"]
        hp["intermediate_size"] = hp["d_model"]
        hp["num_attention_heads"] = hp["encoder_attention_heads"]
        # Neutral normalization stats; image mean/std are meaningless for audio.
        self.preprocessor_config["image_mean"] = [0, 0, 0]
        self.preprocessor_config["image_std"] = [0, 0, 0]

    def set_gguf_parameters(self):
        """Write audio-encoder metadata on top of the base vision parameters."""
        super().set_gguf_parameters()
        writer = self.gguf_writer
        writer.add_bool(gguf.Keys.ClipVision.HAS_AUDIO_ENC, True)
        writer.add_vision_projector_type(gguf.VisionProjectorType.ULTRAVOX)
        writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5))
        # stack_factor lives in the top-level (global) config, not audio_config.
        writer.add_uint32(gguf.Keys.ClipVision.Projector.STACK_FACTOR, self.global_config["stack_factor"])

    def tensor_force_quant(self, name, new_name, bid, n_dims):
        """Pin conv feature-extractor weights to F16; defer on everything else."""
        del bid, new_name, n_dims  # unused
        return gguf.GGMLQuantizationType.F16 if ".conv" in name else False
5850+
58115851
###### CONVERSION LOGIC ######
58125852

58135853

gguf-py/gguf/constants.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,7 @@ class Adapter:
220220
LORA_ALPHA = "adapter.lora.alpha"
221221

222222
class ClipVision:
223+
HAS_AUDIO_ENC = "clip.has_audio_encoder"
223224
PROJECTOR_TYPE = "clip.projector_type"
224225
HAS_VISION_ENCODER = "clip.has_vision_encoder"
225226
HAS_LLAVA_PROJECTOR = "clip.has_llava_projector"
@@ -242,6 +243,7 @@ class Attention:
242243

243244
class Projector:
244245
SCALE_FACTOR = "clip.vision.projector.scale_factor"
246+
STACK_FACTOR = "clip.audio.projector.stack_factor"
245247

246248
#
247249
# recommended mapping of model tensor names for storage in gguf
@@ -509,6 +511,23 @@ class MODEL_TENSOR(IntEnum):
509511
V_RESMPL_QUERY = auto() # minicpmv
510512
V_TOK_EMBD_IMG_BREAK = auto() # pixtral
511513
V_MM_PATCH_MERGER = auto() # mistral small 3.1
514+
# audio (mtmd)
515+
A_ENC_EMBD_POS = auto()
516+
A_ENC_CONV1D = auto()
517+
A_PRE_NORM = auto()
518+
A_POST_NORM = auto()
519+
A_ENC_ATTN_Q = auto()
520+
A_ENC_ATTN_K = auto()
521+
A_ENC_ATTN_V = auto()
522+
A_ENC_INPUT_NORM = auto()
523+
A_ENC_OUTPUT = auto()
524+
A_ENC_OUTPUT_NORM = auto()
525+
A_ENC_FFN_UP = auto()
526+
A_ENC_FFN_GATE = auto()
527+
A_ENC_FFN_DOWN = auto()
528+
A_MMPROJ = auto()
529+
A_MM_NORM_PRE = auto()
530+
A_MM_NORM_MID = auto()
512531

513532

514533
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@@ -766,6 +785,23 @@ class MODEL_TENSOR(IntEnum):
766785
MODEL_TENSOR.V_RESMPL_QUERY: "resampler.query",
767786
MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK: "v.token_embd.img_break", # pixtral
768787
MODEL_TENSOR.V_MM_PATCH_MERGER: "mm.patch_merger", # mistral small 3.1
788+
# audio (mtmd)
789+
MODEL_TENSOR.A_ENC_EMBD_POS: "a.position_embd",
790+
MODEL_TENSOR.A_ENC_CONV1D: "a.conv1d.{bid}",
791+
MODEL_TENSOR.A_PRE_NORM: "a.pre_ln",
792+
MODEL_TENSOR.A_POST_NORM: "a.post_ln",
793+
MODEL_TENSOR.A_ENC_ATTN_Q: "a.blk.{bid}.attn_q",
794+
MODEL_TENSOR.A_ENC_ATTN_K: "a.blk.{bid}.attn_k",
795+
MODEL_TENSOR.A_ENC_ATTN_V: "a.blk.{bid}.attn_v",
796+
MODEL_TENSOR.A_ENC_INPUT_NORM: "a.blk.{bid}.ln1",
797+
MODEL_TENSOR.A_ENC_OUTPUT: "a.blk.{bid}.attn_out",
798+
MODEL_TENSOR.A_ENC_OUTPUT_NORM: "a.blk.{bid}.ln2",
799+
MODEL_TENSOR.A_ENC_FFN_UP: "a.blk.{bid}.ffn_up",
800+
MODEL_TENSOR.A_ENC_FFN_GATE: "a.blk.{bid}.ffn_gate",
801+
MODEL_TENSOR.A_ENC_FFN_DOWN: "a.blk.{bid}.ffn_down",
802+
MODEL_TENSOR.A_MMPROJ: "mm.a.mlp.{bid}",
803+
MODEL_TENSOR.A_MM_NORM_PRE: "mm.a.norm_pre",
804+
MODEL_TENSOR.A_MM_NORM_MID: "mm.a.norm_mid",
769805
}
770806

771807
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@@ -804,6 +840,23 @@ class MODEL_TENSOR(IntEnum):
804840
MODEL_TENSOR.V_RESMPL_QUERY,
805841
MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK,
806842
MODEL_TENSOR.V_MM_PATCH_MERGER,
843+
# audio
844+
MODEL_TENSOR.A_ENC_EMBD_POS,
845+
MODEL_TENSOR.A_ENC_CONV1D,
846+
MODEL_TENSOR.A_PRE_NORM,
847+
MODEL_TENSOR.A_POST_NORM,
848+
MODEL_TENSOR.A_ENC_ATTN_Q,
849+
MODEL_TENSOR.A_ENC_ATTN_K,
850+
MODEL_TENSOR.A_ENC_ATTN_V,
851+
MODEL_TENSOR.A_ENC_INPUT_NORM,
852+
MODEL_TENSOR.A_ENC_OUTPUT,
853+
MODEL_TENSOR.A_ENC_OUTPUT_NORM,
854+
MODEL_TENSOR.A_ENC_FFN_UP,
855+
MODEL_TENSOR.A_ENC_FFN_GATE,
856+
MODEL_TENSOR.A_ENC_FFN_DOWN,
857+
MODEL_TENSOR.A_MMPROJ,
858+
MODEL_TENSOR.A_MM_NORM_PRE,
859+
MODEL_TENSOR.A_MM_NORM_MID,
807860
],
808861
MODEL_ARCH.LLAMA: [
809862
MODEL_TENSOR.TOKEN_EMBD,
@@ -2167,6 +2220,7 @@ class VisionProjectorType:
21672220
PIXTRAL = "pixtral"
21682221
QWEN2VL = "qwen2vl_merger"
21692222
QWEN25VL = "qwen2.5vl_merger"
2223+
ULTRAVOX = "ultravox"
21702224

21712225

21722226
# Items here are (block size, type size)

gguf-py/gguf/tensor_mapping.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1077,6 +1077,68 @@ class TensorNameMap:
10771077
MODEL_TENSOR.V_MM_PATCH_MERGER: (
10781078
"multi_modal_projector.patch_merger.merging_layer", # mistral small 3.1
10791079
),
1080+
1081+
# audio (mtmd)
1082+
1083+
MODEL_TENSOR.A_ENC_EMBD_POS: (
1084+
"audio_tower.embed_positions", # ultravox
1085+
),
1086+
1087+
MODEL_TENSOR.A_ENC_CONV1D: (
1088+
"audio_tower.conv{bid}", # ultravox
1089+
),
1090+
1091+
MODEL_TENSOR.A_PRE_NORM: (
1092+
"audio_tower.layer_norm", # ultravox
1093+
),
1094+
1095+
MODEL_TENSOR.A_POST_NORM: (),
1096+
1097+
MODEL_TENSOR.A_ENC_ATTN_Q: (
1098+
"audio_tower.layers.{bid}.self_attn.q_proj", # ultravox
1099+
),
1100+
1101+
MODEL_TENSOR.A_ENC_ATTN_K: (
1102+
"audio_tower.layers.{bid}.self_attn.k_proj", # ultravox
1103+
),
1104+
1105+
MODEL_TENSOR.A_ENC_ATTN_V: (
1106+
"audio_tower.layers.{bid}.self_attn.v_proj", # ultravox
1107+
),
1108+
1109+
MODEL_TENSOR.A_ENC_INPUT_NORM: (
1110+
"audio_tower.layers.{bid}.self_attn_layer_norm", # ultravox
1111+
),
1112+
1113+
MODEL_TENSOR.A_ENC_OUTPUT: (
1114+
"audio_tower.layers.{bid}.self_attn.out_proj", # ultravox
1115+
),
1116+
1117+
MODEL_TENSOR.A_ENC_OUTPUT_NORM: (
1118+
"audio_tower.layers.{bid}.final_layer_norm", # ultravox
1119+
),
1120+
1121+
MODEL_TENSOR.A_ENC_FFN_UP: (
1122+
"audio_tower.layers.{bid}.fc1", # ultravox
1123+
),
1124+
1125+
MODEL_TENSOR.A_ENC_FFN_GATE: (),
1126+
1127+
MODEL_TENSOR.A_ENC_FFN_DOWN: (
1128+
"audio_tower.layers.{bid}.fc2", # ultravox
1129+
),
1130+
1131+
MODEL_TENSOR.A_MMPROJ: (
1132+
"multi_modal_projector.linear_{bid}", # ultravox
1133+
),
1134+
1135+
MODEL_TENSOR.A_MM_NORM_PRE: (
1136+
"multi_modal_projector.ln_pre", # ultravox
1137+
),
1138+
1139+
MODEL_TENSOR.A_MM_NORM_MID: (
1140+
"multi_modal_projector.ln_mid", # ultravox
1141+
),
10801142
}
10811143

10821144
# architecture-specific block mappings

tools/llava/clip-impl.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#define KEY_FTYPE "general.file_type"
1616
#define KEY_NAME "general.name"
1717
#define KEY_DESCRIPTION "general.description"
18+
#define KEY_HAS_AUDIO_ENC "clip.has_audio_encoder"
1819
#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
1920
#define KEY_USE_GELU "clip.use_gelu"
2021
#define KEY_USE_SILU "clip.use_silu"
@@ -42,6 +43,8 @@
4243
#define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern"
4344
#define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size"
4445

46+
#define KEY_PROJ_STACK_FACTOR "clip.audio.projector.stack_factor"
47+
4548

4649
//
4750
// tensor name constants
@@ -93,6 +96,12 @@
9396
#define TN_GLM_ADAPTER_GATE "adapter.linear.gate.%s"
9497
#define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s"
9598

99+
// ultravox
100+
#define TN_CONV1D "a.conv1d.%d.%s"
101+
#define TN_MM_AUDIO_MLP "mm.a.mlp.%d.%s"
102+
#define TN_MM_NORM_PRE "mm.a.norm_pre.%s"
103+
#define TN_MM_NORM_MID "mm.a.norm_mid.%s"
104+
96105
enum projector_type {
97106
PROJECTOR_TYPE_MLP,
98107
PROJECTOR_TYPE_MLP_NORM,
@@ -105,6 +114,7 @@ enum projector_type {
105114
PROJECTOR_TYPE_IDEFICS3,
106115
PROJECTOR_TYPE_PIXTRAL,
107116
PROJECTOR_TYPE_QWEN25VL,
117+
PROJECTOR_TYPE_ULTRAVOX,
108118
PROJECTOR_TYPE_UNKNOWN,
109119
};
110120

@@ -119,6 +129,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
119129
{ PROJECTOR_TYPE_GEMMA3, "gemma3"},
120130
{ PROJECTOR_TYPE_IDEFICS3, "idefics3"},
121131
{ PROJECTOR_TYPE_PIXTRAL, "pixtral"},
132+
{ PROJECTOR_TYPE_ULTRAVOX, "ultravox"},
122133
};
123134

124135
static projector_type clip_projector_type_from_string(const std::string & str) {

0 commit comments

Comments
 (0)