Skip to content

Commit 62695aa

Browse files
committed
(wip) convert ultravox-enc to gguf
1 parent 2004644 commit 62695aa

File tree

4 files changed

+116
-0
lines changed

4 files changed

+116
-0
lines changed

convert_hf_to_gguf.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5277,6 +5277,56 @@ def _reverse_hf_permute(data_torch, n_heads, hidden_dim):
52775277
return data_torch
52785278

52795279

5280+
@Model.register("UltravoxModel")
class UltravoxEncoderModel(Model):
    """Convert the (whisper-style) audio encoder of an Ultravox model to GGUF."""
    model_arch = gguf.MODEL_ARCH.ULTRAVOX_ENC

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Encoder hyperparameters live under the nested "audio_config" key of
        # the combined Ultravox config; the tensor-name map only needs the
        # encoder block count.
        audio_config = self.hparams["audio_config"]
        self.block_count = audio_config["encoder_layers"]
        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)

    def set_gguf_parameters(self):
        """Write the audio-encoder hyperparameters into the GGUF header."""
        audio_config = self.hparams["audio_config"]
        writer = self.gguf_writer
        writer.add_context_length(audio_config["max_source_positions"])
        writer.add_embedding_length(audio_config["d_model"])
        writer.add_feed_forward_length(audio_config["encoder_ffn_dim"])
        # No separate KV head count in the config: use the same value for both.
        writer.add_head_count(audio_config["encoder_attention_heads"])
        writer.add_head_count_kv(audio_config["encoder_attention_heads"])
        writer.add_layer_norm_eps(1e-5)  # default from whisper
        writer.add_block_count(audio_config["encoder_layers"])
        writer.add_n_mel_bins(audio_config["num_mel_bins"])
        # We only have encoder, so we will always use non-causal attention
        writer.add_causal_attention(False)

    def set_vocab(self):
        # An audio encoder carries no text vocabulary.
        self._set_vocab_none()

    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
        """Yield the precomputed mel filterbank as an extra model tensor."""
        # TODO: maybe we can generate these filters ourselves?
        from huggingface_hub import hf_hub_download
        mel_filters_path = hf_hub_download(
            repo_id="ggml-org/models",
            filename="mel_filters.npz",
        )
        with np.load(mel_filters_path) as f:
            yield ("mel_filters", torch.from_numpy(f["mel_128"]))

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        del bid  # unused
        # Rewrite HF tensor names into the llama-style names the tensor map expects.
        for src, dst in (
            ("audio_tower.layers.", "model.layers."),
            (".fc", ".mlp.fc"),
            (".self_attn_layer_norm", ".input_layernorm"),
            (".final_layer_norm", ".post_attention_layernorm"),
        ):
            name = name.replace(src, dst)

        # NOTE(review): conv biases are turned from 1-D into a (1, n) layout via
        # unsqueeze+transpose — presumably to match the ggml conv bias shape; confirm.
        if "conv1.bias" in name or "conv2.bias" in name:
            data_torch = data_torch.unsqueeze(-1).transpose(0, 1)

        return [(self.map_tensor_name(name), data_torch)]
5328+
5329+
52805330
###### CONVERSION LOGIC ######
52815331

52825332

gguf-py/gguf/constants.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,9 @@ class ConvNext:
173173
EMBEDDING_LENGTH = "{arch}.convnext.embedding_length"
174174
BLOCK_COUNT = "{arch}.convnext.block_count"
175175

176+
class Whisper:
177+
N_MEL_BINS = "{arch}.n_mel_bins"
178+
176179
class Tokenizer:
177180
MODEL = "tokenizer.ggml.model"
178181
PRE = "tokenizer.ggml.pre"
@@ -288,6 +291,7 @@ class MODEL_ARCH(IntEnum):
288291
WAVTOKENIZER_DEC = auto()
289292
PLM = auto()
290293
BAILINGMOE = auto()
294+
ULTRAVOX_ENC = auto()
291295

292296

293297
class MODEL_TENSOR(IntEnum):
@@ -427,6 +431,13 @@ class MODEL_TENSOR(IntEnum):
427431
POSNET_ATTN_K = auto()
428432
POSNET_ATTN_V = auto()
429433
POSNET_ATTN_OUT = auto()
434+
WHISPER_CONV1 = auto()
435+
WHISPER_CONV2 = auto()
436+
WHISPER_MEL_FILTERS = auto()
437+
MM_PROJ_MLP_1 = auto() # ultravox
438+
MM_PROJ_MLP_2 = auto() # ultravox
439+
MM_PROJ_NORM_MID = auto() # ultravox
440+
MM_PROJ_NORM_PRE = auto() # ultravox
430441

431442

432443
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@@ -492,6 +503,7 @@ class MODEL_TENSOR(IntEnum):
492503
MODEL_ARCH.WAVTOKENIZER_DEC: "wavtokenizer-dec",
493504
MODEL_ARCH.PLM: "plm",
494505
MODEL_ARCH.BAILINGMOE: "bailingmoe",
506+
MODEL_ARCH.ULTRAVOX_ENC: "ultravox-enc",
495507
}
496508

497509
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -631,6 +643,13 @@ class MODEL_TENSOR(IntEnum):
631643
MODEL_TENSOR.POSNET_ATTN_K: "posnet.{bid}.attn_k",
632644
MODEL_TENSOR.POSNET_ATTN_V: "posnet.{bid}.attn_v",
633645
MODEL_TENSOR.POSNET_ATTN_OUT: "posnet.{bid}.attn_output",
646+
MODEL_TENSOR.WHISPER_CONV1: "whisper.conv1",
647+
MODEL_TENSOR.WHISPER_CONV2: "whisper.conv2",
648+
MODEL_TENSOR.WHISPER_MEL_FILTERS: "whisper.mel_filters",
649+
MODEL_TENSOR.MM_PROJ_MLP_1: "mm.proj.mlp_1",
650+
MODEL_TENSOR.MM_PROJ_MLP_2: "mm.proj.mlp_2",
651+
MODEL_TENSOR.MM_PROJ_NORM_MID: "mm.proj.norm_mid",
652+
MODEL_TENSOR.MM_PROJ_NORM_PRE: "mm.proj.norm_pre",
634653
}
635654

636655
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@@ -1688,6 +1707,25 @@ class MODEL_TENSOR(IntEnum):
16881707
MODEL_TENSOR.FFN_DOWN_SHEXP,
16891708
MODEL_TENSOR.FFN_UP_SHEXP,
16901709
],
1710+
MODEL_ARCH.ULTRAVOX_ENC: [
1711+
MODEL_TENSOR.POS_EMBD,
1712+
MODEL_TENSOR.WHISPER_CONV1,
1713+
MODEL_TENSOR.WHISPER_CONV2,
1714+
MODEL_TENSOR.WHISPER_MEL_FILTERS,
1715+
MODEL_TENSOR.OUTPUT_NORM,
1716+
MODEL_TENSOR.ATTN_NORM,
1717+
MODEL_TENSOR.ATTN_K,
1718+
MODEL_TENSOR.ATTN_Q,
1719+
MODEL_TENSOR.ATTN_V,
1720+
MODEL_TENSOR.ATTN_OUT,
1721+
MODEL_TENSOR.FFN_NORM,
1722+
MODEL_TENSOR.FFN_UP,
1723+
MODEL_TENSOR.FFN_DOWN,
1724+
MODEL_TENSOR.MM_PROJ_MLP_1, # ultravox
1725+
MODEL_TENSOR.MM_PROJ_MLP_2, # ultravox
1726+
MODEL_TENSOR.MM_PROJ_NORM_MID, # ultravox
1727+
MODEL_TENSOR.MM_PROJ_NORM_PRE, # ultravox
1728+
],
16911729
# TODO
16921730
}
16931731

gguf-py/gguf/gguf_writer.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -887,6 +887,9 @@ def add_remove_extra_whitespaces(self, value: bool) -> None:
887887
def add_precompiled_charsmap(self, charsmap: Sequence[bytes]) -> None:
888888
self.add_array(Keys.Tokenizer.PRECOMPILED_CHARSMAP, charsmap)
889889

890+
def add_n_mel_bins(self, value: int) -> None:
    """Record the number of Mel filterbank bins used by the audio front-end."""
    key = Keys.Whisper.N_MEL_BINS
    self.add_uint32(key, value)
892+
890893
def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
891894
if not isinstance(value, str):
892895
template_default = None

gguf-py/gguf/tensor_mapping.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ class TensorNameMap:
5555
"transformer.wpe", # gpt2
5656
"embeddings.position_embeddings", # bert
5757
"wpe", # gpt2
58+
"audio_tower.embed_positions", # ultravox
5859
),
5960

6061
# Output
@@ -89,6 +90,7 @@ class TensorNameMap:
8990
"rwkv.ln_out", # rwkv6
9091
"model.ln_out", # rwkv7
9192
"backbone.final_layer_norm", # wavtokenizer
93+
"audio_tower.layer_norm", # ultravox
9294
),
9395

9496
# Rope frequencies
@@ -103,6 +105,28 @@ class TensorNameMap:
103105
MODEL_TENSOR.CONV1D: (
104106
"backbone.embed", # roberta
105107
),
108+
109+
MODEL_TENSOR.WHISPER_CONV1: (
110+
"audio_tower.conv1",
111+
),
112+
MODEL_TENSOR.WHISPER_CONV2: (
113+
"audio_tower.conv2",
114+
),
115+
MODEL_TENSOR.MM_PROJ_MLP_1: (
116+
"multi_modal_projector.linear_1", # ultravox
117+
),
118+
MODEL_TENSOR.MM_PROJ_MLP_2: (
119+
"multi_modal_projector.linear_2", # ultravox
120+
),
121+
MODEL_TENSOR.MM_PROJ_NORM_MID: (
122+
"multi_modal_projector.ln_mid", # ultravox
123+
),
124+
MODEL_TENSOR.MM_PROJ_NORM_PRE: (
125+
"multi_modal_projector.ln_pre", # ultravox
126+
),
127+
MODEL_TENSOR.WHISPER_MEL_FILTERS: (
128+
"mel_filters",
129+
),
106130
}
107131

108132
block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
@@ -206,6 +230,7 @@ class TensorNameMap:
206230
"transformer.h.{bid}.self_attention.dense", # falcon
207231
"h.{bid}.self_attention.dense", # bloom
208232
"model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe olmo2 phimoe
233+
"model.layers.{bid}.self_attn.out_proj" , # ultravox
209234
"model.layers.{bid}.self_attn.linear_attn", # deci
210235
"layers.{bid}.attention.wo", # llama-pth
211236
"encoder.layer.{bid}.attention.output.dense", # bert

0 commit comments

Comments (0)