
Commit cd909c2: mtmd : add support for Voxtral

1 parent 2df255d

File tree: 8 files changed, +510 −20 lines

.gitignore

Lines changed: 1 addition & 0 deletions

```diff
@@ -82,6 +82,7 @@ models/*
 models-mnt
 !models/.editorconfig
 !models/ggml-vocab-*.gguf*
+!models/templates

 # Zig
 zig-out/
```

convert_hf_to_gguf.py

Lines changed: 73 additions & 3 deletions
```diff
@@ -2260,6 +2260,63 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
         return super().modify_tensors(data_torch, name, bid)


+@ModelBase.register("VoxtralForConditionalGeneration")
+class VoxtralModel(LlamaModel):
+    model_arch = gguf.MODEL_ARCH.LLAMA
+
+    def set_vocab(self):
+        vocab = gguf.vocab.MistralVocab(self.dir_model)
+        self.gguf_writer.add_tokenizer_model(vocab.gguf_tokenizer_model)
+
+        tokens = []
+        scores = []
+        toktypes = []
+
+        for text, score, toktype in vocab.all_tokens():
+            tokens.append(text)
+            scores.append(score)
+            toktypes.append(toktype)
+
+        assert len(tokens) == vocab.vocab_size, (
+            f"token count ({len(tokens)}) != vocab size ({vocab.vocab_size})"
+        )
+
+        if vocab.tokenizer_type == gguf.vocab.MistralTokenizerType.tekken:
+            self.gguf_writer.add_tokenizer_pre("tekken")
+            self.gguf_writer.add_token_merges(
+                vocab.extract_vocab_merges_from_model()
+            )
+
+        logger.info(
+            f"Setting bos, eos, unk and pad token IDs to {vocab.bos_id}, {vocab.eos_id}, {vocab.unk_id}, {vocab.pad_id}."
+        )
+
+        self.gguf_writer.add_bos_token_id(vocab.bos_id)
+        self.gguf_writer.add_eos_token_id(vocab.eos_id)
+        self.gguf_writer.add_unk_token_id(vocab.unk_id)
+        self.gguf_writer.add_pad_token_id(vocab.pad_id)
+
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_vocab_size(vocab.vocab_size)
+
+        self.gguf_writer.add_add_bos_token(True)
+        self.gguf_writer.add_add_eos_token(False)
+
+        script_dir = Path(__file__).parent
+        template_path = script_dir / "models/templates/unsloth-mistral-Devstral-Small-2507.jinja"
+        with open(template_path, "r", encoding="utf-8") as f:
+            template = f.read()
+        self.gguf_writer.add_chat_template(template)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
+        name = name.replace("language_model.", "")
+        if "multi_modal_projector" in name or "audio_tower" in name:
+            return []
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("DeciLMForCausalLM")
 class DeciModel(TextModel):
     model_arch = gguf.MODEL_ARCH.DECI
```
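A minimal sketch of what the new `modify_tensors` filter does to a combined Voxtral checkpoint: text-model tensors lose their `language_model.` prefix, while projector and audio-tower tensors are excluded from the text GGUF (they are exported by the mmproj converter further down). The example tensor names here are illustrative, not taken from an actual checkpoint:

```python
# Sketch of the VoxtralModel.modify_tensors routing above.
# The example tensor names are illustrative, not from a real checkpoint.

def route(name: str) -> str | None:
    """Return the renamed text-model tensor name, or None if it belongs to mmproj."""
    name = name.replace("language_model.", "")
    if "multi_modal_projector" in name or "audio_tower" in name:
        return None  # handled by the mmproj (audio encoder) converter instead
    return name

examples = [
    "language_model.model.layers.0.self_attn.q_proj.weight",  # -> kept, prefix stripped
    "multi_modal_projector.linear_1.weight",                  # -> dropped here
    "audio_tower.layers.0.fc1.weight",                        # -> dropped here
]
for n in examples:
    print(f"{n} -> {route(n)}")
```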
```diff
@@ -7231,9 +7288,10 @@ class WhisperEncoderModel(MmprojModel):

     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.hparams["hidden_size"] = self.hparams["d_model"]
-        self.hparams["intermediate_size"] = self.hparams["encoder_ffn_dim"]
-        self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"]
+        if "hidden_size" not in self.hparams and "intermediate_size" not in self.hparams:
+            self.hparams["hidden_size"] = self.hparams["d_model"]
+            self.hparams["intermediate_size"] = self.hparams["encoder_ffn_dim"]
+            self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"]

     def set_gguf_parameters(self):
         super().set_gguf_parameters()
```
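The new guard makes the remapping conditional, presumably because classic Whisper-style encoder configs name these fields `d_model` / `encoder_ffn_dim` / `encoder_attention_heads`, while Voxtral's audio config apparently already ships the `hidden_size`-style keys; remapping unconditionally would then raise a `KeyError`. A small sketch of that reading, with made-up values:

```python
# Illustrative hparam dicts; the values are made up for the example.
whisper_style = {"d_model": 1280, "encoder_ffn_dim": 5120, "encoder_attention_heads": 20}
voxtral_style = {"hidden_size": 1280, "intermediate_size": 5120, "num_attention_heads": 20}

def normalize(hparams: dict) -> dict:
    # Mirrors the guarded remapping in WhisperEncoderModel.__init__ above.
    if "hidden_size" not in hparams and "intermediate_size" not in hparams:
        hparams["hidden_size"] = hparams["d_model"]
        hparams["intermediate_size"] = hparams["encoder_ffn_dim"]
        hparams["num_attention_heads"] = hparams["encoder_attention_heads"]
    return hparams

assert normalize(whisper_style)["hidden_size"] == 1280  # remapped from d_model
assert normalize(voxtral_style)["hidden_size"] == 1280  # left untouched
```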
```diff
@@ -7272,9 +7330,21 @@ class UltravoxWhisperEncoderModel(WhisperEncoderModel):

     def set_gguf_parameters(self):
         super().set_gguf_parameters()
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.ULTRAVOX)
         self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"])


+@ModelBase.register("VoxtralForConditionalGeneration")
+class VoxtralWhisperEncoderModel(WhisperEncoderModel):
+    has_vision_encoder = False  # no vision encoder
+    has_audio_encoder = True
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.VOXTRAL)
+        self.gguf_writer.add_audio_stack_factor(4)  # == intermediate_size // hidden_size
+
+
 @ModelBase.register("FalconH1ForCausalLM")
 class FalconH1Model(Mamba2Model):
     model_arch = gguf.MODEL_ARCH.FALCON_H1
```
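The hard-coded stack factor of 4 (consistent with the `intermediate_size // hidden_size` comment, e.g. 5120 // 1280 for a Whisper-large-sized encoder) means the projector consumes groups of 4 consecutive encoder frames. In Ultravox-style stacking this is a reshape that folds the time axis into the feature axis; the sketch below shows that interpretation and is an assumption about how the value is consumed downstream, not code from this commit:

```python
import numpy as np

# Illustrative: stack_factor folds groups of consecutive encoder frames
# into the feature dimension before the projector (Ultravox-style stacking).
stack_factor = 4
T, d = 1500, 1280  # made-up encoder output: T frames of width d
frames = np.zeros((T, d), dtype=np.float32)

# Pad T up to a multiple of stack_factor, then fold time into features.
pad = (-T) % stack_factor
frames = np.pad(frames, ((0, pad), (0, 0)))
stacked = frames.reshape(-1, d * stack_factor)
print(stacked.shape)  # (375, 5120) == (T / 4, d * 4)
```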

gguf-py/gguf/constants.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -2704,6 +2704,7 @@ class VisionProjectorType:
     INTERNVL = "internvl"
     QWEN2A = "qwen2a" # audio
     QWEN25O = "qwen2.5o" # omni
+    VOXTRAL = "voxtral"


 # Items here are (block size, type size)
```
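For anyone inspecting the resulting mmproj file, the new projector type string lands in the GGUF metadata and can be read back with gguf-py's `GGUFReader`. A rough sketch; the file name and the `clip.projector_type` key are assumptions based on how other projector types are stored, so verify against your own file:

```python
from gguf import GGUFReader

# Rough sketch: read the projector type back out of an mmproj GGUF.
# "mmproj-voxtral.gguf" and the "clip.projector_type" key are assumptions;
# check your file's metadata to confirm.
reader = GGUFReader("mmproj-voxtral.gguf")
field = reader.get_field("clip.projector_type")
if field is not None:
    value = bytes(field.parts[field.data[0]]).decode("utf-8")
    print(value)  # expected: "voxtral"
```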
