
Commit 3d302b8

Merge branch 'ggml-org:master' into mradermacher
2 parents b2eb613 + 00fa15f

File tree

15 files changed: +554 −56 lines


.gitignore

Lines changed: 1 addition & 0 deletions
@@ -82,6 +82,7 @@ models/*
 models-mnt
 !models/.editorconfig
 !models/ggml-vocab-*.gguf*
+!models/templates
 
 # Zig
 zig-out/

convert_hf_to_gguf.py

Lines changed: 71 additions & 5 deletions
@@ -1900,6 +1900,7 @@ def prepare_tensors(self):
     "MixtralForCausalLM",
     "VLlama3ForCausalLM",
     "LlavaForConditionalGeneration",
+    "VoxtralForConditionalGeneration",
     "LlamaModel")
 class LlamaModel(TextModel):
     model_arch = gguf.MODEL_ARCH.LLAMA
@@ -1912,6 +1913,11 @@ def __init__(self, *args, **kwargs):
         self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)
 
     def set_vocab(self):
+        path_tekken_json = self.dir_model / "tekken.json"
+        path_tokenizer_json = self.dir_model / "tokenizer.json"
+        if path_tekken_json.is_file() and not path_tokenizer_json.is_file():
+            return self.set_vocab_tekken()
+
         try:
             self._set_vocab_sentencepiece()
         except FileNotFoundError:
@@ -1944,6 +1950,52 @@ def set_vocab(self):
         if self.hparams.get("vocab_size", 32000) == 49152:
             self.gguf_writer.add_add_bos_token(False)
 
+    def set_vocab_tekken(self):
+        vocab = gguf.vocab.MistralVocab(self.dir_model)
+        self.gguf_writer.add_tokenizer_model(vocab.gguf_tokenizer_model)
+
+        tokens = []
+        scores = []
+        toktypes = []
+
+        for text, score, toktype in vocab.all_tokens():
+            tokens.append(text)
+            scores.append(score)
+            toktypes.append(toktype)
+
+        assert len(tokens) == vocab.vocab_size, (
+            f"token count ({len(tokens)}) != vocab size ({vocab.vocab_size})"
+        )
+
+        if vocab.tokenizer_type == gguf.vocab.MistralTokenizerType.tekken:
+            self.gguf_writer.add_tokenizer_pre("tekken")
+            self.gguf_writer.add_token_merges(
+                vocab.extract_vocab_merges_from_model()
+            )
+
+        logger.info(
+            f"Setting bos, eos, unk and pad token IDs to {vocab.bos_id}, {vocab.eos_id}, {vocab.unk_id}, {vocab.pad_id}."
+        )
+
+        self.gguf_writer.add_bos_token_id(vocab.bos_id)
+        self.gguf_writer.add_eos_token_id(vocab.eos_id)
+        self.gguf_writer.add_unk_token_id(vocab.unk_id)
+        self.gguf_writer.add_pad_token_id(vocab.pad_id)
+
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_vocab_size(vocab.vocab_size)
+
+        self.gguf_writer.add_add_bos_token(True)
+        self.gguf_writer.add_add_eos_token(False)
+
+        script_dir = Path(__file__).parent
+        template_path = script_dir / "models/templates/unsloth-mistral-Devstral-Small-2507.jinja"
+        with open(template_path, "r", encoding="utf-8") as f:
+            template = f.read()
+        self.gguf_writer.add_chat_template(template)
+
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
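Two details worth noting in set_vocab_tekken: the tekken path is taken only when tekken.json exists and tokenizer.json does not (see the set_vocab hunk above), and because such checkpoints ship no chat template of their own, the converter embeds one from models/templates, the directory the .gitignore hunk above stops ignoring. A minimal sketch for sanity-checking a template before it is embedded, assuming jinja2 is installed and the template consumes the usual messages/bos_token variables (adjust to the template at hand):

    import jinja2

    with open("models/templates/unsloth-mistral-Devstral-Small-2507.jinja", encoding="utf-8") as f:
        template = jinja2.Template(f.read())

    # Render a toy conversation; a broken template raises here rather than at inference time.
    print(template.render(
        messages=[{"role": "user", "content": "hello"}],
        bos_token="<s>",
        eos_token="</s>",
    ))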
@@ -1971,12 +2023,13 @@ def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         n_head = self.hparams["num_attention_heads"]
         n_kv_head = self.hparams.get("num_key_value_heads")
-        is_vision_tensor = "vision_tower" in name \
+        is_multimodal_tensor = "vision_tower" in name \
             or "vision_model" in name \
+            or "audio_tower" in name \
             or "model.connector" in name \
             or "multi_modal_projector" in name
 
-        if is_vision_tensor:
+        if is_multimodal_tensor:
             return [] # skip vision tensors
         elif self.hf_arch == "LlamaModel":
             name = "model." + name
@@ -7231,9 +7284,10 @@ class WhisperEncoderModel(MmprojModel):
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.hparams["hidden_size"] = self.hparams["d_model"]
-        self.hparams["intermediate_size"] = self.hparams["encoder_ffn_dim"]
-        self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"]
+        if "hidden_size" not in self.hparams and "intermediate_size" not in self.hparams:
+            self.hparams["hidden_size"] = self.hparams["d_model"]
+            self.hparams["intermediate_size"] = self.hparams["encoder_ffn_dim"]
+            self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"]
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
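The new guard makes the d_model/encoder_ffn_dim renaming conditional: vanilla Whisper configs carry the encoder-style keys, while configs that already use the generic hidden_size/intermediate_size names (as the Voxtral checkpoint added in this commit presumably does) are left untouched. A standalone illustration of the same mapping on plain dicts:

    def normalize(hparams: dict) -> dict:
        # Map Whisper-style keys to the generic names only when they are absent.
        if "hidden_size" not in hparams and "intermediate_size" not in hparams:
            hparams["hidden_size"] = hparams["d_model"]
            hparams["intermediate_size"] = hparams["encoder_ffn_dim"]
            hparams["num_attention_heads"] = hparams["encoder_attention_heads"]
        return hparams

    # Whisper-style config gets mapped...
    whisper_style = {"d_model": 1280, "encoder_ffn_dim": 5120, "encoder_attention_heads": 20}
    assert normalize(whisper_style)["hidden_size"] == 1280
    # ...while a config that already has the generic keys is left alone.
    generic = {"hidden_size": 1280, "intermediate_size": 5120}
    assert normalize(generic) == {"hidden_size": 1280, "intermediate_size": 5120}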
@@ -7272,9 +7326,21 @@ class UltravoxWhisperEncoderModel(WhisperEncoderModel):
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.ULTRAVOX)
         self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"])
 
 
+@ModelBase.register("VoxtralForConditionalGeneration")
+class VoxtralWhisperEncoderModel(WhisperEncoderModel):
+    has_vision_encoder = False # no vision encoder
+    has_audio_encoder = True
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.VOXTRAL)
+        self.gguf_writer.add_audio_stack_factor(4) # == intermediate_size // hidden_size
+
+
 @ModelBase.register("FalconH1ForCausalLM")
 class FalconH1Model(Mamba2Model):
     model_arch = gguf.MODEL_ARCH.FALCON_H1
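The hard-coded stack factor of 4 is annotated as intermediate_size // hidden_size; for a Whisper-large-style encoder that would be 5120 // 1280 (those dimensions are an assumption here, not taken from this diff; verify against the checkpoint's config.json):

    hidden_size = 1280        # assumed Whisper-large-style d_model
    intermediate_size = 5120  # assumed encoder_ffn_dim
    assert intermediate_size // hidden_size == 4  # matches add_audio_stack_factor(4)

With these registrations in place, conversion should follow the usual flow, e.g. python convert_hf_to_gguf.py /path/to/Voxtral-Mini-3B-2507 --outtype f16 for the text model, plus a separate pass with --mmproj for the audio encoder (flag names taken from the current converter; double-check with --help).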

docs/multimodal.md

Lines changed: 3 additions & 0 deletions
@@ -97,6 +97,9 @@ NOTE: some models may require large context window, for example: `-c 8192`
 # Qwen2-Audio and SeaLLM-Audio
 # note: no pre-quantized GGUF this model, as they have very poor result
 # ref: https://github.com/ggml-org/llama.cpp/pull/13760
+
+# Mistral's Voxtral
+(tool_name) -hf ggml-org/Voxtral-Mini-3B-2507-GGUF
 ```
 
 **Mixed modalities**:
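As elsewhere in multimodal.md, (tool_name) stands in for the multimodal-capable binaries such as llama-mtmd-cli or llama-server; a concrete call might look like llama-mtmd-cli -hf ggml-org/Voxtral-Mini-3B-2507-GGUF --audio sample.wav -p "describe the audio" (the --audio flag is assumed from the existing audio examples in that document; consult the tool's --help).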

ggml/src/ggml-cuda/fattn-vec-f16.cuh

Lines changed: 4 additions & 5 deletions
@@ -174,7 +174,10 @@ static __global__ void flash_attn_vec_ext_f16(
     K += blockIdx.y*D * nb11;
     V += blockIdx.y*D * nb21;
     maskh += blockIdx.y*D;
-    for (int k_VKQ_0 = blockIdx.y*D; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*D) {
+    for (int k_VKQ_0 = blockIdx.y*D; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*D,
+            // Increment pointers after each loop:
+            K += gridDim.y*D*nb11, V += gridDim.y*D*nb21, maskh += gridDim.y*D) {
+
         // Calculate KQ tile and keep track of new maximum KQ values:
 
         if (mask) {
@@ -291,10 +294,6 @@ static __global__ void flash_attn_vec_ext_f16(
             }
         }
 
-        K += gridDim.y*D * nb11;
-        V += gridDim.y*D * nb21;
-        maskh += gridDim.y*D;
-
         __syncthreads();
     }

ggml/src/ggml-cuda/fattn-vec-f32.cuh

Lines changed: 4 additions & 5 deletions
@@ -180,7 +180,10 @@ static __global__ void flash_attn_vec_ext_f32(
     K += blockIdx.y*D * nb11;
     V += blockIdx.y*D * nb21;
     maskh += blockIdx.y*D;
-    for (int k_VKQ_0 = blockIdx.y*D; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*D) {
+    for (int k_VKQ_0 = blockIdx.y*D; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*D,
+            // Increment pointers after each loop:
+            K += gridDim.y*D*nb11, V += gridDim.y*D*nb21, maskh += gridDim.y*D) {
+
         // Calculate KQ tile and keep track of new maximum KQ values:
 
         if (mask) {
@@ -286,10 +289,6 @@ static __global__ void flash_attn_vec_ext_f32(
             }
         }
 
-        K += gridDim.y*D * nb11;
-        V += gridDim.y*D * nb21;
-        maskh += gridDim.y*D;
-
        __syncthreads();
    }
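Both CUDA hunks are the same refactor: the K/V/mask pointer advances that used to sit at the bottom of the loop body now ride along in the for-statement's increment expression via the comma operator, so they run exactly once per iteration, in step with k_VKQ_0, and cannot be skipped by any early path through the body. Behavior is unchanged. A standalone sketch of the pattern (illustration only, not the kernel):

    #include <cstdio>

    int main() {
        const int stride = 4, n = 16;
        float data[16] = {};
        const float *K = data;
        // The comma operator bundles the pointer update with the index update,
        // so K always points at element k when the body runs.
        for (int k = 0; k < n; k += stride, K += stride) {
            std::printf("k=%d K-data=%td\n", k, K - data);
        }
        return 0;
    }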

gguf-py/gguf/constants.py

Lines changed: 1 addition & 0 deletions
@@ -2724,6 +2724,7 @@ class VisionProjectorType:
     INTERNVL = "internvl"
     QWEN2A = "qwen2a" # audio
     QWEN25O = "qwen2.5o" # omni
+    VOXTRAL = "voxtral"
 
 
 # Items here are (block size, type size)
