Merged
Changes from all commits
25 commits
ca0ef2d
llama : clarify comment about pp and tg graphs [no ci] (#14895)
danbev Jul 27, 2025
bbfc849
SYCL: add ops doc (#14901)
qnixsynapse Jul 27, 2025
bf78f54
vulkan: add ops docs (#14900)
0cc4m Jul 27, 2025
7f97599
quantize : update README.md (#14905)
EAddario Jul 27, 2025
613c509
cmake : Indent ggml-config.cmake (ggml/1310)
dg0yt Jul 24, 2025
1f45f28
sync : ggml
ggerganov Jul 28, 2025
c35f9ea
ops : update Metal (#14912)
ggerganov Jul 28, 2025
a5771c9
ops : update BLAS (#14914)
ggerganov Jul 28, 2025
afc0e89
sycl: refactor quantization to q8_1 (#14815)
Alcpz Jul 28, 2025
6c6e397
model : add support for SmallThinker series (#14898)
wdl339 Jul 28, 2025
946b1f6
CUDA: fix pointer incrementation in FA (#14916)
JohannesGaessler Jul 28, 2025
00fa15f
mtmd : add support for Voxtral (#14862)
ngxson Jul 28, 2025
cd1fce6
SYCL: Add set_rows support for quantized types (#14883)
qnixsynapse Jul 28, 2025
db16e28
ggml-cpu : deduplicate scalar implementations (#14897)
xctan Jul 28, 2025
c556418
llama-bench : use local GPUs along with RPC servers (#14917)
rgerganov Jul 28, 2025
bda6219
test-backend-ops : extend test case filtering (#14865)
tlemo Jul 28, 2025
8ad7b3e
opencl : add ops docs (#14910)
lhez Jul 28, 2025
0a5036b
CUDA: add roll (#14919)
am17an Jul 29, 2025
bbd0f91
server-bench: make seed choice configurable (#14929)
JohannesGaessler Jul 29, 2025
138b288
cuda : add softcap fusion (#14907)
CISC Jul 29, 2025
204f2cf
CANN: Add ggml_set_rows (#14943)
hipudding Jul 29, 2025
1a67fcc
common : avoid logging partial messages (which can contain broken UTF…
kallewoof Jul 29, 2025
c7aa136
HIP: Ignore unsupported unroll transformation in fattn-vec (#14931)
IMbackK Jul 29, 2025
b77d111
HIP: add GGML_HIP_MMQ_MFMA option to allow disabling the MFMA path. …
IMbackK Jul 29, 2025
aa79524
HIP: remove the use of __HIP_PLATFORM_AMD__, explicitly support only …
IMbackK Jul 29, 2025
1 change: 1 addition & 0 deletions .gitignore
@@ -82,6 +82,7 @@ models/*
models-mnt
!models/.editorconfig
!models/ggml-vocab-*.gguf*
!models/templates

# Zig
zig-out/
4 changes: 3 additions & 1 deletion common/chat.cpp
@@ -1944,6 +1944,8 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co
}
}
auto msg = builder.result();
LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat<json>({msg}).at(0).dump().c_str());
if (!is_partial) {
LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat<json>({msg}).at(0).dump().c_str());
}
return msg;
}
158 changes: 153 additions & 5 deletions convert_hf_to_gguf.py
@@ -1900,6 +1900,7 @@ def prepare_tensors(self):
"MixtralForCausalLM",
"VLlama3ForCausalLM",
"LlavaForConditionalGeneration",
"VoxtralForConditionalGeneration",
"LlamaModel")
class LlamaModel(TextModel):
model_arch = gguf.MODEL_ARCH.LLAMA
@@ -1912,6 +1913,11 @@ def __init__(self, *args, **kwargs):
self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)

def set_vocab(self):
path_tekken_json = self.dir_model / "tekken.json"
path_tokenizer_json = self.dir_model / "tokenizer.json"
if path_tekken_json.is_file() and not path_tokenizer_json.is_file():
return self.set_vocab_tekken()

try:
self._set_vocab_sentencepiece()
except FileNotFoundError:
@@ -1944,6 +1950,52 @@ def set_vocab(self):
if self.hparams.get("vocab_size", 32000) == 49152:
self.gguf_writer.add_add_bos_token(False)

def set_vocab_tekken(self):
vocab = gguf.vocab.MistralVocab(self.dir_model)
self.gguf_writer.add_tokenizer_model(vocab.gguf_tokenizer_model)

tokens = []
scores = []
toktypes = []

for text, score, toktype in vocab.all_tokens():
tokens.append(text)
scores.append(score)
toktypes.append(toktype)

assert len(tokens) == vocab.vocab_size, (
f"token count ({len(tokens)}) != vocab size ({vocab.vocab_size})"
)

if vocab.tokenizer_type == gguf.vocab.MistralTokenizerType.tekken:
self.gguf_writer.add_tokenizer_pre("tekken")
self.gguf_writer.add_token_merges(
vocab.extract_vocab_merges_from_model()
)

logger.info(
f"Setting bos, eos, unk and pad token IDs to {vocab.bos_id}, {vocab.eos_id}, {vocab.unk_id}, {vocab.pad_id}."
)

self.gguf_writer.add_bos_token_id(vocab.bos_id)
self.gguf_writer.add_eos_token_id(vocab.eos_id)
self.gguf_writer.add_unk_token_id(vocab.unk_id)
self.gguf_writer.add_pad_token_id(vocab.pad_id)

self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)
self.gguf_writer.add_vocab_size(vocab.vocab_size)

self.gguf_writer.add_add_bos_token(True)
self.gguf_writer.add_add_eos_token(False)

script_dir = Path(__file__).parent
template_path = script_dir / "models/templates/unsloth-mistral-Devstral-Small-2507.jinja"
with open(template_path, "r", encoding="utf-8") as f:
template = f.read()
self.gguf_writer.add_chat_template(template)

def set_gguf_parameters(self):
super().set_gguf_parameters()
hparams = self.hparams
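For context, a minimal sketch of the tekken-vocab detection added above, assuming only a local model directory on disk; the helper name is hypothetical and not part of the converter:

```python
from pathlib import Path

def uses_tekken_vocab(model_dir: Path) -> bool:
    """Hypothetical helper mirroring the check in LlamaModel.set_vocab:
    Mistral-format checkpoints ship a tekken.json instead of a Hugging Face
    tokenizer.json, and only then is the tekken vocab path taken."""
    return (model_dir / "tekken.json").is_file() and not (model_dir / "tokenizer.json").is_file()
```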
@@ -1971,12 +2023,13 @@ def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
n_head = self.hparams["num_attention_heads"]
n_kv_head = self.hparams.get("num_key_value_heads")
is_vision_tensor = "vision_tower" in name \
is_multimodal_tensor = "vision_tower" in name \
or "vision_model" in name \
or "audio_tower" in name \
or "model.connector" in name \
or "multi_modal_projector" in name

if is_vision_tensor:
if is_multimodal_tensor:
return [] # skip vision and audio tensors
elif self.hf_arch == "LlamaModel":
name = "model." + name
@@ -7231,9 +7284,10 @@ class WhisperEncoderModel(MmprojModel):

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.hparams["hidden_size"] = self.hparams["d_model"]
self.hparams["intermediate_size"] = self.hparams["encoder_ffn_dim"]
self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"]
if "hidden_size" not in self.hparams and "intermediate_size" not in self.hparams:
self.hparams["hidden_size"] = self.hparams["d_model"]
self.hparams["intermediate_size"] = self.hparams["encoder_ffn_dim"]
self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"]

def set_gguf_parameters(self):
super().set_gguf_parameters()
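A small sketch of the guarded aliasing above, assuming a Whisper-style config dict; the standalone function and example values are illustrative only. The point of the change is that configs which already define `hidden_size` / `intermediate_size` are no longer overwritten.

```python
def alias_whisper_hparams(hparams: dict) -> dict:
    # Only map the Whisper-specific keys onto the generic names when the
    # generic keys are absent, leaving already-normalized configs untouched.
    if "hidden_size" not in hparams and "intermediate_size" not in hparams:
        hparams["hidden_size"] = hparams["d_model"]
        hparams["intermediate_size"] = hparams["encoder_ffn_dim"]
        hparams["num_attention_heads"] = hparams["encoder_attention_heads"]
    return hparams

# Assumed Whisper-large style values, not taken from this diff:
print(alias_whisper_hparams({"d_model": 1280, "encoder_ffn_dim": 5120, "encoder_attention_heads": 20}))
```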
@@ -7272,9 +7326,21 @@ class UltravoxWhisperEncoderModel(WhisperEncoderModel):

def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.ULTRAVOX)
self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"])


@ModelBase.register("VoxtralForConditionalGeneration")
class VoxtralWhisperEncoderModel(WhisperEncoderModel):
has_vision_encoder = False # no vision encoder
has_audio_encoder = True

def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.VOXTRAL)
self.gguf_writer.add_audio_stack_factor(4) # == intermediate_size // hidden_size


@ModelBase.register("FalconH1ForCausalLM")
class FalconH1Model(Mamba2Model):
model_arch = gguf.MODEL_ARCH.FALCON_H1
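A brief worked example for the hard-coded stack factor above; the encoder dimensions are assumed Whisper-large style values used for illustration, not read from the Voxtral config in this diff.

```python
# Assumed audio-encoder dims: hidden_size = d_model, intermediate_size =
# encoder_ffn_dim. Their ratio gives the value 4 written as the audio stack
# factor for Voxtral above.
hidden_size = 1280
intermediate_size = 5120
stack_factor = intermediate_size // hidden_size
assert stack_factor == 4
```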
@@ -7589,6 +7655,88 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
return [(self.map_tensor_name(name), data_torch)]


@ModelBase.register("SmallThinkerForCausalLM")
class SmallThinkerModel(TextModel):
model_arch = gguf.MODEL_ARCH.SMALLTHINKER

def set_gguf_parameters(self):
super().set_gguf_parameters()
if (n_experts := self.hparams.get("num_experts", self.hparams.get("moe_num_primary_experts"))) is not None:
self.gguf_writer.add_expert_count(n_experts)
if (n_experts_used := self.hparams.get("num_experts_per_tok", self.hparams.get("moe_num_active_primary_experts"))) is not None:
self.gguf_writer.add_expert_used_count(n_experts_used)
if (moe_intermediate_size := self.hparams.get("moe_ffn_hidden_size")) is not None:
self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
self.gguf_writer.add_feed_forward_length(moe_intermediate_size)
logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}")
if (self.hparams.get('moe_primary_router_apply_softmax')):
self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
else:
self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
# YaRN is not enabled by default
# To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts
rope_scaling = self.hparams.get("rope_scaling") or {}
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])

sliding_window_layout = self.hparams.get("sliding_window_layout")
if sliding_window_layout:
for i in sliding_window_layout:
if i != 0:
sliding_window = self.hparams.get("sliding_window_size")
if sliding_window:
self.gguf_writer.add_sliding_window(sliding_window)
break

_experts: list[dict[str, Tensor]] | None = None

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# process the experts separately
if name.find("experts") != -1:
n_experts = self.hparams.get("num_experts", self.hparams.get("moe_num_primary_experts"))
assert bid is not None

if self._experts is None:
self._experts = [{} for _ in range(self.block_count)]

self._experts[bid][name] = data_torch

if len(self._experts[bid]) >= n_experts * 3:
tensors: list[tuple[str, Tensor]] = []

# merge the experts into a single 3d tensor
for w_name in ["down", "gate", "up"]:
datas: list[Tensor] = []

for xid in range(n_experts):
ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight"
datas.append(self._experts[bid][ename])
del self._experts[bid][ename]

data_torch = torch.stack(datas, dim=0)

merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight"

new_name = self.map_tensor_name(merged_name)

tensors.append((new_name, data_torch))
return tensors
else:
return []

return [(self.map_tensor_name(name), data_torch)]

def prepare_tensors(self):
super().prepare_tensors()

if self._experts is not None:
# flatten `list[dict[str, Tensor]]` into `list[str]`
experts = [k for d in self._experts for k in d.keys()]
if len(experts) > 0:
raise ValueError(f"Unprocessed experts: {experts}")

###### CONVERSION LOGIC ######


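A self-contained sketch of the expert-merging step in SmallThinkerModel.modify_tensors above, with toy dimensions; it shows how the per-expert 2D weights are stacked into one 3D tensor of shape [n_experts, rows, cols] per projection.

```python
import torch

n_experts, rows, cols = 4, 8, 16
bid, w_name = 0, "down"

# Toy per-expert weights keyed the same way as in the converter.
experts = {
    f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight":
        torch.randn(rows, cols)
    for xid in range(n_experts)
}

merged = torch.stack(
    [experts[f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight"]
     for xid in range(n_experts)],
    dim=0,
)
assert merged.shape == (n_experts, rows, cols)
# The merged tensor is then renamed via map_tensor_name and written once per layer.
```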
3 changes: 3 additions & 0 deletions docs/multimodal.md
@@ -97,6 +97,9 @@ NOTE: some models may require large context window, for example: `-c 8192`
# Qwen2-Audio and SeaLLM-Audio
# note: no pre-quantized GGUF for this model, as the results are very poor
# ref: https://github.com/ggml-org/llama.cpp/pull/13760

# Mistral's Voxtral
(tool_name) -hf ggml-org/Voxtral-Mini-3B-2507-GGUF
```

**Mixed modalities**: