Merged
29 commits
5f32e9e
CANN: ROPE cache sin/cos repeat (#15501)
noemotiovon Aug 25, 2025
794deb5
convert : support interns1-mini (#15412)
RunningLeon Aug 25, 2025
5636357
metal : add FA kernels for HS=40 (#15559)
ggerganov Aug 25, 2025
aad6558
convert : update Ernie 4.5 dense architecture name (#15555)
ownia Aug 25, 2025
dd7bb1b
batched-bench : fix unified KV cache handling + pp timing (#15562)
ggerganov Aug 25, 2025
e556439
model-conversion : add model card template for embeddings [no ci] (#1…
danbev Aug 25, 2025
1436e07
model-conversion : set pooling type to none in logits.cpp (#15564)
danbev Aug 25, 2025
6e3f6c1
CUDA: MoE helper in device code, better tile sizes (#15525)
JohannesGaessler Aug 25, 2025
83a1a80
metal: fix regression when no metal devices are present (#15531)
booxter Aug 25, 2025
7addd18
tests: Generate unique input values for count_equal (#15487)
jeffbolznv Aug 25, 2025
363b0fc
vulkan: fix min subgroup 16 condition for mmid subgroup optimization …
0cc4m Aug 25, 2025
f2cf572
opencl: fix support ops condition for `rms_norm` (#15560)
lhez Aug 25, 2025
72d1cb0
CUDA: Accelerate MXFP4 table lookup using `__byte_perm` (#15451)
Qeeweew Aug 25, 2025
2e9f3f6
vulkan: Remove splitting for mul_mat_id (#15568)
jeffbolznv Aug 26, 2025
830e2f4
Add a warning for special devices (#15563)
pt13762104 Aug 26, 2025
ac05165
metal : remove contiguous assertion for src0 in IM2COL (#15577)
CISC Aug 26, 2025
384c7df
gguf-py : remove erroneous FFN_GATE entry (#15583)
CISC Aug 26, 2025
b2b0b30
model : support MiniCPM-V 4.5 (#15575)
tc-mb Aug 26, 2025
454e379
metal : improve `MUL_MAT_ID` (#15541)
ggerganov Aug 26, 2025
88e1081
context : print graph stats for memory-less contexts (#15586)
ggerganov Aug 26, 2025
d978dc7
mtmd : support Kimi VL model (#15458)
ngxson Aug 26, 2025
7068477
metal : optimize FA vec for large sequences and BS <= 8 (#15566)
ggerganov Aug 26, 2025
5707c65
CUDA: return -1 for nonexistent compiled arch (#15587)
JohannesGaessler Aug 26, 2025
94b2d97
model-conversion : add qat-q4 quantization targets (#15588)
danbev Aug 26, 2025
de85c30
graph : fix assert in memory-less build_attn (#15590)
ggerganov Aug 26, 2025
c2d7cfd
llamafile: PowerPC Sgemm Optimization (#15558)
shalinib-ibm Aug 26, 2025
6eebd96
tests: add performance test for mul mat id (#15543)
netrunnereve Aug 26, 2025
524254e
mtmd : fix mtmd ios build (#15579)
fidoriel Aug 26, 2025
ed349df
SYCL: fix rms_norm_mul_add for tensor dim not a multiple of sg_size (…
qnixsynapse Aug 26, 2025
180 changes: 108 additions & 72 deletions convert_hf_to_gguf.py
@@ -1216,6 +1216,55 @@ def _try_set_pooling_type(self) -> None:
raise NotImplementedError("Only MEAN, CLS, and LAST pooling types supported")
self.gguf_writer.add_pooling_type(pooling_type)

def _set_vocab_interns1(self):
tokens: list[str] = []
toktypes: list[int] = []

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab())
vocab_size = self.hparams.get("vocab_size", len(vocab))
assert max(vocab.values()) < vocab_size

tokpre = self.get_vocab_base_pre(tokenizer)

reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()}
added_vocab = tokenizer.get_added_vocab()

added_tokens_decoder = tokenizer.added_tokens_decoder

for i in range(vocab_size):
if i not in reverse_vocab:
tokens.append(f"[PAD{i}]")
toktypes.append(gguf.TokenType.UNUSED)
else:
token: str = reverse_vocab[i]
if token in added_vocab:
# The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
# To avoid unexpected issues - we make sure to normalize non-normalized tokens
if not added_tokens_decoder[i].normalized:
previous_token = token
token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
if previous_token != token:
logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")

if added_tokens_decoder[i].special or self.does_token_look_special(token):
toktypes.append(gguf.TokenType.CONTROL)
else:
toktypes.append(gguf.TokenType.USER_DEFINED)
else:
toktypes.append(gguf.TokenType.NORMAL)
tokens.append(token)

self.gguf_writer.add_tokenizer_model("gpt2")
self.gguf_writer.add_tokenizer_pre(tokpre)
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)

special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
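# 151643 is the Qwen-style <|endoftext|> token, which these checkpoints reuse as BOS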
special_vocab._set_special_token("bos", 151643)
special_vocab.add_to_gguf(self.gguf_writer)


class MmprojModel(ModelBase):
model_type = ModelType.MMPROJ
@@ -2932,7 +2981,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
if "language_model." in name:
name = name.replace("language_model.", "") # for InternVL
if name.startswith("mlp") or name.startswith("multi_modal_projector") \
or name.startswith("vision_model") or name.startswith("audio_tower"):
or name.startswith("vision_model") or name.startswith("audio_tower") \
or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector"):
# skip vision and audio tensors
return []
yield from super().modify_tensors(data_torch, name, bid)
@@ -3109,7 +3159,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
yield from super().modify_tensors(data_torch, name, bid)


@ModelBase.register("Ernie4_5_ForCausalLM")
@ModelBase.register("Ernie4_5_ForCausalLM", "Ernie4_5ForCausalLM")
class Ernie4_5Model(TextModel):
model_arch = gguf.MODEL_ARCH.ERNIE4_5

@@ -3604,6 +3654,19 @@ def prepare_tensors(self):
class Qwen3Model(Qwen2Model):
model_arch = gguf.MODEL_ARCH.QWEN3

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
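# re-read the original HF config to record the checkpoint's architecture name;
# set_vocab() below uses it to special-case Intern-S1-mini checkpoints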
hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
self.origin_hf_arch = hparams.get('architectures', [None])[0]

def set_vocab(self):
# deal with intern-s1-mini
if self.origin_hf_arch == 'InternS1ForConditionalGeneration':
self._set_vocab_interns1()
return

super().set_vocab()


@ModelBase.register("Qwen3MoeForCausalLM")
class Qwen3MoeModel(Qwen2MoeModel):
@@ -3620,73 +3683,7 @@ def set_vocab(self):
self._set_vocab_interns1()
return

try:
self._set_vocab_sentencepiece()
except FileNotFoundError:
self._set_vocab_gpt2()

def _set_vocab_interns1(self):
tokens: list[str] = []
toktypes: list[int] = []

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab())
vocab_size = self.hparams.get("vocab_size", len(vocab))
assert max(vocab.values()) < vocab_size

tokpre = self.get_vocab_base_pre(tokenizer)

reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()}
added_vocab = tokenizer.get_added_vocab()

added_tokens_decoder = tokenizer.added_tokens_decoder

for i in range(vocab_size):
if i not in reverse_vocab:
tokens.append(f"[PAD{i}]")
toktypes.append(gguf.TokenType.UNUSED)
else:
token: str = reverse_vocab[i]
if token in added_vocab:
# The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
# To avoid unexpected issues - we make sure to normalize non-normalized tokens
if not added_tokens_decoder[i].normalized:
previous_token = token
token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
if previous_token != token:
logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")

if added_tokens_decoder[i].special or self.does_token_look_special(token):
toktypes.append(gguf.TokenType.CONTROL)
else:
toktypes.append(gguf.TokenType.USER_DEFINED)
else:
toktypes.append(gguf.TokenType.NORMAL)
tokens.append(token)

self.gguf_writer.add_tokenizer_model("gpt2")
self.gguf_writer.add_tokenizer_pre(tokpre)
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)

special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
special_tokens_map_file = self.dir_model / 'special_tokens_map.json'
additional_special_tokens = []
if special_tokens_map_file.is_file():
with open(special_tokens_map_file, encoding = 'utf-8') as f:
additional_special_tokens = json.load(f).get('additional_special_tokens', [])
tokenizer_cfg_file = self.dir_model / 'special_tokens_map.json'
if tokenizer_cfg_file.is_file():
with open(tokenizer_cfg_file, encoding = 'utf-8') as f:
added_tokens_decoder = json.load(f).get('added_tokens_decoder', {})
token2ids_map = {data['content'] : int(token) for token, data in added_tokens_decoder.items() if data['special']}
for token in additional_special_tokens:
if token in token2ids_map:
special_vocab._set_special_token(token, token2ids_map[token])
special_vocab._set_special_token('eos', 151645)
special_vocab._set_special_token("bos", 151643)
special_vocab.add_to_gguf(self.gguf_writer)
super().set_vocab()


@ModelBase.register("GPT2LMHeadModel")
@@ -6257,9 +6254,11 @@ def prepare_tensors(self):
raise ValueError(f"Unprocessed experts: {experts}")


@ModelBase.register("DeepseekV2ForCausalLM")
@ModelBase.register("DeepseekV3ForCausalLM")
@ModelBase.register("KimiVLForConditionalGeneration")
@ModelBase.register(
"DeepseekV2ForCausalLM",
"DeepseekV3ForCausalLM",
"KimiVLForConditionalGeneration",
)
class DeepseekV2Model(TextModel):
model_arch = gguf.MODEL_ARCH.DEEPSEEK2

@@ -8510,6 +8509,43 @@ def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", "
return "mm.2.weight"
return super().map_tensor_name(name, try_suffixes)


@ModelBase.register("KimiVLForConditionalGeneration")
class KimiVLModel(MmprojModel):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
assert self.hparams_vision is not None
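# Kimi-VL handles variable image resolutions; a nominal size (likely 64 patches of
# the 14-px patch size) is stored so consumers expecting a fixed image_size still work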
self.hparams_vision["image_size"] = 64 * 14 # for compatibility

def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.KIMIVL)
self.gguf_writer.add_vision_use_gelu(True)
self.gguf_writer.add_vision_projector_scale_factor(2)
# eps is the same as pytorch's default value
assert self.hparams_vision is not None
self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-5))

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused
is_vision_tensor = "vision_tower" in name or "multi_modal_projector" in name

if is_vision_tensor:
if "pos_emb.weight" in name:
data_torch = data_torch.view(data_torch.shape[0] * data_torch.shape[1], data_torch.shape[2])
elif "wqkv" in name:
split_dim = 0 if "weight" in name else -1
wq, wk, wv = data_torch.chunk(3, dim=split_dim)
return [
(self.map_tensor_name(name.replace("wqkv", "wq")), wq),
(self.map_tensor_name(name.replace("wqkv", "wk")), wk),
(self.map_tensor_name(name.replace("wqkv", "wv")), wv)
]

return [(self.map_tensor_name(name), data_torch)]

return [] # skip other tensors

###### CONVERSION LOGIC ######


2 changes: 1 addition & 1 deletion docs/multimodal/minicpmv4.0.md
@@ -6,7 +6,7 @@ Download [MiniCPM-V-4](https://huggingface.co/openbmb/MiniCPM-V-4) PyTorch model


### Build llama.cpp
Readme modification time: 20250206
Readme modification time: 20250731

If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)

47 changes: 47 additions & 0 deletions docs/multimodal/minicpmv4.5.md
@@ -0,0 +1,47 @@
## MiniCPM-V 4.5

### Prepare models and code

Download [MiniCPM-V-4_5](https://huggingface.co/openbmb/MiniCPM-V-4_5) PyTorch model from huggingface to "MiniCPM-V-4_5" folder.


### Build llama.cpp
Readme modification time: 20250826

If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)

Clone llama.cpp:
```bash
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
```

Build llama.cpp using `CMake`:
```bash
cmake -B build
cmake --build build --config Release
```


### Usage of MiniCPM-V 4.5

Convert the PyTorch model to GGUF files (you can also download the pre-converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-4_5-gguf) files)

```bash
python ./tools/mtmd/legacy-models/minicpmv-surgery.py -m ../MiniCPM-V-4_5
python ./tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-4_5 --minicpmv-projector ../MiniCPM-V-4_5/minicpmv.projector --output-dir ../MiniCPM-V-4_5/ --minicpmv_version 6
python ./convert_hf_to_gguf.py ../MiniCPM-V-4_5/model

# quantize int4 version
./build/bin/llama-quantize ../MiniCPM-V-4_5/model/ggml-model-f16.gguf ../MiniCPM-V-4_5/model/ggml-model-Q4_K_M.gguf Q4_K_M
```


Inference on Linux or Mac
```bash
# run in single-turn mode
./build/bin/llama-mtmd-cli -m ../MiniCPM-V-4_5/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-4_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"

# run in conversation mode
./build/bin/llama-mtmd-cli -m ../MiniCPM-V-4_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-4_5/mmproj-model-f16.gguf
```
39 changes: 34 additions & 5 deletions examples/model-conversion/Makefile
@@ -1,4 +1,5 @@
# Validation functions
MAKEFLAGS += --no-print-directory

define validate_model_path
@if [ -z "$(MODEL_PATH)" ]; then \
echo "Error: MODEL_PATH must be provided either as:"; \
@@ -17,6 +18,13 @@ define validate_embedding_model_path
fi
endef

define quantize_model
@CONVERTED_MODEL="$(1)" QUANTIZED_TYPE="$(QUANTIZED_TYPE)" \
TOKEN_EMBD_TYPE="$(TOKEN_EMBD_TYPE)" OUTPUT_TYPE="$(OUTPUT_TYPE)" \
./scripts/utils/quantize.sh "$(1)" "$(QUANTIZED_TYPE)" "$(TOKEN_EMBD_TYPE)" "$(OUTPUT_TYPE)"
@echo "Export the quantized model path to $(2) variable in your environment"
endef

###
### Casual Model targets/recipes
###
@@ -67,9 +75,15 @@ causal-quantize-Q8_0: causal-quantize-model
causal-quantize-Q4_0: QUANTIZED_TYPE = Q4_0
causal-quantize-Q4_0: causal-quantize-model

# For Quantization Aware Trained (QAT) models in Q4_0 we explicitly set the
# token embedding and output types to Q8_0 instead of the default Q6_K.
causal-quantize-qat-Q4_0: QUANTIZED_TYPE = Q4_0
causal-quantize-qat-Q4_0: TOKEN_EMBD_TYPE = Q8_0
causal-quantize-qat-Q4_0: OUTPUT_TYPE = Q8_0
causal-quantize-qat-Q4_0: causal-quantize-model
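# Usage sketch for the target above (the path is illustrative; it assumes a previously converted GGUF model):
#   make causal-quantize-qat-Q4_0 CONVERTED_MODEL=/path/to/model-f16.gguf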

causal-quantize-model:
@CONVERTED_MODEL="$(CONVERTED_MODEL)" QUANTIZED_TYPE="$(QUANTIZED_TYPE)" ./scripts/utils/quantize.sh ${CONVERTED_MODEL} ${QUANTIZED_TYPE}
@echo "Export the quantized model path to QUANTIZED_MODEL variable in your environment"
$(call quantize_model,$(CONVERTED_MODEL),QUANTIZED_MODEL)

causal-run-quantized-model:
@QUANTIZED_MODEL="$(QUANTIZED_MODEL)" ./scripts/causal/run-converted-model.sh ${QUANTIZED_MODEL}
@@ -117,9 +131,15 @@ embedding-quantize-Q8_0: embedding-quantize-model
embedding-quantize-Q4_0: QUANTIZED_TYPE = Q4_0
embedding-quantize-Q4_0: embedding-quantize-model

# For Quantization Aware Trained (QAT) models in Q4_0 we explicitly set the
# token embedding and output types to Q8_0 instead of the default Q6_K.
embedding-quantize-qat-Q4_0: QUANTIZED_TYPE = Q4_0
embedding-quantize-qat-Q4_0: TOKEN_EMBD_TYPE = Q8_0
embedding-quantize-qat-Q4_0: OUTPUT_TYPE = Q8_0
embedding-quantize-qat-Q4_0: embedding-quantize-model

embedding-quantize-model:
@./scripts/utils/quantize.sh ${CONVERTED_EMBEDDING_MODEL} ${QUANTIZED_TYPE}
@echo "Export the quantized model path to QUANTIZED_EMBEDDING_MODEL variable in your environment"
$(call quantize_model,$(CONVERTED_EMBEDDING_MODEL),QUANTIZED_EMBEDDING_MODEL)

embedding-run-quantized-model:
@./scripts/embedding/run-converted-model.sh ${QUANTIZED_EMBEDDING_MODEL}
@@ -144,6 +164,15 @@ perplexity-run:
hf-create-model:
@./scripts/utils/hf-create-model.py -m "${MODEL_NAME}" -ns "${NAMESPACE}" -b "${ORIGINAL_BASE_MODEL}"

hf-create-model-dry-run:
@./scripts/utils/hf-create-model.py -m "${MODEL_NAME}" -ns "${NAMESPACE}" -b "${ORIGINAL_BASE_MODEL}" -d

hf-create-model-embedding:
@./scripts/utils/hf-create-model.py -m "${MODEL_NAME}" -ns "${NAMESPACE}" -b "${ORIGINAL_BASE_MODEL}" -e

hf-create-model-embedding-dry-run:
@./scripts/utils/hf-create-model.py -m "${MODEL_NAME}" -ns "${NAMESPACE}" -b "${ORIGINAL_BASE_MODEL}" -e -d

hf-create-model-private:
@./scripts/utils/hf-create-model.py -m "${MODEL_NAME}" -ns "${NAMESPACE}" -b "${ORIGINAL_BASE_MODEL}" -p
