
Commit 53d7b0e

Merge branch 'ggml-org:master' into mradermacher
2 parents e3bc1eb + 55042b3 commit 53d7b0e


73 files changed (+3211 / -1609 lines)

common/arg.cpp

Lines changed: 17 additions & 1 deletion
@@ -1106,7 +1106,7 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
     printf("\"\n\n");

     printf(" case \"$prev\" in\n");
-    printf(" --model)\n");
+    printf(" --model|-m)\n");
     printf(" COMPREPLY=( $(compgen -f -X '!*.gguf' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
     printf(" return 0\n");
     printf(" ;;\n");
@@ -3538,6 +3538,22 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}));

+    add_opt(common_arg(
+        {"--fim-qwen-30b-default"},
+        string_format("use default Qwen 3 Coder 30B A3B Instruct (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.model.hf_repo = "ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF";
+            params.model.hf_file = "qwen3-coder-30b-a3b-instruct-q8_0.gguf";
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
     add_opt(common_arg(
         { "--diffusion-steps" }, "N",
         string_format("number of diffusion steps (default: %d)", params.diffusion.steps),

convert_hf_to_gguf.py

Lines changed: 108 additions & 72 deletions
@@ -1226,6 +1226,55 @@ def _try_set_pooling_type(self) -> None:
             raise NotImplementedError("Only MEAN, CLS, and LAST pooling types supported")
         self.gguf_writer.add_pooling_type(pooling_type)

+    def _set_vocab_interns1(self):
+        tokens: list[str] = []
+        toktypes: list[int] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+        vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab())
+        vocab_size = self.hparams.get("vocab_size", len(vocab))
+        assert max(vocab.values()) < vocab_size
+
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
+        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()}
+        added_vocab = tokenizer.get_added_vocab()
+
+        added_tokens_decoder = tokenizer.added_tokens_decoder
+
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                tokens.append(f"[PAD{i}]")
+                toktypes.append(gguf.TokenType.UNUSED)
+            else:
+                token: str = reverse_vocab[i]
+                if token in added_vocab:
+                    # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
+                    # To avoid unexpected issues - we make sure to normalize non-normalized tokens
+                    if not added_tokens_decoder[i].normalized:
+                        previous_token = token
+                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
+                        if previous_token != token:
+                            logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
+
+                    if added_tokens_decoder[i].special or self.does_token_look_special(token):
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    else:
+                        toktypes.append(gguf.TokenType.USER_DEFINED)
+                else:
+                    toktypes.append(gguf.TokenType.NORMAL)
+                tokens.append(token)
+
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        special_vocab._set_special_token("bos", 151643)
+        special_vocab.add_to_gguf(self.gguf_writer)
+

 class MmprojModel(ModelBase):
     model_type = ModelType.MMPROJ
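The `_set_vocab_interns1` helper added above (now defined once in a shared base class and reused by the Qwen3 and Qwen3-MoE converters) writes a dense token list: every id below `vocab_size` that the tokenizer does not cover is filled with a `[PAD<i>]` placeholder and typed UNUSED, so GGUF token indices stay contiguous. A toy, self-contained sketch of that padding step, using a made-up vocab instead of a real AutoTokenizer:

```python
# Illustrative only: mimics how ids absent from the tokenizer vocab are padded
# so the exported token list stays dense. Toy data, not a real tokenizer.
toy_vocab = {"<s>": 0, "hello": 1, "world": 3}   # id 2 is intentionally missing
vocab_size = 5                                   # hparams may reserve extra slots

reverse_vocab = {tok_id: tok for tok, tok_id in toy_vocab.items()}

tokens: list[str] = []
for i in range(vocab_size):
    if i not in reverse_vocab:
        tokens.append(f"[PAD{i}]")   # placeholder, typed UNUSED in the real converter
    else:
        tokens.append(reverse_vocab[i])

print(tokens)  # ['<s>', 'hello', '[PAD2]', 'world', '[PAD4]']
```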
@@ -2942,7 +2991,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         if "language_model." in name:
             name = name.replace("language_model.", "") # for InternVL
         if name.startswith("mlp") or name.startswith("multi_modal_projector") \
-                or name.startswith("vision_model") or name.startswith("audio_tower"):
+                or name.startswith("vision_model") or name.startswith("audio_tower") \
+                or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector"):
             # skip vision and audio tensors
             return []
         yield from super().modify_tensors(data_torch, name, bid)
@@ -3119,7 +3169,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         yield from super().modify_tensors(data_torch, name, bid)


-@ModelBase.register("Ernie4_5_ForCausalLM")
+@ModelBase.register("Ernie4_5_ForCausalLM", "Ernie4_5ForCausalLM")
 class Ernie4_5Model(TextModel):
     model_arch = gguf.MODEL_ARCH.ERNIE4_5

@@ -3614,6 +3664,19 @@ def prepare_tensors(self):
 class Qwen3Model(Qwen2Model):
     model_arch = gguf.MODEL_ARCH.QWEN3

+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
+        self.origin_hf_arch = hparams.get('architectures', [None])[0]
+
+    def set_vocab(self):
+        # deal with intern-s1-mini
+        if self.origin_hf_arch == 'InternS1ForConditionalGeneration':
+            self._set_vocab_interns1()
+            return
+
+        super().set_vocab()
+

 @ModelBase.register("Qwen3MoeForCausalLM")
 class Qwen3MoeModel(Qwen2MoeModel):
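`Qwen3Model` now records the checkpoint's original Hugging Face architecture so that Intern-S1-mini models, which reuse the Qwen3 text stack under the `InternS1ForConditionalGeneration` architecture, take the InternS1 vocab path. A rough sketch of that detection, reading `architectures` from a checkpoint's config.json; the directory name is hypothetical:

```python
# Sketch of the architecture check performed in Qwen3Model.__init__ above.
import json
from pathlib import Path

def origin_hf_arch(model_dir: str) -> str | None:
    # Read the `architectures` list from the checkpoint's config.json; None if absent.
    config = json.loads((Path(model_dir) / "config.json").read_text(encoding="utf-8"))
    return config.get("architectures", [None])[0]

arch = origin_hf_arch("../Intern-S1-mini")  # hypothetical local checkpoint directory
if arch == "InternS1ForConditionalGeneration":
    print("use the InternS1 vocab path")
else:
    print("fall back to the regular Qwen3 vocab handling")
```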
@@ -3630,73 +3693,7 @@ def set_vocab(self):
             self._set_vocab_interns1()
             return

-        try:
-            self._set_vocab_sentencepiece()
-        except FileNotFoundError:
-            self._set_vocab_gpt2()
-
-    def _set_vocab_interns1(self):
-        tokens: list[str] = []
-        toktypes: list[int] = []
-
-        from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
-        vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab())
-        vocab_size = self.hparams.get("vocab_size", len(vocab))
-        assert max(vocab.values()) < vocab_size
-
-        tokpre = self.get_vocab_base_pre(tokenizer)
-
-        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()}
-        added_vocab = tokenizer.get_added_vocab()
-
-        added_tokens_decoder = tokenizer.added_tokens_decoder
-
-        for i in range(vocab_size):
-            if i not in reverse_vocab:
-                tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.UNUSED)
-            else:
-                token: str = reverse_vocab[i]
-                if token in added_vocab:
-                    # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
-                    # To avoid unexpected issues - we make sure to normalize non-normalized tokens
-                    if not added_tokens_decoder[i].normalized:
-                        previous_token = token
-                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
-                        if previous_token != token:
-                            logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
-
-                    if added_tokens_decoder[i].special or self.does_token_look_special(token):
-                        toktypes.append(gguf.TokenType.CONTROL)
-                    else:
-                        toktypes.append(gguf.TokenType.USER_DEFINED)
-                else:
-                    toktypes.append(gguf.TokenType.NORMAL)
-                tokens.append(token)
-
-        self.gguf_writer.add_tokenizer_model("gpt2")
-        self.gguf_writer.add_tokenizer_pre(tokpre)
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_types(toktypes)
-
-        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
-        special_tokens_map_file = self.dir_model / 'special_tokens_map.json'
-        additional_special_tokens = []
-        if special_tokens_map_file.is_file():
-            with open(special_tokens_map_file, encoding = 'utf-8') as f:
-                additional_special_tokens = json.load(f).get('additional_special_tokens', [])
-        tokenizer_cfg_file = self.dir_model / 'special_tokens_map.json'
-        if tokenizer_cfg_file.is_file():
-            with open(tokenizer_cfg_file, encoding = 'utf-8') as f:
-                added_tokens_decoder = json.load(f).get('added_tokens_decoder', {})
-                token2ids_map = {data['content'] : int(token) for token, data in added_tokens_decoder.items() if data['special']}
-                for token in additional_special_tokens:
-                    if token in token2ids_map:
-                        special_vocab._set_special_token(token, token2ids_map[token])
-                special_vocab._set_special_token('eos', 151645)
-        special_vocab._set_special_token("bos", 151643)
-        special_vocab.add_to_gguf(self.gguf_writer)
+        super().set_vocab()


 @ModelBase.register("GPT2LMHeadModel")
@@ -6267,9 +6264,11 @@ def prepare_tensors(self):
                 raise ValueError(f"Unprocessed experts: {experts}")


-@ModelBase.register("DeepseekV2ForCausalLM")
-@ModelBase.register("DeepseekV3ForCausalLM")
-@ModelBase.register("KimiVLForConditionalGeneration")
+@ModelBase.register(
+    "DeepseekV2ForCausalLM",
+    "DeepseekV3ForCausalLM",
+    "KimiVLForConditionalGeneration",
+)
 class DeepseekV2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.DEEPSEEK2

@@ -8520,6 +8519,43 @@ def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", "
             return "mm.2.weight"
         return super().map_tensor_name(name, try_suffixes)

+
+@ModelBase.register("KimiVLForConditionalGeneration")
+class KimiVLModel(MmprojModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        assert self.hparams_vision is not None
+        self.hparams_vision["image_size"] = 64 * 14 # for compatibility
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.KIMIVL)
+        self.gguf_writer.add_vision_use_gelu(True)
+        self.gguf_writer.add_vision_projector_scale_factor(2)
+        # eps is the same as pytorch's default value
+        assert self.hparams_vision is not None
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-5))
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid # unused
+        is_vision_tensor = "vision_tower" in name or "multi_modal_projector" in name
+
+        if is_vision_tensor:
+            if "pos_emb.weight" in name:
+                data_torch = data_torch.view(data_torch.shape[0] * data_torch.shape[1], data_torch.shape[2])
+            elif "wqkv" in name:
+                split_dim = 0 if "weight" in name else -1
+                wq, wk, wv = data_torch.chunk(3, dim=split_dim)
+                return [
+                    (self.map_tensor_name(name.replace("wqkv", "wq")), wq),
+                    (self.map_tensor_name(name.replace("wqkv", "wk")), wk),
+                    (self.map_tensor_name(name.replace("wqkv", "wv")), wv)
+                ]
+
+            return [(self.map_tensor_name(name), data_torch)]
+
+        return [] # skip other tensors
+
 ###### CONVERSION LOGIC ######

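In `KimiVLModel.modify_tensors` above, the fused `wqkv` projection is split into separate q/k/v tensors before name mapping: weights are chunked along dim 0, biases along the last dim. A small PyTorch sketch of the same split on made-up shapes:

```python
# Illustrative split of a fused wqkv projection, mirroring the logic above.
# Shapes are invented: hidden size 8, so the fused weight is (3*8, 8).
import torch

hidden = 8
wqkv_weight = torch.randn(3 * hidden, hidden)  # fused [q; k; v] stacked along dim 0
wqkv_bias = torch.randn(3 * hidden)            # fused bias, split along the last dim

wq, wk, wv = wqkv_weight.chunk(3, dim=0)
bq, bk, bv = wqkv_bias.chunk(3, dim=-1)

print(wq.shape, wk.shape, wv.shape)  # torch.Size([8, 8]) each
print(bq.shape)                      # torch.Size([8])
```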

docs/multimodal/minicpmv4.0.md

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@ Download [MiniCPM-V-4](https://huggingface.co/openbmb/MiniCPM-V-4) PyTorch model


 ### Build llama.cpp
-Readme modification time: 20250206
+Readme modification time: 20250731

 If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)

docs/multimodal/minicpmv4.5.md

Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
+## MiniCPM-V 4.5
+
+### Prepare models and code
+
+Download the [MiniCPM-V-4_5](https://huggingface.co/openbmb/MiniCPM-V-4_5) PyTorch model from Hugging Face into a "MiniCPM-V-4_5" folder.
+
+
+### Build llama.cpp
+Readme modification time: 20250826
+
+If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
+
+Clone llama.cpp:
+```bash
+git clone https://github.com/ggerganov/llama.cpp
+cd llama.cpp
+```
+
+Build llama.cpp using `CMake`:
+```bash
+cmake -B build
+cmake --build build --config Release
+```
+
+
+### Usage of MiniCPM-V 4.5
+
+Convert the PyTorch model to gguf files (you can also download the pre-converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-4_5-gguf) files)
+
+```bash
+python ./tools/mtmd/legacy-models/minicpmv-surgery.py -m ../MiniCPM-V-4_5
+python ./tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-4_5 --minicpmv-projector ../MiniCPM-V-4_5/minicpmv.projector --output-dir ../MiniCPM-V-4_5/ --minicpmv_version 6
+python ./convert_hf_to_gguf.py ../MiniCPM-V-4_5/model
+
+# quantize int4 version
+./build/bin/llama-quantize ../MiniCPM-V-4_5/model/ggml-model-f16.gguf ../MiniCPM-V-4_5/model/ggml-model-Q4_K_M.gguf Q4_K_M
+```
+
+
+Inference on Linux or Mac:
+```bash
+# run in single-turn mode
+./build/bin/llama-mtmd-cli -m ../MiniCPM-V-4_5/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-4_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
+
+# run in conversation mode
+./build/bin/llama-mtmd-cli -m ../MiniCPM-V-4_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-4_5/mmproj-model-f16.gguf
+```
