
Commit 743681b

Merge branch 'ggml-org:master' into feature/nemotron-h-support-working
2 parents: 3efbb74 + 4737327


43 files changed (+1759, -647 lines)

common/arg.cpp

Lines changed: 1 addition & 1 deletion
```diff
@@ -1106,7 +1106,7 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
     printf("\"\n\n");
 
     printf(" case \"$prev\" in\n");
-    printf(" --model)\n");
+    printf(" --model|-m)\n");
     printf(" COMPREPLY=( $(compgen -f -X '!*.gguf' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
     printf(" return 0\n");
     printf(" ;;\n");
```
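The printf calls above emit part of a generated bash-completion script; after this change the generated case arm matches both `--model` and its short form `-m`, so both get `.gguf` filename completion. A rough sketch of the emitted fragment, reconstructed only from the strings in this hunk (the full generated script contains more cases and surrounding boilerplate):

```bash
# Rough sketch of the completion fragment produced by the printf calls above;
# reconstructed from this hunk only, not the complete generated script.
case "$prev" in
    --model|-m)
        # complete *.gguf files and directories for the model argument
        COMPREPLY=( $(compgen -f -X '!*.gguf' -- "$cur") $(compgen -d -- "$cur") )
        return 0
        ;;
esac
```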

convert_hf_to_gguf.py

Lines changed: 42 additions & 3 deletions
```diff
@@ -6254,9 +6254,11 @@ def prepare_tensors(self):
                 raise ValueError(f"Unprocessed experts: {experts}")
 
 
-@ModelBase.register("DeepseekV2ForCausalLM")
-@ModelBase.register("DeepseekV3ForCausalLM")
-@ModelBase.register("KimiVLForConditionalGeneration")
+@ModelBase.register(
+    "DeepseekV2ForCausalLM",
+    "DeepseekV3ForCausalLM",
+    "KimiVLForConditionalGeneration",
+)
 class DeepseekV2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.DEEPSEEK2
 
@@ -8733,6 +8735,43 @@ def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", "
             return "mm.2.weight"
         return super().map_tensor_name(name, try_suffixes)
 
+
+@ModelBase.register("KimiVLForConditionalGeneration")
+class KimiVLModel(MmprojModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        assert self.hparams_vision is not None
+        self.hparams_vision["image_size"] = 64 * 14  # for compatibility
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.KIMIVL)
+        self.gguf_writer.add_vision_use_gelu(True)
+        self.gguf_writer.add_vision_projector_scale_factor(2)
+        # eps is the same as pytorch's default value
+        assert self.hparams_vision is not None
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-5))
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+        is_vision_tensor = "vision_tower" in name or "multi_modal_projector" in name
+
+        if is_vision_tensor:
+            if "pos_emb.weight" in name:
+                data_torch = data_torch.view(data_torch.shape[0] * data_torch.shape[1], data_torch.shape[2])
+            elif "wqkv" in name:
+                split_dim = 0 if "weight" in name else -1
+                wq, wk, wv = data_torch.chunk(3, dim=split_dim)
+                return [
+                    (self.map_tensor_name(name.replace("wqkv", "wq")), wq),
+                    (self.map_tensor_name(name.replace("wqkv", "wk")), wk),
+                    (self.map_tensor_name(name.replace("wqkv", "wv")), wv)
+                ]
+
+            return [(self.map_tensor_name(name), data_torch)]
+
+        return []  # skip other tensors
+
 ###### CONVERSION LOGIC ######
 
 
```
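The new `KimiVLModel` converter registers the Kimi-VL vision tower for mmproj conversion: it flattens the learned position embedding and splits each fused `wqkv` tensor into separate q/k/v tensors before mapping names. A hedged usage sketch, assuming the standard `--mmproj` export path of `convert_hf_to_gguf.py`; the local checkout path and output type are hypothetical, not taken from this commit:

```bash
# Hypothetical path; assumes a local Hugging Face checkout of a Kimi-VL model
# and the --mmproj export mode of convert_hf_to_gguf.py.
python ./convert_hf_to_gguf.py --mmproj --outtype f16 ../Kimi-VL-A3B-Instruct
```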

docs/multimodal/minicpmv4.0.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -6,7 +6,7 @@ Download [MiniCPM-V-4](https://huggingface.co/openbmb/MiniCPM-V-4) PyTorch model
 
 
 ### Build llama.cpp
-Readme modification time: 20250206
+Readme modification time: 20250731
 
 If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
 
```
docs/multimodal/minicpmv4.5.md

Lines changed: 47 additions & 0 deletions
````diff
@@ -0,0 +1,47 @@
+## MiniCPM-V 4.5
+
+### Prepare models and code
+
+Download [MiniCPM-V-4_5](https://huggingface.co/openbmb/MiniCPM-V-4_5) PyTorch model from huggingface to "MiniCPM-V-4_5" folder.
+
+
+### Build llama.cpp
+Readme modification time: 20250826
+
+If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
+
+Clone llama.cpp:
+```bash
+git clone https://github.com/ggerganov/llama.cpp
+cd llama.cpp
+```
+
+Build llama.cpp using `CMake`:
+```bash
+cmake -B build
+cmake --build build --config Release
+```
+
+
+### Usage of MiniCPM-V 4.5
+
+Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-4_5-gguf) by us)
+
+```bash
+python ./tools/mtmd/legacy-models/minicpmv-surgery.py -m ../MiniCPM-V-4_5
+python ./tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-4_5 --minicpmv-projector ../MiniCPM-V-4_5/minicpmv.projector --output-dir ../MiniCPM-V-4_5/ --minicpmv_version 6
+python ./convert_hf_to_gguf.py ../MiniCPM-V-4_5/model
+
+# quantize int4 version
+./build/bin/llama-quantize ../MiniCPM-V-4_5/model/ggml-model-f16.gguf ../MiniCPM-V-4_5/model/ggml-model-Q4_K_M.gguf Q4_K_M
+```
+
+
+Inference on Linux or Mac
+```bash
+# run in single-turn mode
+./build/bin/llama-mtmd-cli -m ../MiniCPM-V-4_5/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-4_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
+
+# run in conversation mode
+./build/bin/llama-mtmd-cli -m ../MiniCPM-V-4_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-4_5/mmproj-model-f16.gguf
+```
````

examples/model-conversion/Makefile

Lines changed: 25 additions & 5 deletions
```diff
@@ -1,4 +1,5 @@
-# Validation functions
+MAKEFLAGS += --no-print-directory
+
 define validate_model_path
 	@if [ -z "$(MODEL_PATH)" ]; then \
 		echo "Error: MODEL_PATH must be provided either as:"; \
@@ -17,6 +18,13 @@ define validate_embedding_model_path
 	fi
 endef
 
+define quantize_model
+	@CONVERTED_MODEL="$(1)" QUANTIZED_TYPE="$(QUANTIZED_TYPE)" \
+	TOKEN_EMBD_TYPE="$(TOKEN_EMBD_TYPE)" OUTPUT_TYPE="$(OUTPUT_TYPE)" \
+	./scripts/utils/quantize.sh "$(1)" "$(QUANTIZED_TYPE)" "$(TOKEN_EMBD_TYPE)" "$(OUTPUT_TYPE)"
+	@echo "Export the quantized model path to $(2) variable in your environment"
+endef
+
 ###
 ### Casual Model targets/recipes
 ###
@@ -67,9 +75,15 @@ causal-quantize-Q8_0: causal-quantize-model
 causal-quantize-Q4_0: QUANTIZED_TYPE = Q4_0
 causal-quantize-Q4_0: causal-quantize-model
 
+# For Quantization Aware Trained (QAT) models in Q4_0 we explicitly set the
+# token embedding and output types to Q8_0 instead of the default Q6_K.
+causal-quantize-qat-Q4_0: QUANTIZED_TYPE = Q4_0
+causal-quantize-qat-Q4_0: TOKEN_EMBD_TYPE = Q8_0
+causal-quantize-qat-Q4_0: OUTPUT_TYPE = Q8_0
+causal-quantize-qat-Q4_0: causal-quantize-model
+
 causal-quantize-model:
-	@CONVERTED_MODEL="$(CONVERTED_MODEL)" QUANTIZED_TYPE="$(QUANTIZED_TYPE)" ./scripts/utils/quantize.sh ${CONVERTED_MODEL} ${QUANTIZED_TYPE}
-	@echo "Export the quantized model path to QUANTIZED_MODEL variable in your environment"
+	$(call quantize_model,$(CONVERTED_MODEL),QUANTIZED_MODEL)
 
 causal-run-quantized-model:
 	@QUANTIZED_MODEL="$(QUANTIZED_MODEL)" ./scripts/causal/run-converted-model.sh ${QUANTIZED_MODEL}
@@ -117,9 +131,15 @@ embedding-quantize-Q8_0: embedding-quantize-model
 embedding-quantize-Q4_0: QUANTIZED_TYPE = Q4_0
 embedding-quantize-Q4_0: embedding-quantize-model
 
+# For Quantization Aware Trained (QAT) models in Q4_0 we explicitly set the
+# token embedding and output types to Q8_0 instead of the default Q6_K.
+embedding-quantize-qat-Q4_0: QUANTIZED_TYPE = Q4_0
+embedding-quantize-qat-Q4_0: TOKEN_EMBD_TYPE = Q8_0
+embedding-quantize-qat-Q4_0: OUTPUT_TYPE = Q8_0
+embedding-quantize-qat-Q4_0: embedding-quantize-model
+
 embedding-quantize-model:
-	@./scripts/utils/quantize.sh ${CONVERTED_EMBEDDING_MODEL} ${QUANTIZED_TYPE}
-	@echo "Export the quantized model path to QUANTIZED_EMBEDDING_MODEL variable in your environment"
+	$(call quantize_model,$(CONVERTED_EMBEDDING_MODEL),QUANTIZED_EMBEDDING_MODEL)
 
 embedding-run-quantized-model:
 	@./scripts/embedding/run-converted-model.sh ${QUANTIZED_EMBEDDING_MODEL}
```
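The new `quantize_model` macro forwards the optional `TOKEN_EMBD_TYPE` and `OUTPUT_TYPE` variables to `scripts/utils/quantize.sh`, and the `*-quantize-qat-Q4_0` targets pin both to `Q8_0`. A hedged invocation sketch; the converted-model path is hypothetical:

```bash
# Hypothetical model path; run from examples/model-conversion.
make causal-quantize-qat-Q4_0 CONVERTED_MODEL=/path/to/model-f16.gguf

# Roughly equivalent to calling the script directly with the QAT overrides:
./scripts/utils/quantize.sh /path/to/model-f16.gguf Q4_0 Q8_0 Q8_0
```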

examples/model-conversion/README.md

Lines changed: 24 additions & 0 deletions
````diff
@@ -137,6 +137,18 @@ Then the quantized model can be run using the following command:
 (venv) $ make causal-run-quantized-model
 ```
 
+### Quantizing QAT (Quantization Aware Training) models
+When quantizing to `Q4_0`, the default data type for the token embedding weights
+will be `Q6_K`. For models that are going to be uploaded to ggml-org it is
+recommended to use `Q8_0` instead for the embeddings and output tensors.
+The reason is that although `Q6_K` is smaller in size, it requires more compute
+to unpack, which can hurt performance during output generation when the entire
+embedding matrix must be dequantized to compute vocabulary logits. `Q8_0`
+provides practically full quality with better computational efficiency.
+```console
+(venv) $ make causal-quantize-qat-Q4_0
+```
+
 
 ## Embedding Language Model Conversion
 
@@ -238,6 +250,18 @@ Then the quantized model can be run using the following command:
 (venv) $ make embedding-run-quantized-model
 ```
 
+### Quantizing QAT (Quantization Aware Training) models
+When quantizing to `Q4_0`, the default data type for the token embedding weights
+will be `Q6_K`. For models that are going to be uploaded to ggml-org it is
+recommended to use `Q8_0` instead for the embeddings and output tensors.
+The reason is that although `Q6_K` is smaller in size, it requires more compute
+to unpack, which can hurt performance during output generation when the entire
+embedding matrix must be dequantized to compute vocabulary logits. `Q8_0`
+provides practically full quality with better computational efficiency.
+```console
+(venv) $ make embedding-quantize-qat-Q4_0
+```
+
 ## Perplexity Evaluation
 
 ### Simple perplexity evaluation
````
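Both new `*-quantize-qat-Q4_0` targets end up calling `llama-quantize` with explicit per-tensor type overrides via the updated `quantize.sh` (shown below). For reference, a hedged sketch of the equivalent direct invocation, using the flags that the script passes through; the model filenames are hypothetical:

```bash
# Hypothetical paths; mirrors the flags the updated quantize.sh forwards.
./build/bin/llama-quantize \
    --token-embedding-type Q8_0 \
    --output-tensor-type Q8_0 \
    model-f16.gguf model-qat-Q4_0.gguf Q4_0
```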

examples/model-conversion/scripts/utils/quantize.sh

Lines changed: 16 additions & 2 deletions
```diff
@@ -4,6 +4,8 @@ set -e
 
 CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
 QUANTIZED_TYPE="${2:-"$QUANTIZED_TYPE"}"
+TOKEN_EMBD_TYPE="${3:-"${TOKEN_EMBD_TYPE}"}"
+OUTPUT_TYPE="${4:-"${OUTPUT_TYPE}"}"
 QUANTIZED_MODEL=$CONVERTED_MODEL
 
 # Final check if we have a model path
@@ -14,6 +16,11 @@ if [ -z "$CONVERTED_MODEL" ]; then
     exit 1
 fi
 
+if [ -z "$QUANTIZED_TYPE" ]; then
+    echo "Error: QUANTIZED_TYPE is required" >&2
+    exit 1
+fi
+
 echo $CONVERTED_MODEL
 
 # Process the quantized model filename
@@ -26,9 +33,16 @@ else
     exit 1
 fi
 
-
 cmake --build ../../build --target llama-quantize -j8
 
-../../build/bin/llama-quantize $CONVERTED_MODEL $QUANTIZED_MODEL $QUANTIZED_TYPE
+echo $TOKEN_EMBD_TYPE
+echo $OUTPUT_TYPE
+
+CMD_ARGS=("../../build/bin/llama-quantize")
+[[ -n "$TOKEN_EMBD_TYPE" ]] && CMD_ARGS+=("--token-embedding-type" "$TOKEN_EMBD_TYPE")
+[[ -n "$OUTPUT_TYPE" ]] && CMD_ARGS+=("--output-tensor-type" "$OUTPUT_TYPE")
+CMD_ARGS+=("$CONVERTED_MODEL" "$QUANTIZED_MODEL" "$QUANTIZED_TYPE")
+
+"${CMD_ARGS[@]}"
 
 echo "Quantized model saved to: $QUANTIZED_MODEL"
```
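With these changes the script accepts the token-embedding and output tensor types either as the third and fourth positional arguments or via environment variables, and only appends the corresponding `llama-quantize` flags when they are set. A hedged usage sketch; the model path is hypothetical, and the script is assumed to run from `examples/model-conversion` so the `../../build` paths resolve:

```bash
# Hypothetical model path; run from examples/model-conversion.
# Positional form: <converted model> <quant type> [token-embd type] [output type]
./scripts/utils/quantize.sh ../../models/model-f16.gguf Q4_0 Q8_0 Q8_0

# Environment-variable form is also supported:
CONVERTED_MODEL=../../models/model-f16.gguf QUANTIZED_TYPE=Q4_0 \
TOKEN_EMBD_TYPE=Q8_0 OUTPUT_TYPE=Q8_0 ./scripts/utils/quantize.sh
```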
