
Commit f17dedc

Merge branch 'ggml-org:master' into master
2 parents 083e363 + 44b1efa commit f17dedc

17 files changed: +712 additions, -219 deletions


convert_hf_to_gguf.py

Lines changed: 42 additions & 3 deletions
@@ -6254,9 +6254,11 @@ def prepare_tensors(self):
                 raise ValueError(f"Unprocessed experts: {experts}")
 
 
-@ModelBase.register("DeepseekV2ForCausalLM")
-@ModelBase.register("DeepseekV3ForCausalLM")
-@ModelBase.register("KimiVLForConditionalGeneration")
+@ModelBase.register(
+    "DeepseekV2ForCausalLM",
+    "DeepseekV3ForCausalLM",
+    "KimiVLForConditionalGeneration",
+)
 class DeepseekV2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.DEEPSEEK2
 
@@ -8507,6 +8509,43 @@ def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", "
             return "mm.2.weight"
         return super().map_tensor_name(name, try_suffixes)
 
+
+@ModelBase.register("KimiVLForConditionalGeneration")
+class KimiVLModel(MmprojModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        assert self.hparams_vision is not None
+        self.hparams_vision["image_size"] = 64 * 14  # for compatibility
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.KIMIVL)
+        self.gguf_writer.add_vision_use_gelu(True)
+        self.gguf_writer.add_vision_projector_scale_factor(2)
+        # eps is the same as pytorch's default value
+        assert self.hparams_vision is not None
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-5))
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+        is_vision_tensor = "vision_tower" in name or "multi_modal_projector" in name
+
+        if is_vision_tensor:
+            if "pos_emb.weight" in name:
+                data_torch = data_torch.view(data_torch.shape[0] * data_torch.shape[1], data_torch.shape[2])
+            elif "wqkv" in name:
+                split_dim = 0 if "weight" in name else -1
+                wq, wk, wv = data_torch.chunk(3, dim=split_dim)
+                return [
+                    (self.map_tensor_name(name.replace("wqkv", "wq")), wq),
+                    (self.map_tensor_name(name.replace("wqkv", "wk")), wk),
+                    (self.map_tensor_name(name.replace("wqkv", "wv")), wv)
+                ]
+
+            return [(self.map_tensor_name(name), data_torch)]
+
+        return []  # skip other tensors
+
 
 ###### CONVERSION LOGIC ######
 
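A note on the `wqkv` handling added above: the fused attention projection stores the query, key, and value blocks stacked along the output dimension, so `chunk(3, ...)` recovers the three parts. A minimal standalone sketch of the split (tensor shapes are illustrative, not taken from the actual checkpoint):

```python
import torch

# Illustrative fused QKV weight of shape [3 * hidden, hidden], stacked along dim 0.
hidden = 4
wqkv = torch.arange(3 * hidden * hidden, dtype=torch.float32).reshape(3 * hidden, hidden)

# Weights are split along dim 0; a fused bias of shape [3 * hidden] would instead
# be split along its last dimension, which is why the converter uses split_dim = -1
# for non-weight tensors.
wq, wk, wv = wqkv.chunk(3, dim=0)
print(wq.shape, wk.shape, wv.shape)  # each torch.Size([4, 4])
```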

examples/model-conversion/Makefile

Lines changed: 25 additions & 5 deletions
@@ -1,4 +1,5 @@
-# Validation functions
+MAKEFLAGS += --no-print-directory
+
 define validate_model_path
 	@if [ -z "$(MODEL_PATH)" ]; then \
 		echo "Error: MODEL_PATH must be provided either as:"; \
@@ -17,6 +18,13 @@ define validate_embedding_model_path
 	fi
 endef
 
+define quantize_model
+	@CONVERTED_MODEL="$(1)" QUANTIZED_TYPE="$(QUANTIZED_TYPE)" \
+	TOKEN_EMBD_TYPE="$(TOKEN_EMBD_TYPE)" OUTPUT_TYPE="$(OUTPUT_TYPE)" \
+	./scripts/utils/quantize.sh "$(1)" "$(QUANTIZED_TYPE)" "$(TOKEN_EMBD_TYPE)" "$(OUTPUT_TYPE)"
+	@echo "Export the quantized model path to $(2) variable in your environment"
+endef
+
 ###
 ### Casual Model targets/recipes
 ###
@@ -67,9 +75,15 @@ causal-quantize-Q8_0: causal-quantize-model
 causal-quantize-Q4_0: QUANTIZED_TYPE = Q4_0
 causal-quantize-Q4_0: causal-quantize-model
 
+# For Quantization Aware Trained (QAT) models in Q4_0 we explicitly set the
+# token embedding and output types to Q8_0 instead of the default Q6_K.
+causal-quantize-qat-Q4_0: QUANTIZED_TYPE = Q4_0
+causal-quantize-qat-Q4_0: TOKEN_EMBD_TYPE = Q8_0
+causal-quantize-qat-Q4_0: OUTPUT_TYPE = Q8_0
+causal-quantize-qat-Q4_0: causal-quantize-model
+
 causal-quantize-model:
-	@CONVERTED_MODEL="$(CONVERTED_MODEL)" QUANTIZED_TYPE="$(QUANTIZED_TYPE)" ./scripts/utils/quantize.sh ${CONVERTED_MODEL} ${QUANTIZED_TYPE}
-	@echo "Export the quantized model path to QUANTIZED_MODEL variable in your environment"
+	$(call quantize_model,$(CONVERTED_MODEL),QUANTIZED_MODEL)
 
 causal-run-quantized-model:
 	@QUANTIZED_MODEL="$(QUANTIZED_MODEL)" ./scripts/causal/run-converted-model.sh ${QUANTIZED_MODEL}
@@ -117,9 +131,15 @@ embedding-quantize-Q8_0: embedding-quantize-model
 embedding-quantize-Q4_0: QUANTIZED_TYPE = Q4_0
 embedding-quantize-Q4_0: embedding-quantize-model
 
+# For Quantization Aware Trained (QAT) models in Q4_0 we explicitly set the
+# token embedding and output types to Q8_0 instead of the default Q6_K.
+embedding-quantize-qat-Q4_0: QUANTIZED_TYPE = Q4_0
+embedding-quantize-qat-Q4_0: TOKEN_EMBD_TYPE = Q8_0
+embedding-quantize-qat-Q4_0: OUTPUT_TYPE = Q8_0
+embedding-quantize-qat-Q4_0: embedding-quantize-model
+
 embedding-quantize-model:
-	@./scripts/utils/quantize.sh ${CONVERTED_EMBEDDING_MODEL} ${QUANTIZED_TYPE}
-	@echo "Export the quantized model path to QUANTIZED_EMBEDDING_MODEL variable in your environment"
+	$(call quantize_model,$(CONVERTED_EMBEDDING_MODEL),QUANTIZED_EMBEDDING_MODEL)
 
 embedding-run-quantized-model:
 	@./scripts/embedding/run-converted-model.sh ${QUANTIZED_EMBEDDING_MODEL}
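For reference, with the `qat-Q4_0` settings above, the `quantize_model` macro expands to roughly the following call (the model path is illustrative):

```sh
# Roughly what `make causal-quantize-qat-Q4_0` runs; the GGUF path is a placeholder.
CONVERTED_MODEL="/path/to/converted-model.gguf" QUANTIZED_TYPE="Q4_0" \
TOKEN_EMBD_TYPE="Q8_0" OUTPUT_TYPE="Q8_0" \
./scripts/utils/quantize.sh "/path/to/converted-model.gguf" "Q4_0" "Q8_0" "Q8_0"
```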

examples/model-conversion/README.md

Lines changed: 24 additions & 0 deletions
@@ -137,6 +137,18 @@ Then the quantized model can be run using the following command:
 (venv) $ make causal-run-quantized-model
 ```
 
+### Quantizing QAT (Quantization Aware Training) models
+When quantizing to `Q4_0`, the default data type for the token embedding weights
+will be `Q6_K`. For models that are going to be uploaded to ggml-org it is
+recommended to use `Q8_0` instead for the embeddings and output tensors.
+The reason is that although `Q6_K` is smaller in size, it requires more compute
+to unpack, which can hurt performance during output generation when the entire
+embedding matrix must be dequantized to compute vocabulary logits. `Q8_0`
+provides practically full quality with better computational efficiency.
+```console
+(venv) $ make causal-quantize-qat-Q4_0
+```
+
 
 ## Embedding Language Model Conversion
 
@@ -238,6 +250,18 @@ Then the quantized model can be run using the following command:
 (venv) $ make embedding-run-quantized-model
 ```
 
+### Quantizing QAT (Quantization Aware Training) models
+When quantizing to `Q4_0`, the default data type for the token embedding weights
+will be `Q6_K`. For models that are going to be uploaded to ggml-org it is
+recommended to use `Q8_0` instead for the embeddings and output tensors.
+The reason is that although `Q6_K` is smaller in size, it requires more compute
+to unpack, which can hurt performance during output generation when the entire
+embedding matrix must be dequantized to compute vocabulary logits. `Q8_0`
+provides practically full quality with better computational efficiency.
+```console
+(venv) $ make embedding-quantize-qat-Q4_0
+```
+
 ## Perplexity Evaluation
 
 ### Simple perplexity evaluation

examples/model-conversion/scripts/utils/quantize.sh

Lines changed: 16 additions & 2 deletions
@@ -4,6 +4,8 @@ set -e
 
 CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
 QUANTIZED_TYPE="${2:-"$QUANTIZED_TYPE"}"
+TOKEN_EMBD_TYPE="${3:-"${TOKEN_EMBD_TYPE}"}"
+OUTPUT_TYPE="${4:-"${OUTPUT_TYPE}"}"
 QUANTIZED_MODEL=$CONVERTED_MODEL
 
 # Final check if we have a model path
@@ -14,6 +16,11 @@ if [ -z "$CONVERTED_MODEL" ]; then
     exit 1
 fi
 
+if [ -z "$QUANTIZED_TYPE" ]; then
+    echo "Error: QUANTIZED_TYPE is required" >&2
+    exit 1
+fi
+
 echo $CONVERTED_MODEL
 
 # Process the quantized model filename
@@ -26,9 +33,16 @@ else
     exit 1
 fi
 
-
 cmake --build ../../build --target llama-quantize -j8
 
-../../build/bin/llama-quantize $CONVERTED_MODEL $QUANTIZED_MODEL $QUANTIZED_TYPE
+echo $TOKEN_EMBD_TYPE
+echo $OUTPUT_TYPE
+
+CMD_ARGS=("../../build/bin/llama-quantize")
+[[ -n "$TOKEN_EMBD_TYPE" ]] && CMD_ARGS+=("--token-embedding-type" "$TOKEN_EMBD_TYPE")
+[[ -n "$OUTPUT_TYPE" ]] && CMD_ARGS+=("--output-tensor-type" "$OUTPUT_TYPE")
+CMD_ARGS+=("$CONVERTED_MODEL" "$QUANTIZED_MODEL" "$QUANTIZED_TYPE")
+
+"${CMD_ARGS[@]}"
 
 echo "Quantized model saved to: $QUANTIZED_MODEL"
