Commit 62cef26
model-conversion : add qat-q4 quantization targets (ggml-org#15588)
This commit adds two targets to the Makefile for quantizing Quantization Aware Trained (QAT) models to Q4_0 format. The motivation is that these targets set the token embedding and output tensor data types to Q8_0 instead of the default Q6_K. This is something we want to enforce for QAT Q4_0 models that are to be uploaded to ggml-org on Hugging Face, to guarantee the best quality.
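For reference, the new qat targets effectively boil down to an llama-quantize invocation along these lines (a sketch only; the GGUF file names are illustrative):

```console
# illustrative file names; the flags mirror what the diff below passes via CMD_ARGS
../../build/bin/llama-quantize \
    --token-embedding-type Q8_0 \
    --output-tensor-type Q8_0 \
    model-f16.gguf model-qat-Q4_0.gguf Q4_0
```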
1 parent 8f5afa9 commit 62cef26

File tree: 3 files changed, 65 additions & 7 deletions


examples/model-conversion/Makefile

Lines changed: 25 additions & 5 deletions
@@ -1,4 +1,5 @@
-# Validation functions
+MAKEFLAGS += --no-print-directory
+
 define validate_model_path
 	@if [ -z "$(MODEL_PATH)" ]; then \
 		echo "Error: MODEL_PATH must be provided either as:"; \
@@ -17,6 +18,13 @@ define validate_embedding_model_path
 	fi
 endef
 
+define quantize_model
+	@CONVERTED_MODEL="$(1)" QUANTIZED_TYPE="$(QUANTIZED_TYPE)" \
+	TOKEN_EMBD_TYPE="$(TOKEN_EMBD_TYPE)" OUTPUT_TYPE="$(OUTPUT_TYPE)" \
+	./scripts/utils/quantize.sh "$(1)" "$(QUANTIZED_TYPE)" "$(TOKEN_EMBD_TYPE)" "$(OUTPUT_TYPE)"
+	@echo "Export the quantized model path to $(2) variable in your environment"
+endef
+
 ###
 ### Casual Model targets/recipes
 ###
@@ -67,9 +75,15 @@ causal-quantize-Q8_0: causal-quantize-model
 causal-quantize-Q4_0: QUANTIZED_TYPE = Q4_0
 causal-quantize-Q4_0: causal-quantize-model
 
+# For Quantization Aware Trained (QAT) models in Q4_0 we explicitly set the
+# token embedding and output types to Q8_0 instead of the default Q6_K.
+causal-quantize-qat-Q4_0: QUANTIZED_TYPE = Q4_0
+causal-quantize-qat-Q4_0: TOKEN_EMBD_TYPE = Q8_0
+causal-quantize-qat-Q4_0: OUTPUT_TYPE = Q8_0
+causal-quantize-qat-Q4_0: causal-quantize-model
+
 causal-quantize-model:
-	@CONVERTED_MODEL="$(CONVERTED_MODEL)" QUANTIZED_TYPE="$(QUANTIZED_TYPE)" ./scripts/utils/quantize.sh ${CONVERTED_MODEL} ${QUANTIZED_TYPE}
-	@echo "Export the quantized model path to QUANTIZED_MODEL variable in your environment"
+	$(call quantize_model,$(CONVERTED_MODEL),QUANTIZED_MODEL)
 
 causal-run-quantized-model:
 	@QUANTIZED_MODEL="$(QUANTIZED_MODEL)" ./scripts/causal/run-converted-model.sh ${QUANTIZED_MODEL}
@@ -117,9 +131,15 @@ embedding-quantize-Q8_0: embedding-quantize-model
 embedding-quantize-Q4_0: QUANTIZED_TYPE = Q4_0
 embedding-quantize-Q4_0: embedding-quantize-model
 
+# For Quantization Aware Trained (QAT) models in Q4_0 we explicitly set the
+# token embedding and output types to Q8_0 instead of the default Q6_K.
+embedding-quantize-qat-Q4_0: QUANTIZED_TYPE = Q4_0
+embedding-quantize-qat-Q4_0: TOKEN_EMBD_TYPE = Q8_0
+embedding-quantize-qat-Q4_0: OUTPUT_TYPE = Q8_0
+embedding-quantize-qat-Q4_0: embedding-quantize-model
+
 embedding-quantize-model:
-	@./scripts/utils/quantize.sh ${CONVERTED_EMBEDDING_MODEL} ${QUANTIZED_TYPE}
-	@echo "Export the quantized model path to QUANTIZED_EMBEDDING_MODEL variable in your environment"
+	$(call quantize_model,$(CONVERTED_EMBEDDING_MODEL),QUANTIZED_EMBEDDING_MODEL)
 
 embedding-run-quantized-model:
 	@./scripts/embedding/run-converted-model.sh ${QUANTIZED_EMBEDDING_MODEL}
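
The new targets are used like the existing quantize targets; a minimal sketch, assuming CONVERTED_MODEL already points at a converted GGUF (the path below is illustrative):

```console
# illustrative model path; CONVERTED_MODEL is the variable the Makefile reads
(venv) $ export CONVERTED_MODEL=/path/to/model-f16.gguf
(venv) $ make causal-quantize-qat-Q4_0
```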

examples/model-conversion/README.md

Lines changed: 24 additions & 0 deletions
@@ -137,6 +137,18 @@ Then the quantized model can be run using the following command:
 (venv) $ make causal-run-quantized-model
 ```
 
+### Quantizing QAT (Quantization Aware Training) models
+When quantizing to `Q4_0`, the default data type for the token embedding weights
+will be `Q6_K`. For models that are going to be uploaded to ggml-org it is
+recommended to use `Q8_0` instead for the embeddings and output tensors.
+The reason is that although `Q6_K` is smaller in size, it requires more compute
+to unpack, which can hurt performance during output generation when the entire
+embedding matrix must be dequantized to compute vocabulary logits. `Q8_0`
+provides practically full quality with better computational efficiency.
+```console
+(venv) $ make causal-quantize-qat-Q4_0
+```
+
 
 ## Embedding Language Model Conversion
 
@@ -238,6 +250,18 @@ Then the quantized model can be run using the following command:
 (venv) $ make embedding-run-quantized-model
 ```
 
+### Quantizing QAT (Quantization Aware Training) models
+When quantizing to `Q4_0`, the default data type for the token embedding weights
+will be `Q6_K`. For models that are going to be uploaded to ggml-org it is
+recommended to use `Q8_0` instead for the embeddings and output tensors.
+The reason is that although `Q6_K` is smaller in size, it requires more compute
+to unpack, which can hurt performance during output generation when the entire
+embedding matrix must be dequantized to compute vocabulary logits. `Q8_0`
+provides practically full quality with better computational efficiency.
+```console
+(venv) $ make embedding-quantize-qat-Q4_0
+```
+
 ## Perplexity Evaluation
 
 ### Simple perplexity evaluation

examples/model-conversion/scripts/utils/quantize.sh

Lines changed: 16 additions & 2 deletions
@@ -4,6 +4,8 @@ set -e
 
 CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
 QUANTIZED_TYPE="${2:-"$QUANTIZED_TYPE"}"
+TOKEN_EMBD_TYPE="${3:-"${TOKEN_EMBD_TYPE}"}"
+OUTPUT_TYPE="${4:-"${OUTPUT_TYPE}"}"
 QUANTIZED_MODEL=$CONVERTED_MODEL
 
 # Final check if we have a model path
@@ -14,6 +16,11 @@ if [ -z "$CONVERTED_MODEL" ]; then
     exit 1
 fi
 
+if [ -z "$QUANTIZED_TYPE" ]; then
+    echo "Error: QUANTIZED_TYPE is required" >&2
+    exit 1
+fi
+
 echo $CONVERTED_MODEL
 
 # Process the quantized model filename
@@ -26,9 +33,16 @@ else
     exit 1
 fi
 
-
 cmake --build ../../build --target llama-quantize -j8
 
-../../build/bin/llama-quantize $CONVERTED_MODEL $QUANTIZED_MODEL $QUANTIZED_TYPE
+echo $TOKEN_EMBD_TYPE
+echo $OUTPUT_TYPE
+
+CMD_ARGS=("../../build/bin/llama-quantize")
+[[ -n "$TOKEN_EMBD_TYPE" ]] && CMD_ARGS+=("--token-embedding-type" "$TOKEN_EMBD_TYPE")
+[[ -n "$OUTPUT_TYPE" ]] && CMD_ARGS+=("--output-tensor-type" "$OUTPUT_TYPE")
+CMD_ARGS+=("$CONVERTED_MODEL" "$QUANTIZED_MODEL" "$QUANTIZED_TYPE")
+
+"${CMD_ARGS[@]}"
 
 echo "Quantized model saved to: $QUANTIZED_MODEL"
