Skip to content

Commit e0ff1ef

Browse files
committed
Update qkv to int8 and other improvements for better MMLU score
1 parent b861a74 commit e0ff1ef

File tree

7 files changed

+263
-57
lines changed

7 files changed

+263
-57
lines changed

google-gemma-Gemma3-4B/qnn/gemma-3-4b.ipynb

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -155,6 +155,15 @@
155155
"This is needed for running the Olive recipes for this model"
156156
]
157157
},
158+
{
159+
"cell_type": "code",
160+
"execution_count": null,
161+
"metadata": {},
162+
"outputs": [],
163+
"source": [
164+
"!patch ./olive_venv/lib/python3.10/site-packages/gptqmodel/utils/model.py < gptqmodel_int8.patch"
165+
]
166+
},
158167
{
159168
"cell_type": "code",
160169
"execution_count": null,
@@ -177,6 +186,13 @@
177186
" print(f\"Downloaded and replaced: {dest}\")"
178187
]
179188
},
189+
{
190+
"cell_type": "code",
191+
"execution_count": null,
192+
"metadata": {},
193+
"outputs": [],
194+
"source": []
195+
},
180196
{
181197
"cell_type": "markdown",
182198
"metadata": {},

google-gemma-Gemma3-4B/qnn/gemma3-4b-embedding-qnn-config.json

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@
77
"input_names": [ "input_ids", "image_features" ],
88
"input_shapes": [ [ 1, 64 ], [ 1, 256, 2560 ] ],
99
"input_types": [ "int64", "float32" ],
10-
"output_names": [ "/model/embed_tokens/Mul/output_0" ],
10+
"output_names": [ "/model/embed_tokens/Mul_output_cast_0" ],
1111
"output_shapes": [ [ 1, 64, 2560 ] ],
1212
"dynamic_axes": {
1313
"input_ids": { "0": "batch_size", "1": "seq_length" },

google-gemma-Gemma3-4B/qnn/gemma3-4b-text-qnn-config.json

Lines changed: 114 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -9,62 +9,150 @@
99
"qnn_system": {
1010
"type": "PythonEnvironment",
1111
"python_environment_path": "",
12-
"accelerators": [ { "execution_providers": [ "QNNExecutionProvider" ] } ]
12+
"accelerators": [
13+
{
14+
"execution_providers": [
15+
"QNNExecutionProvider"
16+
]
17+
}
18+
]
1319
}
1420
},
1521
"data_configs": [
1622
{
17-
"name": "gemma_data_config",
18-
"user_script": "custom_gemma3_4b_datasets.py",
19-
"load_dataset_config": { "type": "gemma_dataset", "model_id": "google/gemma-3-4b-it" }
23+
"name": "wikitext2_train_joined",
24+
"type": "HuggingfaceContainer",
25+
"load_dataset_config": {
26+
"data_name": "wikitext",
27+
"subset": "wikitext-2-raw-v1",
28+
"split": "train"
29+
},
30+
"pre_process_data_config": {
31+
"strategy": "join",
32+
"add_special_tokens": false,
33+
"max_seq_len": 4096,
34+
"max_samples": 128
35+
}
36+
},
37+
{
38+
"name": "wikitext2_train_act",
39+
"type": "HuggingfaceContainer",
40+
"load_dataset_config": {
41+
"data_name": "wikitext",
42+
"subset": "wikitext-2-raw-v1",
43+
"split": "train"
44+
},
45+
"pre_process_data_config": {
46+
"strategy": "line-by-line",
47+
"add_special_tokens": true,
48+
"max_samples": 200,
49+
"max_seq_len": 4096
50+
}
2051
}
2152
],
2253
"passes": {
54+
"cs": {
55+
"type": "CaptureSplitInfo",
56+
"num_splits": 2,
57+
"unique_embeds_lm_head_splits": true
58+
},
2359
"g": {
2460
"type": "GptqModel",
2561
"bits": 4,
2662
"sym": true,
2763
"group_size": -1,
2864
"lm_head": false,
2965
"device": "cuda",
30-
"data_config": "gemma_data_config"
66+
"data_config": "wikitext2_train_joined",
67+
"dynamic": {
68+
"+:.*v_proj*": {
69+
"bits": 8,
70+
"sym": true,
71+
"group_size": -1,
72+
"desc_act": true
73+
},
74+
"+:.*k_proj*": {
75+
"bits": 8,
76+
"sym": true,
77+
"group_size": -1,
78+
"desc_act": true
79+
},
80+
"+:.*q_proj*": {
81+
"bits": 8,
82+
"sym": true,
83+
"group_size": -1,
84+
"desc_act": true
85+
}
86+
}
3187
},
32-
"cs": { "type": "CaptureSplitInfo", "num_splits": 2, "unique_embeds_lm_head_splits": true },
3388
"mb": {
3489
"type": "ModelBuilder",
3590
"precision": "int4",
36-
"int4_block_size": 32,
91+
"int4_block_size": 16,
3792
"int4_accuracy_level": 4,
38-
"int4_op_types_to_quantize": [ "MatMul", "Gather" ]
93+
"int4_op_types_to_quantize": ["Gather", "MatMul"]
3994
},
4095
"mq": {
4196
"type": "MatMulNBitsToQDQ",
4297
"use_int4": true,
4398
"add_zero_point": true,
44-
"nodes_to_exclude": [ "/lm_head/MatMul_Q4" ],
99+
"nodes_to_exclude": [
100+
"/lm_head/MatMul_Q4"
101+
],
45102
"save_as_external_data": true
46103
},
47104
"gs": {
48105
"type": "GraphSurgeries",
49106
"surgeries": [
50-
{ "surgeon": "RemoveRopeMultiCache" },
51-
{ "surgeon": "AttentionMaskToSequenceLengths" },
52-
{ "surgeon": "SimplifiedLayerNormToL2Norm" }
107+
{
108+
"surgeon": "RemoveRopeMultiCache"
109+
},
110+
{
111+
"surgeon": "AttentionMaskToSequenceLengths"
112+
},
113+
{
114+
"surgeon": "SimplifiedLayerNormToL2Norm"
115+
}
116+
],
117+
"save_as_external_data": true
118+
},
119+
"f16": {
120+
"type": "OnnxFloatToFloat16",
121+
"op_include_list": [
122+
"GroupQueryAttention"
123+
],
124+
"keep_io_types": [
125+
"logits"
53126
],
54127
"save_as_external_data": true
55128
},
56129
"sq": {
57130
"type": "OnnxStaticQuantization",
58-
"data_config": "gemma_data_config",
131+
"data_config": "wikitext2_train_act",
59132
"activation_type": "uint16",
60133
"precision": "uint8",
61-
"calibration_providers": [ "CUDAExecutionProvider" ],
134+
"calibration_providers": [
135+
"CUDAExecutionProvider"
136+
],
62137
"quant_preprocess": true,
63-
"op_types_to_exclude": [ "GatherBlockQuantized", "GroupQueryAttention", "MatMulNBits" ],
64-
"save_as_external_data": true
138+
"op_types_to_exclude": [
139+
"GatherBlockQuantized",
140+
"GroupQueryAttention",
141+
"MatMulNBits"
142+
],
143+
"save_as_external_data": true,
144+
"extra_option": {
145+
"CalibStridedMinMax": 4
146+
}
147+
},
148+
"sp": {
149+
"type": "SplitModel"
150+
},
151+
"st": {
152+
"type": "StaticLLM",
153+
"batch_size": 1,
154+
"context_length": 64
65155
},
66-
"sp": { "type": "SplitModel" },
67-
"st": { "type": "StaticLLM", "batch_size": 1, "context_length": 64 },
68156
"cb": {
69157
"type": "EPContextBinaryGenerator",
70158
"provider_options": {
@@ -74,14 +162,19 @@
74162
"htp_arch": "v73",
75163
"soc_model": "60"
76164
},
77-
"session_options": { "intra_op_num_threads": 2, "inter_op_num_threads": 1 },
165+
"session_options": {
166+
"intra_op_num_threads": 2,
167+
"inter_op_num_threads": 1
168+
},
78169
"weight_sharing": true
79170
},
80-
"cp": { "type": "ComposeOnnxModels" }
171+
"cp": {
172+
"type": "ComposeOnnxModels"
173+
}
81174
},
82175
"target": "qnn_system",
83176
"log_severity_level": 0,
84177
"output_dir": "models/gemma3_qnn",
85178
"cache_dir": "cache",
86179
"no_artifacts": true
87-
}
180+
}

google-gemma-Gemma3-4B/qnn/gemma3-4b-vision-qnn-config.json

Lines changed: 55 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -4,38 +4,78 @@
44
"model_script": "custom_gemma3_4b_vision.py",
55
"model_loader": "load_gemma3_vision_model",
66
"io_config": {
7-
"input_names": [ "pixel_values" ],
8-
"input_shapes": [ [ 1, 3, 896, 896 ] ],
9-
"input_types": [ "float32" ],
10-
"output_names": [ "image_features" ],
11-
"output_shapes": [ [ 1, 256, 2560 ] ]
7+
"input_names": [
8+
"pixel_values"
9+
],
10+
"input_shapes": [
11+
[
12+
1,
13+
3,
14+
896,
15+
896
16+
]
17+
],
18+
"input_types": [
19+
"float32"
20+
],
21+
"output_names": [
22+
"image_features"
23+
],
24+
"output_shapes": [
25+
[
26+
1,
27+
256,
28+
2560
29+
]
30+
]
1231
}
1332
},
1433
"systems": {
1534
"qnn_system": {
1635
"type": "PythonEnvironment",
1736
"python_environment_path": "",
18-
"accelerators": [ { "execution_providers": [ "QNNExecutionProvider" ] } ]
37+
"accelerators": [
38+
{
39+
"execution_providers": [
40+
"QNNExecutionProvider"
41+
]
42+
}
43+
]
1944
}
2045
},
2146
"data_configs": [
2247
{
2348
"name": "gemma_vision_data_config",
2449
"user_script": "custom_gemma3_4b_datasets.py",
25-
"load_dataset_config": { "type": "gemma_image_dataset", "model_id": "google/gemma-3-4b-it" }
50+
"load_dataset_config": {
51+
"type": "gemma_image_dataset",
52+
"model_id": "google/gemma-3-4b-it"
53+
}
2654
}
2755
],
2856
"passes": {
29-
"conversion": { "type": "OnnxConversion", "target_opset": 20 },
30-
"surgery": { "type": "GraphSurgeries", "surgeries": [ { "surgeon": "MatMulAddToGemm" } ] },
57+
"conversion": {
58+
"type": "OnnxConversion",
59+
"target_opset": 20
60+
},
61+
"surgery": {
62+
"type": "GraphSurgeries",
63+
"surgeries": [
64+
{
65+
"surgeon": "MatMulAddToGemm"
66+
}
67+
]
68+
},
3169
"quantization": {
3270
"type": "OnnxStaticQuantization",
3371
"quant_preprocess": true,
3472
"data_config": "gemma_vision_data_config",
3573
"activation_type": "uint16",
3674
"precision": "uint8",
3775
"calibrate_method": "MinMax",
38-
"calibration_providers": [ "CUDAExecutionProvider" ],
76+
"calibration_providers": [
77+
"CUDAExecutionProvider"
78+
],
3979
"per_channel": true,
4080
"weight_symmetric": true
4181
},
@@ -49,11 +89,14 @@
4989
"soc_model": "60"
5090
}
5191
},
52-
"add_metadata": { "type": "AddOliveMetadata", "graph_name": "gemma-3-4b-it-vision" }
92+
"add_metadata": {
93+
"type": "AddOliveMetadata",
94+
"graph_name": "gemma-3-4b-it-vision"
95+
}
5396
},
5497
"target": "qnn_system",
5598
"log_severity_level": 1,
5699
"output_dir": "models/gemma-3-4b-it-vision",
57100
"cache_dir": "cache-vision",
58101
"no_artifacts": true
59-
}
102+
}

0 commit comments

Comments
 (0)