Skip to content

Commit caa9965

Browse files
committed
[5506930]Add support in ModelOpt for generating mixed-precision (INT4+INT8) ONNX models
Signed-off-by: unknown <[email protected]>
1 parent d6d2e75 commit caa9965

File tree

4 files changed

+493
-160
lines changed

4 files changed

+493
-160
lines changed

examples/windows/onnx_ptq/genai_llm/quantize.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -365,7 +365,7 @@ def main(args):
365365
f"\n--Quantize-Script-- algo={args.algo}, dataset={args.dataset}, calib_size={args.calib_size}, "
366366
f"batch_size={args.batch_size}, block_size={args.block_size}, add-position-ids={args.add_position_ids}, "
367367
f"past-kv={args.add_past_kv_inputs}, rcalib={args.use_random_calib}, device={args.device}, "
368-
f"use_zero_point={args.use_zero_point}, use_fp32={args.use_fp32}\n"
368+
f"use_zero_point={args.use_zero_point}, use_fp32={args.use_fp32}, k_quant_mixed={args.k_quant_mixed}\n"
369369
)
370370

371371
print(
@@ -435,6 +435,8 @@ def main(args):
435435
awqclip_alpha_step=args.awqclip_alpha_step,
436436
awqclip_alpha_min=args.awqclip_alpha_min,
437437
awqclip_bsz_col=args.awqclip_bsz_col,
438+
k_quant_mixed=args.k_quant_mixed,
439+
int8_layers=args.int8_layers,
438440
)
439441
logging.info(f"\nQuantization process took {time.time() - t} seconds")
440442

@@ -594,6 +596,20 @@ def main(args):
594596
default=False,
595597
action="store_true",
596598
)
597-
599+
parser.add_argument(
600+
"--k_quant_mixed",
601+
default=False,
602+
action="store_true",
603+
help="Enable mixed-precision (INT4+INT8) k_quant quantization",
604+
)
605+
parser.add_argument(
606+
"--int8_layers",
607+
type=str,
608+
default="",
609+
help=(
610+
"Comma-separated list of layer patterns to quantize to INT8 instead of INT4. "
611+
"Example: 'layers.0,layers.1,lm_head'"
612+
),
613+
)
598614
args = parser.parse_args()
599615
main(args)

0 commit comments

Comments
 (0)