From 86193d70839321b03065e7ae37e8a069e8d44efa Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Tue, 9 Dec 2025 03:39:52 -0500 Subject: [PATCH 01/15] fix OOM issue and lm_head unsupport issue Signed-off-by: He, Xin3 --- .../quantization/auto_round/llama3/README.md | 8 ++++++++ .../quantization/auto_round/llama3/run_benchmark.sh | 4 ++-- .../quantization/auto_round/llama3/run_quant.sh | 9 +++------ 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md index 1b3a01172ee..978afb054eb 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md @@ -88,6 +88,8 @@ Notes: ### Llama3 Quantization Recipes +Here we provide several recipes for Llama3 models. The relative accuracy loss of quantized model should be less than 1%. + #### Llama 3.1 8B MXFP8 AutoRound tuning helps improve the accuracy, `iters` and `nsamples` is higher than default. @@ -131,6 +133,8 @@ RTN (Round-to-Nearest) is enough to keep accuracy. CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=Llama-3.3-70B --dtype=mxfp8 --input_model=/models/Llama-3.3-70B-Instruct/ --output_model=Llama-3.3-70B-MXFP8 ``` +> Note: Within the accuracy threshold, lm_head quantization is acceptable, but this feature is not enabled here to support vLLM inference. + #### Llama 3.3 70B MXFP4 (Mixed with MXFP8, Target_bits=5.8) `Target_bits=5.8` is an empirical value. @@ -147,6 +151,8 @@ RTN (Round-to-Nearest) is enough to keep accuracy. CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=Llama-3.1-70B --dtype=mxfp8 --input_model=/models/Llama-3.1-70B-Instruct/ --output_model=Llama-3.1-70B-MXFP8 ``` +> Note: Within the accuracy threshold, lm_head quantization is acceptable, but this feature is not enabled here to support vLLM inference. + #### Llama 3.1 70B NVFP4 RTN (Round-to-Nearest) is enough to keep accuracy. @@ -155,6 +161,8 @@ RTN (Round-to-Nearest) is enough to keep accuracy. CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_quant.sh --topology=Llama-3.1-70B --dtype=nvfp4 --input_model=/models/Llama-3.1-70B-Instruct/ --output_model=Llama-3.1-70B-NVFP4 ``` +> Note: Within the accuracy threshold, lm_head quantization is acceptable, but this feature is not enabled here to support vLLM inference. + #### Llama 3.1 70B uNVFP4 RTN (Round-to-Nearest) is enough to keep accuracy. 
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh index 87b635be52f..9fb7e32e68b 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh @@ -68,11 +68,11 @@ run_evaluation() { echo "Running evaluation for tasks: $tasks (add_bos_token=$add_bos_token)" # Print the command being executed - local cmd="lm_eval --model vllm --model_args pretrained=\"$MODEL_PATH\",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,data_parallel_size=1 --tasks $tasks --batch_size $BATCH_SIZE" + local cmd="lm_eval --model vllm --model_args pretrained=\"$MODEL_PATH\",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=0.8,data_parallel_size=1 --tasks $tasks --batch_size $BATCH_SIZE" echo "Executing command: $cmd" lm_eval --model vllm \ - --model_args pretrained="$MODEL_PATH",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,data_parallel_size=1 \ + --model_args pretrained="$MODEL_PATH",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=0.8,data_parallel_size=1 \ --tasks $tasks \ --batch_size $BATCH_SIZE diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh index d50deaf6b3c..91e56f182e7 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh @@ -95,13 +95,12 @@ case "$TOPOLOGY" in case "$DTYPE" in "mxfp8") echo "Running Llama 3.3 70B MXFP8 quantization..." - CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --dtype MXFP8 --quant_lm_head --iters 0 --export_path \"$OUTPUT_MODEL\"" + CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --dtype MXFP8 --iters 0 --export_path \"$OUTPUT_MODEL\"" echo "Executing command: $CMD" python quantize.py \ --model_name_or_path "$INPUT_MODEL" \ $COMMON_ARGS \ --dtype MXFP8 \ - --quant_lm_head \ --iters 0 \ --export_path "$OUTPUT_MODEL" ;; @@ -140,25 +139,23 @@ case "$TOPOLOGY" in case "$DTYPE" in "mxfp8") echo "Running Llama 3.1 70B MXFP8 quantization..." - CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --dtype MXFP8 --quant_lm_head --iters 0 --export_path \"$OUTPUT_MODEL\"" + CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --dtype MXFP8 --iters 0 --export_path \"$OUTPUT_MODEL\"" echo "Executing command: $CMD" python quantize.py \ --model_name_or_path "$INPUT_MODEL" \ $COMMON_ARGS \ --dtype MXFP8 \ - --quant_lm_head \ --iters 0 \ --export_path "$OUTPUT_MODEL" ;; "nvfp4") echo "Running Llama 3.1 70B NVFP4 quantization..." 
- CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --dtype NVFP4 --quant_lm_head --iters 0 --export_format llm_compressor --export_path \"$OUTPUT_MODEL\"" + CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --dtype NVFP4 --iters 0 --export_format llm_compressor --export_path \"$OUTPUT_MODEL\"" echo "Executing command: $CMD" python quantize.py \ --model_name_or_path "$INPUT_MODEL" \ $COMMON_ARGS \ --dtype NVFP4 \ - --quant_lm_head \ --iters 0 \ --export_format llm_compressor \ --export_path "$OUTPUT_MODEL" From c0522ecdb9783e59aff74202474cbb35a5b29d6c Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Thu, 11 Dec 2025 04:02:38 -0500 Subject: [PATCH 02/15] mem from 0.8 to 0.65 Signed-off-by: He, Xin3 --- .../quantization/auto_round/llama3/run_benchmark.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh index 9fb7e32e68b..ef848b9735b 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh @@ -68,11 +68,11 @@ run_evaluation() { echo "Running evaluation for tasks: $tasks (add_bos_token=$add_bos_token)" # Print the command being executed - local cmd="lm_eval --model vllm --model_args pretrained=\"$MODEL_PATH\",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=0.8,data_parallel_size=1 --tasks $tasks --batch_size $BATCH_SIZE" + local cmd="lm_eval --model vllm --model_args pretrained=\"$MODEL_PATH\",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=0.6,data_parallel_size=1 --tasks $tasks --batch_size $BATCH_SIZE" echo "Executing command: $cmd" lm_eval --model vllm \ - --model_args pretrained="$MODEL_PATH",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=0.8,data_parallel_size=1 \ + --model_args pretrained="$MODEL_PATH",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=0.65,data_parallel_size=1 \ --tasks $tasks \ --batch_size $BATCH_SIZE From ab0366273825684c6b1eed13e4ea35721972ae09 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Fri, 12 Dec 2025 05:45:43 -0500 Subject: [PATCH 03/15] adapt gpu_memory_utilization for mxfp4/8 Signed-off-by: He, Xin3 --- .../quantization/auto_round/llama3/README.md | 16 ++++++++-------- .../auto_round/llama3/run_benchmark.sh | 12 +++++++++--- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md index 978afb054eb..c90a77b1ad2 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md @@ -194,27 +194,27 @@ For convenience, we provide a benchmark script that automatically handles GPU de 1. 
**Llama 3.1 8B MXFP8** (1 GPU): ```bash -CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh --model_path=Llama-3.1-8B-MXFP8 +CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh --model_path=Llama-3.1-8B-MXFP8 --gpu_memory_utilization=0.8 ``` 2. **Llama 3.1 8B MXFP4 Mixed** (1 GPU): ```bash -CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh --model_path=Llama-3.1-8B-MXFP4-MXFP8 +CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh --model_path=Llama-3.1-8B-MXFP4-MXFP8 --gpu_memory_utilization=0.6 ``` -3. **Llama 3.3 70B MXFP8** (4 GPU): +3. **Llama 3.3 70B MXFP8** (2 GPU): ```bash -CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh --model_path=Llama-3.3-70B-MXFP8 +CUDA_VISIBLE_DEVICES=0,1 bash run_benchmark.sh --model_path=Llama-3.3-70B-MXFP8 --gpu_memory_utilization=0.8 ``` -4. **Llama 3.3 70B MXFP4 Mixed** (4 GPU): +4. **Llama 3.3 70B MXFP4 Mixed** (2 GPU): ```bash -CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh --model_path=Llama-3.3-70B-MXFP4-MXFP8 +CUDA_VISIBLE_DEVICES=0,1 bash run_benchmark.sh --model_path=Llama-3.3-70B-MXFP4-MXFP8 --gpu_memory_utilization=0.6 ``` -5. **Llama 3.1 70B MXFP8** (4 GPU): +5. **Llama 3.1 70B MXFP8** (2 GPU): ```bash -CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh --model_path=Llama-3.1-70B-MXFP8 +CUDA_VISIBLE_DEVICES=0,1 bash run_benchmark.sh --model_path=Llama-3.1-70B-MXFP8 --gpu_memory_utilization=0.8 ``` The script automatically: diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh index ef848b9735b..a33646756eb 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh @@ -4,7 +4,8 @@ # Parse command line arguments TASKS="piqa,hellaswag,mmlu,gsm8k" -BATCH_SIZE=8 +BATCH_SIZE=512 +GPU_MEMORY_UTILIZATION=0.8 while [[ $# -gt 0 ]]; do case $1 in @@ -20,6 +21,10 @@ while [[ $# -gt 0 ]]; do BATCH_SIZE="${1#*=}" shift ;; + --gpu_memory_utilization=*) + GPU_MEMORY_UTILIZATION="${1#*=}" + shift + ;; *) echo "Unknown parameter: $1" exit 1 @@ -48,6 +53,7 @@ echo " Model Path: $MODEL_PATH" echo " Tasks: $TASKS" echo " Batch Size: $BATCH_SIZE" echo " Tensor Parallel Size: $TENSOR_PARALLEL_SIZE" +echo " GPU Memory Utilization: $GPU_MEMORY_UTILIZATION" echo " CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" # Check if the model exists @@ -68,11 +74,11 @@ run_evaluation() { echo "Running evaluation for tasks: $tasks (add_bos_token=$add_bos_token)" # Print the command being executed - local cmd="lm_eval --model vllm --model_args pretrained=\"$MODEL_PATH\",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=0.6,data_parallel_size=1 --tasks $tasks --batch_size $BATCH_SIZE" + local cmd="lm_eval --model vllm --model_args pretrained=\"$MODEL_PATH\",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=$GPU_MEMORY_UTILIZATION,data_parallel_size=1,max_model_len=8192,max_num_batched_tokens=32768 --tasks $tasks --batch_size $BATCH_SIZE" echo "Executing command: $cmd" lm_eval --model vllm \ - --model_args pretrained="$MODEL_PATH",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=0.65,data_parallel_size=1 \ + --model_args 
pretrained="$MODEL_PATH",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=$GPU_MEMORY_UTILIZATION,data_parallel_size=1,max_model_len=8192,max_num_batched_tokens=32768 \ --tasks $tasks \ --batch_size $BATCH_SIZE From 6979ad168db1068aec797b249f4e98d6fbe2431f Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Fri, 12 Dec 2025 05:49:59 -0500 Subject: [PATCH 04/15] fix bug Signed-off-by: He, Xin3 --- .../quantization/auto_round/llama3/run_benchmark.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh index a33646756eb..ae0fa5c0e08 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh @@ -74,11 +74,11 @@ run_evaluation() { echo "Running evaluation for tasks: $tasks (add_bos_token=$add_bos_token)" # Print the command being executed - local cmd="lm_eval --model vllm --model_args pretrained=\"$MODEL_PATH\",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=$GPU_MEMORY_UTILIZATION,data_parallel_size=1,max_model_len=8192,max_num_batched_tokens=32768 --tasks $tasks --batch_size $BATCH_SIZE" + local cmd="lm_eval --model vllm --model_args pretrained=\"$MODEL_PATH\",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=$GPU_MEMORY_UTILIZATION,data_parallel_size=1,max_model_len=8192 --tasks $tasks --batch_size $BATCH_SIZE" echo "Executing command: $cmd" lm_eval --model vllm \ - --model_args pretrained="$MODEL_PATH",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=$GPU_MEMORY_UTILIZATION,data_parallel_size=1,max_model_len=8192,max_num_batched_tokens=32768 \ + --model_args pretrained="$MODEL_PATH",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=$GPU_MEMORY_UTILIZATION,data_parallel_size=1,max_model_len=8192 \ --tasks $tasks \ --batch_size $BATCH_SIZE From 0628fd3983065a94366876859bf510ae1b971017 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Fri, 12 Dec 2025 06:03:44 -0500 Subject: [PATCH 05/15] reasonable batch size for time estimation Signed-off-by: He, Xin3 --- .../quantization/auto_round/llama3/run_benchmark.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh index ae0fa5c0e08..02bbeaba8af 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh @@ -4,7 +4,7 @@ # Parse command line arguments TASKS="piqa,hellaswag,mmlu,gsm8k" -BATCH_SIZE=512 +BATCH_SIZE=64 GPU_MEMORY_UTILIZATION=0.8 while [[ $# -gt 0 ]]; do From ce2b6bfeb81644b2130cd73820eb73883e700c80 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Mon, 15 Dec 2025 21:48:21 -0500 Subject: [PATCH 06/15] increase target bits for llama3.3 70b mxfp4_mixed Signed-off-by: He, Xin3 --- .../quantization/auto_round/llama3/README.md | 4 ++-- 
.../quantization/auto_round/llama3/quantize.py | 4 ++-- .../quantization/auto_round/llama3/run_quant.sh | 14 +++++++------- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md index c90a77b1ad2..a2b91e4a83b 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md @@ -135,9 +135,9 @@ CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=Llama-3.3-70B --dtype=mxfp8 > Note: Within the accuracy threshold, lm_head quantization is acceptable, but this feature is not enabled here to support vLLM inference. -#### Llama 3.3 70B MXFP4 (Mixed with MXFP8, Target_bits=5.8) +#### Llama 3.3 70B MXFP4 (Mixed with MXFP8, Target_bits=6.0) -`Target_bits=5.8` is an empirical value. +`Target_bits=6.0` is an empirical value. ```bash CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=Llama-3.3-70B --dtype=mxfp4_mixed --input_model=/models/Llama-3.3-70B-Instruct/ --output_model=Llama-3.3-70B-MXFP4-MXFP8 diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py index 7824425bb0e..985bb58a592 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py @@ -150,7 +150,7 @@ def get_accuracy(model_name_or_path, tokenizer=None, tasks="mmlu", limit=None): help="options for mix precision" ) parser.add_argument( - "--shared_layer", + "--shared_layers", type=str, nargs="+", action='append', @@ -242,7 +242,7 @@ def load_recipe_results(file_path): scheme=args.dtype, target_bits=args.target_bits, options=args.options, - shared_layers=args.shared_layer, + shared_layers=args.shared_layers, enable_torch_compile=args.enable_torch_compile, low_gpu_mem_usage=args.low_gpu_mem_usage, export_format=args.export_format, diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh index 91e56f182e7..bc2f1cb85d9 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh @@ -73,15 +73,15 @@ case "$TOPOLOGY" in ;; "mxfp4_mixed") echo "Running Llama 3.1 8B MXFP4 (Mixed with MXFP8) quantization..." 
- CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --target_bits 7.8 --options \"MXFP4\" \"MXFP8\" --shared_layer \"k_proj\" \"v_proj\" \"q_proj\" --shared_layer \"gate_proj\" \"up_proj\" --export_path \"$OUTPUT_MODEL\"" + CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --target_bits 7.8 --options \"MXFP4\" \"MXFP8\" --shared_layers \"k_proj\" \"v_proj\" \"q_proj\" --shared_layers \"gate_proj\" \"up_proj\" --export_path \"$OUTPUT_MODEL\"" echo "Executing command: $CMD" python quantize.py \ --model_name_or_path "$INPUT_MODEL" \ $COMMON_ARGS \ --target_bits 7.8 \ --options "MXFP4" "MXFP8" \ - --shared_layer "k_proj" "v_proj" "q_proj" \ - --shared_layer "gate_proj" "up_proj" \ + --shared_layers "k_proj" "v_proj" "q_proj" \ + --shared_layers "gate_proj" "up_proj" \ --export_path "$OUTPUT_MODEL" ;; *) @@ -117,15 +117,15 @@ case "$TOPOLOGY" in ;; "mxfp4_mixed") echo "Running Llama 3.3 70B MXFP4 (Mixed with MXFP8) quantization..." - CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --target_bits 5.8 --options \"MXFP4\" \"MXFP8\" --shared_layer \"k_proj\" \"v_proj\" \"q_proj\" --shared_layer \"gate_proj\" \"up_proj\" --export_path \"$OUTPUT_MODEL\"" + CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --target_bits 6.0 --options \"MXFP4\" \"MXFP8\" --shared_layers \"k_proj\" \"v_proj\" \"q_proj\" --shared_layers \"gate_proj\" \"up_proj\" --export_path \"$OUTPUT_MODEL\"" echo "Executing command: $CMD" python quantize.py \ --model_name_or_path "$INPUT_MODEL" \ $COMMON_ARGS \ - --target_bits 5.8 \ + --target_bits 6.0 \ --options "MXFP4" "MXFP8" \ - --shared_layer "k_proj" "v_proj" "q_proj" \ - --shared_layer "gate_proj" "up_proj" \ + --shared_layers "k_proj" "v_proj" "q_proj" \ + --shared_layers "gate_proj" "up_proj" \ --export_path "$OUTPUT_MODEL" ;; *) From 31b894061276d04bfba9bb1aa7962d6de64a55d0 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Mon, 15 Dec 2025 22:28:11 -0500 Subject: [PATCH 07/15] fix typo Signed-off-by: He, Xin3 --- .../quantization/auto_round/llama3/quantize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py index 985bb58a592..53c459120b4 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py @@ -198,7 +198,7 @@ def get_accuracy(model_name_or_path, tokenizer=None, tasks="mmlu", limit=None): print("Target data type:", args.dtype) else: print("Target data type for mix precision:", args.options) - print("Layers sharing the same data type:", args.shared_layer) + print("Layers sharing the same data type:", args.shared_layers) model, tokenizer = initialize_model_and_tokenizer(args.model_name_or_path) if args.quantize: From e7541429cf494c11a4be50783ebff978853f66a1 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Wed, 17 Dec 2025 03:50:39 -0500 Subject: [PATCH 08/15] add tuning for nvfp4 Signed-off-by: He, Xin3 --- .../quantization/auto_round/llama3/run_quant.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh 
b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh index bc2f1cb85d9..ad6ec55354c 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh @@ -150,13 +150,12 @@ case "$TOPOLOGY" in ;; "nvfp4") echo "Running Llama 3.1 70B NVFP4 quantization..." - CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --dtype NVFP4 --iters 0 --export_format llm_compressor --export_path \"$OUTPUT_MODEL\"" + CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --dtype NVFP4 --export_format llm_compressor --export_path \"$OUTPUT_MODEL\"" echo "Executing command: $CMD" python quantize.py \ --model_name_or_path "$INPUT_MODEL" \ $COMMON_ARGS \ --dtype NVFP4 \ - --iters 0 \ --export_format llm_compressor \ --export_path "$OUTPUT_MODEL" ;; @@ -196,4 +195,4 @@ if [[ $? -eq 0 ]]; then else echo "Quantization failed!" exit 1 -fi \ No newline at end of file +fi From 194f61a8080dc50beb326bb36cba87488a630bbc Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Wed, 17 Dec 2025 09:37:23 -0500 Subject: [PATCH 09/15] apply chat template for benchmark Signed-off-by: He, Xin3 --- .../auto_round/llama3/run_benchmark.sh | 34 +++---------------- 1 file changed, 5 insertions(+), 29 deletions(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh index 02bbeaba8af..059aeed473f 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh @@ -80,7 +80,9 @@ run_evaluation() { lm_eval --model vllm \ --model_args pretrained="$MODEL_PATH",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=$GPU_MEMORY_UTILIZATION,data_parallel_size=1,max_model_len=8192 \ --tasks $tasks \ - --batch_size $BATCH_SIZE + --batch_size $BATCH_SIZE \ + --apply_chat_template \ + --fewshot_as_multiturn if [[ $? -ne 0 ]]; then echo "Error: Evaluation failed for tasks: $tasks" @@ -88,34 +90,8 @@ run_evaluation() { fi } -# Check if tasks contain gsm8k (requires add_bos_token=False) -if [[ "$TASKS" == *"gsm8k"* ]]; then - # If gsm8k is the only task - if [[ "$TASKS" == "gsm8k" ]]; then - run_evaluation "$TASKS" false - else - # Split tasks: run gsm8k separately with add_bos_token=False - OTHER_TASKS=$(echo "$TASKS" | sed 's/,*gsm8k,*//' | sed 's/^,//' | sed 's/,$//') - - if [[ -n "$OTHER_TASKS" ]]; then - echo "Running general tasks with add_bos_token=True" - run_evaluation "$OTHER_TASKS" true - - if [[ $? -eq 0 ]]; then - echo "Running GSM8K with add_bos_token=False" - run_evaluation "gsm8k" false - else - echo "Skipping GSM8K due to previous failure" - exit 1 - fi - else - run_evaluation "gsm8k" false - fi - fi -else - # No gsm8k task, use add_bos_token=True for all tasks - run_evaluation "$TASKS" true -fi +# Run all tasks together with add_bos_token=True +run_evaluation "$TASKS" true if [[ $? -eq 0 ]]; then echo "Benchmark completed successfully!" 
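Patch 09 above drops the old add_bos_token split and instead passes `--apply_chat_template --fewshot_as_multiturn` for every task. Rendered with concrete values, the `lm_eval` command that `run_evaluation` assembles at this point looks roughly like the sketch below; the model path, tensor-parallel size, and batch size are illustrative assumptions rather than values fixed by the patch.

```bash
# Sketch of the vLLM-backed lm_eval call run_evaluation builds after patch 09.
# Assumptions: lm_eval with the vLLM backend is installed; MODEL, the tensor
# parallel size, and the batch size below are examples, not mandated values.
MODEL=Llama-3.3-70B-MXFP8

lm_eval --model vllm \
    --model_args pretrained=$MODEL,add_bos_token=True,tensor_parallel_size=2,gpu_memory_utilization=0.8,data_parallel_size=1,max_model_len=8192 \
    --tasks piqa,hellaswag,mmlu,gsm8k \
    --batch_size 64 \
    --apply_chat_template \
    --fewshot_as_multiturn
```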
From 9e47ba7205ab77fd8df19af76e70462146432d32 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Wed, 17 Dec 2025 10:30:37 -0500 Subject: [PATCH 10/15] apply chat only for gsm8k Signed-off-by: He, Xin3 --- .../auto_round/llama3/quantize.py | 8 ++++- .../auto_round/llama3/run_benchmark.sh | 36 ++++++++++++++++--- 2 files changed, 38 insertions(+), 6 deletions(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py index 53c459120b4..3ada4fe97bf 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py @@ -82,15 +82,21 @@ def get_accuracy(model_name_or_path, tokenizer=None, tasks="mmlu", limit=None): ########################## gms8k (ahead of normal tasks) ######################### if test_gsm8k: + if tokenizer.chat_template: + apply_chat_template, fewshot_as_multiturn = True, True + else: + apply_chat_template, fewshot_as_multiturn = False, False lm = HFLM( pretrained=model_name_or_path, tokenizer=tokenizer, - add_bos_token=False, + add_bos_token=True, batch_size=args.eval_batch_size, ) results_gsm8k = lm_eval.simple_evaluate( lm, tasks=["gsm8k"], + apply_chat_template=apply_chat_template, + fewshot_as_multiturn=fewshot_as_multiturn, limit=args.limit if limit is None else limit, ) for task_name, task_results in results_gsm8k["results"].items(): diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh index 059aeed473f..63a361b3994 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh @@ -70,19 +70,19 @@ export TORCH_COMPILE_DISABLE=1 run_evaluation() { local tasks=$1 local add_bos_token=$2 + local extra_args=$3 echo "Running evaluation for tasks: $tasks (add_bos_token=$add_bos_token)" # Print the command being executed - local cmd="lm_eval --model vllm --model_args pretrained=\"$MODEL_PATH\",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=$GPU_MEMORY_UTILIZATION,data_parallel_size=1,max_model_len=8192 --tasks $tasks --batch_size $BATCH_SIZE" + local cmd="lm_eval --model vllm --model_args pretrained=\"$MODEL_PATH\",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=$GPU_MEMORY_UTILIZATION,data_parallel_size=1,max_model_len=8192 --tasks $tasks --batch_size $BATCH_SIZE $extra_args" echo "Executing command: $cmd" lm_eval --model vllm \ --model_args pretrained="$MODEL_PATH",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=$GPU_MEMORY_UTILIZATION,data_parallel_size=1,max_model_len=8192 \ --tasks $tasks \ --batch_size $BATCH_SIZE \ - --apply_chat_template \ - --fewshot_as_multiturn + $extra_args if [[ $? 
-ne 0 ]]; then echo "Error: Evaluation failed for tasks: $tasks" @@ -90,8 +90,34 @@ run_evaluation() { fi } -# Run all tasks together with add_bos_token=True -run_evaluation "$TASKS" true +# Check if tasks contain gsm8k +if [[ "$TASKS" == *"gsm8k"* ]]; then + # If gsm8k is the only task + if [[ "$TASKS" == "gsm8k" ]]; then + run_evaluation "$TASKS" true "--apply_chat_template --fewshot_as_multiturn" + else + # Split tasks: run gsm8k separately + OTHER_TASKS=$(echo "$TASKS" | sed 's/,*gsm8k,*//' | sed 's/^,//' | sed 's/,$//') + + if [[ -n "$OTHER_TASKS" ]]; then + echo "Running general tasks" + run_evaluation "$OTHER_TASKS" true "" + + if [[ $? -eq 0 ]]; then + echo "Running GSM8K with chat template" + run_evaluation "gsm8k" true "--apply_chat_template --fewshot_as_multiturn" + else + echo "Skipping GSM8K due to previous failure" + exit 1 + fi + else + run_evaluation "gsm8k" true "--apply_chat_template --fewshot_as_multiturn" + fi + fi +else + # No gsm8k task + run_evaluation "$TASKS" true "" +fi if [[ $? -eq 0 ]]; then echo "Benchmark completed successfully!" From f0f200adb04d7dfe409f099826368a784abb2184 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Thu, 18 Dec 2025 00:12:28 -0500 Subject: [PATCH 11/15] change to mmlu_llama, gsm8k_llama Signed-off-by: He, Xin3 --- .../auto_round/llama3/quantize.py | 79 +++++++++---------- .../auto_round/llama3/run_benchmark.sh | 73 +++++++++++------ 2 files changed, 88 insertions(+), 64 deletions(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py index 3ada4fe97bf..f51fb19a8c6 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py @@ -65,58 +65,57 @@ def dispatch_model_on_devices(model): return model + @torch.no_grad() -def get_accuracy(model_name_or_path, tokenizer=None, tasks="mmlu", limit=None): +def get_accuracy(model_name_or_path, tokenizer=None, eval_tasks="mmlu", limit=None): os.environ.setdefault("TOKENIZERS_PARALLELISM", "false") - eval_tasks = copy.deepcopy(tasks) # avoid removing gsm8k from original list all_accuracy = {} - test_gsm8k = False - test_normal = False - if "gsm8k" in eval_tasks: - test_gsm8k = True - eval_tasks.remove("gsm8k") - if eval_tasks: - test_normal = True + special_tasks = [] + normal_tasks = [] + # Identify special tasks + for t in eval_tasks: + if t in ["gsm8k_llama", "mmlu_llama"]: + special_tasks.append(t) + else: + normal_tasks.append(t) import lm_eval from lm_eval.models.huggingface import HFLM - ########################## gms8k (ahead of normal tasks) ######################### - if test_gsm8k: - if tokenizer.chat_template: - apply_chat_template, fewshot_as_multiturn = True, True - else: - apply_chat_template, fewshot_as_multiturn = False, False - lm = HFLM( - pretrained=model_name_or_path, - tokenizer=tokenizer, - add_bos_token=True, - batch_size=args.eval_batch_size, - ) - results_gsm8k = lm_eval.simple_evaluate( + lm = HFLM( + pretrained=model_name_or_path, + tokenizer=tokenizer, + add_bos_token=True, + batch_size=args.eval_batch_size, + ) + # Run special tasks with chat template + for special_task in special_tasks: + results_special = lm_eval.simple_evaluate( lm, - tasks=["gsm8k"], - apply_chat_template=apply_chat_template, - fewshot_as_multiturn=fewshot_as_multiturn, + 
tasks=[special_task], + apply_chat_template=True, + fewshot_as_multiturn=True, limit=args.limit if limit is None else limit, ) - for task_name, task_results in results_gsm8k["results"].items(): - accu = task_results["exact_match,strict-match"] - all_accuracy[task_name] = accu - ########################## gms8k end ######################### - if test_normal: - lm = HFLM( - pretrained=model_name_or_path, - tokenizer=tokenizer, - add_bos_token=True, - batch_size=args.eval_batch_size, - ) + for task_name, task_results in results_special["results"].items(): + # gsm8k_llama uses exact_match,strict-match, mmlu_llama may use acc,none + if task_name in special_tasks: + if "exact_match,strict_match" in task_results: + accu = task_results["exact_match,strict_match"] + elif "acc,none" in task_results: + accu = task_results["acc,none"] + else: + accu = list(task_results.values())[0] + all_accuracy[task_name] = accu + + # Run normal tasks without chat template + if normal_tasks: results = lm_eval.simple_evaluate( lm, - tasks=eval_tasks, + tasks=normal_tasks, limit=args.limit if limit is None else limit, ) for task_name, task_results in results["results"].items(): - if "acc,none" in task_results and task_name in eval_tasks: + if "acc,none" in task_results and task_name in normal_tasks: accu = task_results["acc,none"] all_accuracy[task_name] = accu for task_name, accu in all_accuracy.items(): @@ -191,8 +190,8 @@ def get_accuracy(model_name_or_path, tokenizer=None, tasks="mmlu", limit=None): default=[ "piqa", "hellaswag", - "mmlu", - "gsm8k", + "mmlu_llama", + "gsm8k_llama", ], help="tasks for accuracy validation, text-generation and code-generation tasks are different.", ) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh index 63a361b3994..6a07fbd9991 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh @@ -3,7 +3,7 @@ # Usage: CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh --model_path= [--tasks=] [--batch_size=] # Parse command line arguments -TASKS="piqa,hellaswag,mmlu,gsm8k" +TASKS="piqa,hellaswag,mmlu_llama,gsm8k_llama" BATCH_SIZE=64 GPU_MEMORY_UTILIZATION=0.8 @@ -83,39 +83,64 @@ run_evaluation() { --tasks $tasks \ --batch_size $BATCH_SIZE \ $extra_args - + if [[ $? -ne 0 ]]; then echo "Error: Evaluation failed for tasks: $tasks" return 1 fi } -# Check if tasks contain gsm8k -if [[ "$TASKS" == *"gsm8k"* ]]; then - # If gsm8k is the only task - if [[ "$TASKS" == "gsm8k" ]]; then - run_evaluation "$TASKS" true "--apply_chat_template --fewshot_as_multiturn" + +# Check if tasks contain gsm8k_llama or mmlu_llama +NEED_SPLIT=false +OTHER_TASKS="$TASKS" +SPECIAL_TASKS="" + +if [[ "$TASKS" == *"gsm8k_llama"* ]]; then + SPECIAL_TASKS="gsm8k_llama" + OTHER_TASKS=$(echo "$OTHER_TASKS" | sed 's/,*gsm8k_llama,*//' | sed 's/^,//' | sed 's/,$//') + NEED_SPLIT=true +fi +if [[ "$TASKS" == *"mmlu_llama"* ]]; then + if [[ -n "$SPECIAL_TASKS" ]]; then + SPECIAL_TASKS="$SPECIAL_TASKS,mmlu_llama" else - # Split tasks: run gsm8k separately - OTHER_TASKS=$(echo "$TASKS" | sed 's/,*gsm8k,*//' | sed 's/^,//' | sed 's/,$//') - - if [[ -n "$OTHER_TASKS" ]]; then - echo "Running general tasks" - run_evaluation "$OTHER_TASKS" true "" - - if [[ $? 
-eq 0 ]]; then - echo "Running GSM8K with chat template" - run_evaluation "gsm8k" true "--apply_chat_template --fewshot_as_multiturn" - else - echo "Skipping GSM8K due to previous failure" - exit 1 - fi + SPECIAL_TASKS="mmlu_llama" + fi + OTHER_TASKS=$(echo "$OTHER_TASKS" | sed 's/,*mmlu_llama,*//' | sed 's/^,//' | sed 's/,$//') + NEED_SPLIT=true +fi + +if [[ "$NEED_SPLIT" == true ]]; then + if [[ -n "$OTHER_TASKS" ]]; then + echo "Running general tasks" + run_evaluation "$OTHER_TASKS" true "" + if [[ $? -eq 0 ]]; then + IFS=',' read -ra SPECIAL_ARRAY <<< "$SPECIAL_TASKS" + for special_task in "${SPECIAL_ARRAY[@]}"; do + echo "Running $special_task with chat template" + run_evaluation "$special_task" true "--apply_chat_template --fewshot_as_multiturn" + if [[ $? -ne 0 ]]; then + echo "Benchmark failed on $special_task!" + exit 1 + fi + done else - run_evaluation "gsm8k" true "--apply_chat_template --fewshot_as_multiturn" + echo "Skipping special tasks due to previous failure" + exit 1 fi + else + IFS=',' read -ra SPECIAL_ARRAY <<< "$SPECIAL_TASKS" + for special_task in "${SPECIAL_ARRAY[@]}"; do + echo "Running $special_task with chat template" + run_evaluation "$special_task" true "--apply_chat_template --fewshot_as_multiturn" + if [[ $? -ne 0 ]]; then + echo "Benchmark failed on $special_task!" + exit 1 + fi + done fi else - # No gsm8k task run_evaluation "$TASKS" true "" fi @@ -124,4 +149,4 @@ if [[ $? -eq 0 ]]; then else echo "Benchmark failed!" exit 1 -fi \ No newline at end of file +fi From e1c57467366b0ed392d283640ac0165fd8b71a9f Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Thu, 18 Dec 2025 21:19:48 -0500 Subject: [PATCH 12/15] recover bits=5.8 and rtn for nvfp4 Signed-off-by: He, Xin3 --- .../quantization/auto_round/llama3/README.md | 4 ++-- .../quantization/auto_round/llama3/run_quant.sh | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md index a2b91e4a83b..c90a77b1ad2 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md @@ -135,9 +135,9 @@ CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=Llama-3.3-70B --dtype=mxfp8 > Note: Within the accuracy threshold, lm_head quantization is acceptable, but this feature is not enabled here to support vLLM inference. -#### Llama 3.3 70B MXFP4 (Mixed with MXFP8, Target_bits=6.0) +#### Llama 3.3 70B MXFP4 (Mixed with MXFP8, Target_bits=5.8) -`Target_bits=6.0` is an empirical value. +`Target_bits=5.8` is an empirical value. 
```bash CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=Llama-3.3-70B --dtype=mxfp4_mixed --input_model=/models/Llama-3.3-70B-Instruct/ --output_model=Llama-3.3-70B-MXFP4-MXFP8 diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh index ad6ec55354c..46d7581033b 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh @@ -117,12 +117,12 @@ case "$TOPOLOGY" in ;; "mxfp4_mixed") echo "Running Llama 3.3 70B MXFP4 (Mixed with MXFP8) quantization..." - CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --target_bits 6.0 --options \"MXFP4\" \"MXFP8\" --shared_layers \"k_proj\" \"v_proj\" \"q_proj\" --shared_layers \"gate_proj\" \"up_proj\" --export_path \"$OUTPUT_MODEL\"" + CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --target_bits 5.8 --options \"MXFP4\" \"MXFP8\" --shared_layers \"k_proj\" \"v_proj\" \"q_proj\" --shared_layers \"gate_proj\" \"up_proj\" --export_path \"$OUTPUT_MODEL\"" echo "Executing command: $CMD" python quantize.py \ --model_name_or_path "$INPUT_MODEL" \ $COMMON_ARGS \ - --target_bits 6.0 \ + --target_bits 5.8 \ --options "MXFP4" "MXFP8" \ --shared_layers "k_proj" "v_proj" "q_proj" \ --shared_layers "gate_proj" "up_proj" \ @@ -150,12 +150,13 @@ case "$TOPOLOGY" in ;; "nvfp4") echo "Running Llama 3.1 70B NVFP4 quantization..." - CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --dtype NVFP4 --export_format llm_compressor --export_path \"$OUTPUT_MODEL\"" + CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --dtype NVFP4 --iters 0 --export_format llm_compressor --export_path \"$OUTPUT_MODEL\"" echo "Executing command: $CMD" python quantize.py \ --model_name_or_path "$INPUT_MODEL" \ $COMMON_ARGS \ --dtype NVFP4 \ + --iters 0 \ --export_format llm_compressor \ --export_path "$OUTPUT_MODEL" ;; From 906c259fd89e95e866aaef76cedd95f0c89292b7 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Fri, 19 Dec 2025 00:51:07 -0500 Subject: [PATCH 13/15] add autoround tuning Signed-off-by: He, Xin3 --- .../quantization/auto_round/llama3/README.md | 4 ++-- .../quantization/auto_round/llama3/run_quant.sh | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md index c90a77b1ad2..da07f4176ae 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md @@ -155,10 +155,10 @@ CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=Llama-3.1-70B --dtype=mxfp8 #### Llama 3.1 70B NVFP4 -RTN (Round-to-Nearest) is enough to keep accuracy. +AutoRound tuning helps improve the accuracy. 
```bash -CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_quant.sh --topology=Llama-3.1-70B --dtype=nvfp4 --input_model=/models/Llama-3.1-70B-Instruct/ --output_model=Llama-3.1-70B-NVFP4 +CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=Llama-3.1-70B --dtype=nvfp4 --input_model=/models/Llama-3.1-70B-Instruct/ --output_model=Llama-3.1-70B-NVFP4 ``` > Note: Within the accuracy threshold, lm_head quantization is acceptable, but this feature is not enabled here to support vLLM inference. diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh index 46d7581033b..fa02b476739 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh @@ -150,13 +150,12 @@ case "$TOPOLOGY" in ;; "nvfp4") echo "Running Llama 3.1 70B NVFP4 quantization..." - CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --dtype NVFP4 --iters 0 --export_format llm_compressor --export_path \"$OUTPUT_MODEL\"" + CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --dtype NVFP4 --export_format llm_compressor --export_path \"$OUTPUT_MODEL\"" echo "Executing command: $CMD" python quantize.py \ --model_name_or_path "$INPUT_MODEL" \ $COMMON_ARGS \ --dtype NVFP4 \ - --iters 0 \ --export_format llm_compressor \ --export_path "$OUTPUT_MODEL" ;; From c198b0b2757e3f45f1796432a5bc3ec27d5dd193 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Sun, 21 Dec 2025 06:13:40 -0500 Subject: [PATCH 14/15] remove torch_compile for nvfp4 Signed-off-by: He, Xin3 --- .../quantization/auto_round/llama3/run_quant.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh index fa02b476739..5ac00da274a 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh @@ -150,11 +150,12 @@ case "$TOPOLOGY" in ;; "nvfp4") echo "Running Llama 3.1 70B NVFP4 quantization..." 
- CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --dtype NVFP4 --export_format llm_compressor --export_path \"$OUTPUT_MODEL\"" + CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" --quantize --low_gpu_mem_usage --dtype NVFP4 --export_format llm_compressor --export_path \"$OUTPUT_MODEL\"" echo "Executing command: $CMD" python quantize.py \ --model_name_or_path "$INPUT_MODEL" \ - $COMMON_ARGS \ + --quantize \ + --low_gpu_mem_usage \ --dtype NVFP4 \ --export_format llm_compressor \ --export_path "$OUTPUT_MODEL" From 100743a1b027cd2d5a6d163562d5848e9c57a4a1 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Sun, 21 Dec 2025 06:15:37 -0500 Subject: [PATCH 15/15] add 1 more card for nvfp4 quant Signed-off-by: He, Xin3 --- .../language-modeling/quantization/auto_round/llama3/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md index da07f4176ae..a7872952696 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md @@ -158,7 +158,7 @@ CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=Llama-3.1-70B --dtype=mxfp8 AutoRound tuning helps improve the accuracy. ```bash -CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=Llama-3.1-70B --dtype=nvfp4 --input_model=/models/Llama-3.1-70B-Instruct/ --output_model=Llama-3.1-70B-NVFP4 +CUDA_VISIBLE_DEVICES=0,1 bash run_quant.sh --topology=Llama-3.1-70B --dtype=nvfp4 --input_model=/models/Llama-3.1-70B-Instruct/ --output_model=Llama-3.1-70B-NVFP4 ``` > Note: Within the accuracy threshold, lm_head quantization is acceptable, but this feature is not enabled here to support vLLM inference.