From 86193d70839321b03065e7ae37e8a069e8d44efa Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Tue, 9 Dec 2025 03:39:52 -0500 Subject: [PATCH 01/15] fix OOM issue and lm_head unsupport issue Signed-off-by: He, Xin3 --- .../quantization/auto_round/llama3/README.md | 8 ++++++++ .../quantization/auto_round/llama3/run_benchmark.sh | 4 ++-- .../quantization/auto_round/llama3/run_quant.sh | 9 +++------ 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md index 1b3a01172ee..978afb054eb 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md @@ -88,6 +88,8 @@ Notes: ### Llama3 Quantization Recipes +Here we provide several recipes for Llama3 models. The relative accuracy loss of quantized model should be less than 1%. + #### Llama 3.1 8B MXFP8 AutoRound tuning helps improve the accuracy, `iters` and `nsamples` is higher than default. @@ -131,6 +133,8 @@ RTN (Round-to-Nearest) is enough to keep accuracy. CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=Llama-3.3-70B --dtype=mxfp8 --input_model=/models/Llama-3.3-70B-Instruct/ --output_model=Llama-3.3-70B-MXFP8 ``` +> Note: Within the accuracy threshold, lm_head quantization is acceptable, but this feature is not enabled here to support vLLM inference. + #### Llama 3.3 70B MXFP4 (Mixed with MXFP8, Target_bits=5.8) `Target_bits=5.8` is an empirical value. @@ -147,6 +151,8 @@ RTN (Round-to-Nearest) is enough to keep accuracy. CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=Llama-3.1-70B --dtype=mxfp8 --input_model=/models/Llama-3.1-70B-Instruct/ --output_model=Llama-3.1-70B-MXFP8 ``` +> Note: Within the accuracy threshold, lm_head quantization is acceptable, but this feature is not enabled here to support vLLM inference. + #### Llama 3.1 70B NVFP4 RTN (Round-to-Nearest) is enough to keep accuracy. @@ -155,6 +161,8 @@ RTN (Round-to-Nearest) is enough to keep accuracy. CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_quant.sh --topology=Llama-3.1-70B --dtype=nvfp4 --input_model=/models/Llama-3.1-70B-Instruct/ --output_model=Llama-3.1-70B-NVFP4 ``` +> Note: Within the accuracy threshold, lm_head quantization is acceptable, but this feature is not enabled here to support vLLM inference. + #### Llama 3.1 70B uNVFP4 RTN (Round-to-Nearest) is enough to keep accuracy. 
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh index 87b635be52f..9fb7e32e68b 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh @@ -68,11 +68,11 @@ run_evaluation() { echo "Running evaluation for tasks: $tasks (add_bos_token=$add_bos_token)" # Print the command being executed - local cmd="lm_eval --model vllm --model_args pretrained=\"$MODEL_PATH\",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,data_parallel_size=1 --tasks $tasks --batch_size $BATCH_SIZE" + local cmd="lm_eval --model vllm --model_args pretrained=\"$MODEL_PATH\",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=0.8,data_parallel_size=1 --tasks $tasks --batch_size $BATCH_SIZE" echo "Executing command: $cmd" lm_eval --model vllm \ - --model_args pretrained="$MODEL_PATH",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,data_parallel_size=1 \ + --model_args pretrained="$MODEL_PATH",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=0.8,data_parallel_size=1 \ --tasks $tasks \ --batch_size $BATCH_SIZE diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh index d50deaf6b3c..91e56f182e7 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh @@ -95,13 +95,12 @@ case "$TOPOLOGY" in case "$DTYPE" in "mxfp8") echo "Running Llama 3.3 70B MXFP8 quantization..." - CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --dtype MXFP8 --quant_lm_head --iters 0 --export_path \"$OUTPUT_MODEL\"" + CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --dtype MXFP8 --iters 0 --export_path \"$OUTPUT_MODEL\"" echo "Executing command: $CMD" python quantize.py \ --model_name_or_path "$INPUT_MODEL" \ $COMMON_ARGS \ --dtype MXFP8 \ - --quant_lm_head \ --iters 0 \ --export_path "$OUTPUT_MODEL" ;; @@ -140,25 +139,23 @@ case "$TOPOLOGY" in case "$DTYPE" in "mxfp8") echo "Running Llama 3.1 70B MXFP8 quantization..." - CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --dtype MXFP8 --quant_lm_head --iters 0 --export_path \"$OUTPUT_MODEL\"" + CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --dtype MXFP8 --iters 0 --export_path \"$OUTPUT_MODEL\"" echo "Executing command: $CMD" python quantize.py \ --model_name_or_path "$INPUT_MODEL" \ $COMMON_ARGS \ --dtype MXFP8 \ - --quant_lm_head \ --iters 0 \ --export_path "$OUTPUT_MODEL" ;; "nvfp4") echo "Running Llama 3.1 70B NVFP4 quantization..." 
- CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --dtype NVFP4 --quant_lm_head --iters 0 --export_format llm_compressor --export_path \"$OUTPUT_MODEL\"" + CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --dtype NVFP4 --iters 0 --export_format llm_compressor --export_path \"$OUTPUT_MODEL\"" echo "Executing command: $CMD" python quantize.py \ --model_name_or_path "$INPUT_MODEL" \ $COMMON_ARGS \ --dtype NVFP4 \ - --quant_lm_head \ --iters 0 \ --export_format llm_compressor \ --export_path "$OUTPUT_MODEL" From c0522ecdb9783e59aff74202474cbb35a5b29d6c Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Thu, 11 Dec 2025 04:02:38 -0500 Subject: [PATCH 02/15] mem from 0.8 to 0.65 Signed-off-by: He, Xin3 --- .../quantization/auto_round/llama3/run_benchmark.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh index 9fb7e32e68b..ef848b9735b 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh @@ -68,11 +68,11 @@ run_evaluation() { echo "Running evaluation for tasks: $tasks (add_bos_token=$add_bos_token)" # Print the command being executed - local cmd="lm_eval --model vllm --model_args pretrained=\"$MODEL_PATH\",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=0.8,data_parallel_size=1 --tasks $tasks --batch_size $BATCH_SIZE" + local cmd="lm_eval --model vllm --model_args pretrained=\"$MODEL_PATH\",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=0.6,data_parallel_size=1 --tasks $tasks --batch_size $BATCH_SIZE" echo "Executing command: $cmd" lm_eval --model vllm \ - --model_args pretrained="$MODEL_PATH",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=0.8,data_parallel_size=1 \ + --model_args pretrained="$MODEL_PATH",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=0.65,data_parallel_size=1 \ --tasks $tasks \ --batch_size $BATCH_SIZE From ab0366273825684c6b1eed13e4ea35721972ae09 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Fri, 12 Dec 2025 05:45:43 -0500 Subject: [PATCH 03/15] adapt gpu_memory_utilization for mxfp4/8 Signed-off-by: He, Xin3 --- .../quantization/auto_round/llama3/README.md | 16 ++++++++-------- .../auto_round/llama3/run_benchmark.sh | 12 +++++++++--- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md index 978afb054eb..c90a77b1ad2 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md @@ -194,27 +194,27 @@ For convenience, we provide a benchmark script that automatically handles GPU de 1. 
**Llama 3.1 8B MXFP8** (1 GPU): ```bash -CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh --model_path=Llama-3.1-8B-MXFP8 +CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh --model_path=Llama-3.1-8B-MXFP8 --gpu_memory_utilization=0.8 ``` 2. **Llama 3.1 8B MXFP4 Mixed** (1 GPU): ```bash -CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh --model_path=Llama-3.1-8B-MXFP4-MXFP8 +CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh --model_path=Llama-3.1-8B-MXFP4-MXFP8 --gpu_memory_utilization=0.6 ``` -3. **Llama 3.3 70B MXFP8** (4 GPU): +3. **Llama 3.3 70B MXFP8** (2 GPU): ```bash -CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh --model_path=Llama-3.3-70B-MXFP8 +CUDA_VISIBLE_DEVICES=0,1 bash run_benchmark.sh --model_path=Llama-3.3-70B-MXFP8 --gpu_memory_utilization=0.8 ``` -4. **Llama 3.3 70B MXFP4 Mixed** (4 GPU): +4. **Llama 3.3 70B MXFP4 Mixed** (2 GPU): ```bash -CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh --model_path=Llama-3.3-70B-MXFP4-MXFP8 +CUDA_VISIBLE_DEVICES=0,1 bash run_benchmark.sh --model_path=Llama-3.3-70B-MXFP4-MXFP8 --gpu_memory_utilization=0.6 ``` -5. **Llama 3.1 70B MXFP8** (4 GPU): +5. **Llama 3.1 70B MXFP8** (2 GPU): ```bash -CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh --model_path=Llama-3.1-70B-MXFP8 +CUDA_VISIBLE_DEVICES=0,1 bash run_benchmark.sh --model_path=Llama-3.1-70B-MXFP8 --gpu_memory_utilization=0.8 ``` The script automatically: diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh index ef848b9735b..a33646756eb 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh @@ -4,7 +4,8 @@ # Parse command line arguments TASKS="piqa,hellaswag,mmlu,gsm8k" -BATCH_SIZE=8 +BATCH_SIZE=512 +GPU_MEMORY_UTILIZATION=0.8 while [[ $# -gt 0 ]]; do case $1 in @@ -20,6 +21,10 @@ while [[ $# -gt 0 ]]; do BATCH_SIZE="${1#*=}" shift ;; + --gpu_memory_utilization=*) + GPU_MEMORY_UTILIZATION="${1#*=}" + shift + ;; *) echo "Unknown parameter: $1" exit 1 @@ -48,6 +53,7 @@ echo " Model Path: $MODEL_PATH" echo " Tasks: $TASKS" echo " Batch Size: $BATCH_SIZE" echo " Tensor Parallel Size: $TENSOR_PARALLEL_SIZE" +echo " GPU Memory Utilization: $GPU_MEMORY_UTILIZATION" echo " CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" # Check if the model exists @@ -68,11 +74,11 @@ run_evaluation() { echo "Running evaluation for tasks: $tasks (add_bos_token=$add_bos_token)" # Print the command being executed - local cmd="lm_eval --model vllm --model_args pretrained=\"$MODEL_PATH\",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=0.6,data_parallel_size=1 --tasks $tasks --batch_size $BATCH_SIZE" + local cmd="lm_eval --model vllm --model_args pretrained=\"$MODEL_PATH\",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=$GPU_MEMORY_UTILIZATION,data_parallel_size=1,max_model_len=8192,max_num_batched_tokens=32768 --tasks $tasks --batch_size $BATCH_SIZE" echo "Executing command: $cmd" lm_eval --model vllm \ - --model_args pretrained="$MODEL_PATH",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=0.65,data_parallel_size=1 \ + --model_args 
pretrained="$MODEL_PATH",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=$GPU_MEMORY_UTILIZATION,data_parallel_size=1,max_model_len=8192,max_num_batched_tokens=32768 \ --tasks $tasks \ --batch_size $BATCH_SIZE From 6979ad168db1068aec797b249f4e98d6fbe2431f Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Fri, 12 Dec 2025 05:49:59 -0500 Subject: [PATCH 04/15] fix bug Signed-off-by: He, Xin3 --- .../quantization/auto_round/llama3/run_benchmark.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh index a33646756eb..ae0fa5c0e08 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh @@ -74,11 +74,11 @@ run_evaluation() { echo "Running evaluation for tasks: $tasks (add_bos_token=$add_bos_token)" # Print the command being executed - local cmd="lm_eval --model vllm --model_args pretrained=\"$MODEL_PATH\",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=$GPU_MEMORY_UTILIZATION,data_parallel_size=1,max_model_len=8192,max_num_batched_tokens=32768 --tasks $tasks --batch_size $BATCH_SIZE" + local cmd="lm_eval --model vllm --model_args pretrained=\"$MODEL_PATH\",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=$GPU_MEMORY_UTILIZATION,data_parallel_size=1,max_model_len=8192 --tasks $tasks --batch_size $BATCH_SIZE" echo "Executing command: $cmd" lm_eval --model vllm \ - --model_args pretrained="$MODEL_PATH",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=$GPU_MEMORY_UTILIZATION,data_parallel_size=1,max_model_len=8192,max_num_batched_tokens=32768 \ + --model_args pretrained="$MODEL_PATH",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=$GPU_MEMORY_UTILIZATION,data_parallel_size=1,max_model_len=8192 \ --tasks $tasks \ --batch_size $BATCH_SIZE From 0628fd3983065a94366876859bf510ae1b971017 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Fri, 12 Dec 2025 06:03:44 -0500 Subject: [PATCH 05/15] reasonable batch size for time estimation Signed-off-by: He, Xin3 --- .../quantization/auto_round/llama3/run_benchmark.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh index ae0fa5c0e08..02bbeaba8af 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh @@ -4,7 +4,7 @@ # Parse command line arguments TASKS="piqa,hellaswag,mmlu,gsm8k" -BATCH_SIZE=512 +BATCH_SIZE=64 GPU_MEMORY_UTILIZATION=0.8 while [[ $# -gt 0 ]]; do From ce2b6bfeb81644b2130cd73820eb73883e700c80 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Mon, 15 Dec 2025 21:48:21 -0500 Subject: [PATCH 06/15] increase target bits for llama3.3 70b mxfp4_mixed Signed-off-by: He, Xin3 --- .../quantization/auto_round/llama3/README.md | 4 ++-- 
.../quantization/auto_round/llama3/quantize.py | 4 ++-- .../quantization/auto_round/llama3/run_quant.sh | 14 +++++++------- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md index c90a77b1ad2..a2b91e4a83b 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md @@ -135,9 +135,9 @@ CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=Llama-3.3-70B --dtype=mxfp8 > Note: Within the accuracy threshold, lm_head quantization is acceptable, but this feature is not enabled here to support vLLM inference. -#### Llama 3.3 70B MXFP4 (Mixed with MXFP8, Target_bits=5.8) +#### Llama 3.3 70B MXFP4 (Mixed with MXFP8, Target_bits=6.0) -`Target_bits=5.8` is an empirical value. +`Target_bits=6.0` is an empirical value. ```bash CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=Llama-3.3-70B --dtype=mxfp4_mixed --input_model=/models/Llama-3.3-70B-Instruct/ --output_model=Llama-3.3-70B-MXFP4-MXFP8 diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py index 7824425bb0e..985bb58a592 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py @@ -150,7 +150,7 @@ def get_accuracy(model_name_or_path, tokenizer=None, tasks="mmlu", limit=None): help="options for mix precision" ) parser.add_argument( - "--shared_layer", + "--shared_layers", type=str, nargs="+", action='append', @@ -242,7 +242,7 @@ def load_recipe_results(file_path): scheme=args.dtype, target_bits=args.target_bits, options=args.options, - shared_layers=args.shared_layer, + shared_layers=args.shared_layers, enable_torch_compile=args.enable_torch_compile, low_gpu_mem_usage=args.low_gpu_mem_usage, export_format=args.export_format, diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh index 91e56f182e7..bc2f1cb85d9 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh @@ -73,15 +73,15 @@ case "$TOPOLOGY" in ;; "mxfp4_mixed") echo "Running Llama 3.1 8B MXFP4 (Mixed with MXFP8) quantization..." 
- CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --target_bits 7.8 --options \"MXFP4\" \"MXFP8\" --shared_layer \"k_proj\" \"v_proj\" \"q_proj\" --shared_layer \"gate_proj\" \"up_proj\" --export_path \"$OUTPUT_MODEL\"" + CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --target_bits 7.8 --options \"MXFP4\" \"MXFP8\" --shared_layers \"k_proj\" \"v_proj\" \"q_proj\" --shared_layers \"gate_proj\" \"up_proj\" --export_path \"$OUTPUT_MODEL\"" echo "Executing command: $CMD" python quantize.py \ --model_name_or_path "$INPUT_MODEL" \ $COMMON_ARGS \ --target_bits 7.8 \ --options "MXFP4" "MXFP8" \ - --shared_layer "k_proj" "v_proj" "q_proj" \ - --shared_layer "gate_proj" "up_proj" \ + --shared_layers "k_proj" "v_proj" "q_proj" \ + --shared_layers "gate_proj" "up_proj" \ --export_path "$OUTPUT_MODEL" ;; *) @@ -117,15 +117,15 @@ case "$TOPOLOGY" in ;; "mxfp4_mixed") echo "Running Llama 3.3 70B MXFP4 (Mixed with MXFP8) quantization..." - CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --target_bits 5.8 --options \"MXFP4\" \"MXFP8\" --shared_layer \"k_proj\" \"v_proj\" \"q_proj\" --shared_layer \"gate_proj\" \"up_proj\" --export_path \"$OUTPUT_MODEL\"" + CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --target_bits 6.0 --options \"MXFP4\" \"MXFP8\" --shared_layers \"k_proj\" \"v_proj\" \"q_proj\" --shared_layers \"gate_proj\" \"up_proj\" --export_path \"$OUTPUT_MODEL\"" echo "Executing command: $CMD" python quantize.py \ --model_name_or_path "$INPUT_MODEL" \ $COMMON_ARGS \ - --target_bits 5.8 \ + --target_bits 6.0 \ --options "MXFP4" "MXFP8" \ - --shared_layer "k_proj" "v_proj" "q_proj" \ - --shared_layer "gate_proj" "up_proj" \ + --shared_layers "k_proj" "v_proj" "q_proj" \ + --shared_layers "gate_proj" "up_proj" \ --export_path "$OUTPUT_MODEL" ;; *) From 31b894061276d04bfba9bb1aa7962d6de64a55d0 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Mon, 15 Dec 2025 22:28:11 -0500 Subject: [PATCH 07/15] fix typo Signed-off-by: He, Xin3 --- .../quantization/auto_round/llama3/quantize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py index 985bb58a592..53c459120b4 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py @@ -198,7 +198,7 @@ def get_accuracy(model_name_or_path, tokenizer=None, tasks="mmlu", limit=None): print("Target data type:", args.dtype) else: print("Target data type for mix precision:", args.options) - print("Layers sharing the same data type:", args.shared_layer) + print("Layers sharing the same data type:", args.shared_layers) model, tokenizer = initialize_model_and_tokenizer(args.model_name_or_path) if args.quantize: From e7541429cf494c11a4be50783ebff978853f66a1 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Wed, 17 Dec 2025 03:50:39 -0500 Subject: [PATCH 08/15] add tuning for nvfp4 Signed-off-by: He, Xin3 --- .../quantization/auto_round/llama3/run_quant.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh 
b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh index bc2f1cb85d9..ad6ec55354c 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh @@ -150,13 +150,12 @@ case "$TOPOLOGY" in ;; "nvfp4") echo "Running Llama 3.1 70B NVFP4 quantization..." - CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --dtype NVFP4 --iters 0 --export_format llm_compressor --export_path \"$OUTPUT_MODEL\"" + CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --dtype NVFP4 --export_format llm_compressor --export_path \"$OUTPUT_MODEL\"" echo "Executing command: $CMD" python quantize.py \ --model_name_or_path "$INPUT_MODEL" \ $COMMON_ARGS \ --dtype NVFP4 \ - --iters 0 \ --export_format llm_compressor \ --export_path "$OUTPUT_MODEL" ;; @@ -196,4 +195,4 @@ if [[ $? -eq 0 ]]; then else echo "Quantization failed!" exit 1 -fi \ No newline at end of file +fi From 194f61a8080dc50beb326bb36cba87488a630bbc Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Wed, 17 Dec 2025 09:37:23 -0500 Subject: [PATCH 09/15] apply chat template for benchmark Signed-off-by: He, Xin3 --- .../auto_round/llama3/run_benchmark.sh | 34 +++---------------- 1 file changed, 5 insertions(+), 29 deletions(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh index 02bbeaba8af..059aeed473f 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh @@ -80,7 +80,9 @@ run_evaluation() { lm_eval --model vllm \ --model_args pretrained="$MODEL_PATH",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=$GPU_MEMORY_UTILIZATION,data_parallel_size=1,max_model_len=8192 \ --tasks $tasks \ - --batch_size $BATCH_SIZE + --batch_size $BATCH_SIZE \ + --apply_chat_template \ + --fewshot_as_multiturn if [[ $? -ne 0 ]]; then echo "Error: Evaluation failed for tasks: $tasks" @@ -88,34 +90,8 @@ run_evaluation() { fi } -# Check if tasks contain gsm8k (requires add_bos_token=False) -if [[ "$TASKS" == *"gsm8k"* ]]; then - # If gsm8k is the only task - if [[ "$TASKS" == "gsm8k" ]]; then - run_evaluation "$TASKS" false - else - # Split tasks: run gsm8k separately with add_bos_token=False - OTHER_TASKS=$(echo "$TASKS" | sed 's/,*gsm8k,*//' | sed 's/^,//' | sed 's/,$//') - - if [[ -n "$OTHER_TASKS" ]]; then - echo "Running general tasks with add_bos_token=True" - run_evaluation "$OTHER_TASKS" true - - if [[ $? -eq 0 ]]; then - echo "Running GSM8K with add_bos_token=False" - run_evaluation "gsm8k" false - else - echo "Skipping GSM8K due to previous failure" - exit 1 - fi - else - run_evaluation "gsm8k" false - fi - fi -else - # No gsm8k task, use add_bos_token=True for all tasks - run_evaluation "$TASKS" true -fi +# Run all tasks together with add_bos_token=True +run_evaluation "$TASKS" true if [[ $? -eq 0 ]]; then echo "Benchmark completed successfully!" 
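Patch 09 above drops the old add_bos_token split and instead passes `--apply_chat_template --fewshot_as_multiturn` for every task. Rendered with concrete values, the `lm_eval` command that `run_evaluation` assembles at this point looks roughly like the sketch below; the model path, tensor-parallel size, and batch size are illustrative assumptions rather than values fixed by the patch.

```bash
# Sketch of the vLLM-backed lm_eval call run_evaluation builds after patch 09.
# Assumptions: lm_eval with the vLLM backend is installed; MODEL, the tensor
# parallel size, and the batch size below are examples, not mandated values.
MODEL=Llama-3.3-70B-MXFP8

lm_eval --model vllm \
    --model_args pretrained=$MODEL,add_bos_token=True,tensor_parallel_size=2,gpu_memory_utilization=0.8,data_parallel_size=1,max_model_len=8192 \
    --tasks piqa,hellaswag,mmlu,gsm8k \
    --batch_size 64 \
    --apply_chat_template \
    --fewshot_as_multiturn
```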
From 9e47ba7205ab77fd8df19af76e70462146432d32 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Wed, 17 Dec 2025 10:30:37 -0500 Subject: [PATCH 10/15] apply chat only for gsm8k Signed-off-by: He, Xin3 --- .../auto_round/llama3/quantize.py | 8 ++++- .../auto_round/llama3/run_benchmark.sh | 36 ++++++++++++++++--- 2 files changed, 38 insertions(+), 6 deletions(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py index 53c459120b4..3ada4fe97bf 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py @@ -82,15 +82,21 @@ def get_accuracy(model_name_or_path, tokenizer=None, tasks="mmlu", limit=None): ########################## gms8k (ahead of normal tasks) ######################### if test_gsm8k: + if tokenizer.chat_template: + apply_chat_template, fewshot_as_multiturn = True, True + else: + apply_chat_template, fewshot_as_multiturn = False, False lm = HFLM( pretrained=model_name_or_path, tokenizer=tokenizer, - add_bos_token=False, + add_bos_token=True, batch_size=args.eval_batch_size, ) results_gsm8k = lm_eval.simple_evaluate( lm, tasks=["gsm8k"], + apply_chat_template=apply_chat_template, + fewshot_as_multiturn=fewshot_as_multiturn, limit=args.limit if limit is None else limit, ) for task_name, task_results in results_gsm8k["results"].items(): diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh index 059aeed473f..63a361b3994 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh @@ -70,19 +70,19 @@ export TORCH_COMPILE_DISABLE=1 run_evaluation() { local tasks=$1 local add_bos_token=$2 + local extra_args=$3 echo "Running evaluation for tasks: $tasks (add_bos_token=$add_bos_token)" # Print the command being executed - local cmd="lm_eval --model vllm --model_args pretrained=\"$MODEL_PATH\",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=$GPU_MEMORY_UTILIZATION,data_parallel_size=1,max_model_len=8192 --tasks $tasks --batch_size $BATCH_SIZE" + local cmd="lm_eval --model vllm --model_args pretrained=\"$MODEL_PATH\",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=$GPU_MEMORY_UTILIZATION,data_parallel_size=1,max_model_len=8192 --tasks $tasks --batch_size $BATCH_SIZE $extra_args" echo "Executing command: $cmd" lm_eval --model vllm \ --model_args pretrained="$MODEL_PATH",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=$GPU_MEMORY_UTILIZATION,data_parallel_size=1,max_model_len=8192 \ --tasks $tasks \ --batch_size $BATCH_SIZE \ - --apply_chat_template \ - --fewshot_as_multiturn + $extra_args if [[ $? 
-ne 0 ]]; then echo "Error: Evaluation failed for tasks: $tasks" @@ -90,8 +90,34 @@ run_evaluation() { fi } -# Run all tasks together with add_bos_token=True -run_evaluation "$TASKS" true +# Check if tasks contain gsm8k +if [[ "$TASKS" == *"gsm8k"* ]]; then + # If gsm8k is the only task + if [[ "$TASKS" == "gsm8k" ]]; then + run_evaluation "$TASKS" true "--apply_chat_template --fewshot_as_multiturn" + else + # Split tasks: run gsm8k separately + OTHER_TASKS=$(echo "$TASKS" | sed 's/,*gsm8k,*//' | sed 's/^,//' | sed 's/,$//') + + if [[ -n "$OTHER_TASKS" ]]; then + echo "Running general tasks" + run_evaluation "$OTHER_TASKS" true "" + + if [[ $? -eq 0 ]]; then + echo "Running GSM8K with chat template" + run_evaluation "gsm8k" true "--apply_chat_template --fewshot_as_multiturn" + else + echo "Skipping GSM8K due to previous failure" + exit 1 + fi + else + run_evaluation "gsm8k" true "--apply_chat_template --fewshot_as_multiturn" + fi + fi +else + # No gsm8k task + run_evaluation "$TASKS" true "" +fi if [[ $? -eq 0 ]]; then echo "Benchmark completed successfully!" From f0f200adb04d7dfe409f099826368a784abb2184 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Thu, 18 Dec 2025 00:12:28 -0500 Subject: [PATCH 11/15] change to mmlu_llama, gsm8k_llama Signed-off-by: He, Xin3 --- .../auto_round/llama3/quantize.py | 79 +++++++++---------- .../auto_round/llama3/run_benchmark.sh | 73 +++++++++++------ 2 files changed, 88 insertions(+), 64 deletions(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py index 3ada4fe97bf..f51fb19a8c6 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py @@ -65,58 +65,57 @@ def dispatch_model_on_devices(model): return model + @torch.no_grad() -def get_accuracy(model_name_or_path, tokenizer=None, tasks="mmlu", limit=None): +def get_accuracy(model_name_or_path, tokenizer=None, eval_tasks="mmlu", limit=None): os.environ.setdefault("TOKENIZERS_PARALLELISM", "false") - eval_tasks = copy.deepcopy(tasks) # avoid removing gsm8k from original list all_accuracy = {} - test_gsm8k = False - test_normal = False - if "gsm8k" in eval_tasks: - test_gsm8k = True - eval_tasks.remove("gsm8k") - if eval_tasks: - test_normal = True + special_tasks = [] + normal_tasks = [] + # Identify special tasks + for t in eval_tasks: + if t in ["gsm8k_llama", "mmlu_llama"]: + special_tasks.append(t) + else: + normal_tasks.append(t) import lm_eval from lm_eval.models.huggingface import HFLM - ########################## gms8k (ahead of normal tasks) ######################### - if test_gsm8k: - if tokenizer.chat_template: - apply_chat_template, fewshot_as_multiturn = True, True - else: - apply_chat_template, fewshot_as_multiturn = False, False - lm = HFLM( - pretrained=model_name_or_path, - tokenizer=tokenizer, - add_bos_token=True, - batch_size=args.eval_batch_size, - ) - results_gsm8k = lm_eval.simple_evaluate( + lm = HFLM( + pretrained=model_name_or_path, + tokenizer=tokenizer, + add_bos_token=True, + batch_size=args.eval_batch_size, + ) + # Run special tasks with chat template + for special_task in special_tasks: + results_special = lm_eval.simple_evaluate( lm, - tasks=["gsm8k"], - apply_chat_template=apply_chat_template, - fewshot_as_multiturn=fewshot_as_multiturn, + 
tasks=[special_task], + apply_chat_template=True, + fewshot_as_multiturn=True, limit=args.limit if limit is None else limit, ) - for task_name, task_results in results_gsm8k["results"].items(): - accu = task_results["exact_match,strict-match"] - all_accuracy[task_name] = accu - ########################## gms8k end ######################### - if test_normal: - lm = HFLM( - pretrained=model_name_or_path, - tokenizer=tokenizer, - add_bos_token=True, - batch_size=args.eval_batch_size, - ) + for task_name, task_results in results_special["results"].items(): + # gsm8k_llama uses exact_match,strict-match, mmlu_llama may use acc,none + if task_name in special_tasks: + if "exact_match,strict_match" in task_results: + accu = task_results["exact_match,strict_match"] + elif "acc,none" in task_results: + accu = task_results["acc,none"] + else: + accu = list(task_results.values())[0] + all_accuracy[task_name] = accu + + # Run normal tasks without chat template + if normal_tasks: results = lm_eval.simple_evaluate( lm, - tasks=eval_tasks, + tasks=normal_tasks, limit=args.limit if limit is None else limit, ) for task_name, task_results in results["results"].items(): - if "acc,none" in task_results and task_name in eval_tasks: + if "acc,none" in task_results and task_name in normal_tasks: accu = task_results["acc,none"] all_accuracy[task_name] = accu for task_name, accu in all_accuracy.items(): @@ -191,8 +190,8 @@ def get_accuracy(model_name_or_path, tokenizer=None, tasks="mmlu", limit=None): default=[ "piqa", "hellaswag", - "mmlu", - "gsm8k", + "mmlu_llama", + "gsm8k_llama", ], help="tasks for accuracy validation, text-generation and code-generation tasks are different.", ) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh index 63a361b3994..6a07fbd9991 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh @@ -3,7 +3,7 @@ # Usage: CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh --model_path= [--tasks=] [--batch_size=] # Parse command line arguments -TASKS="piqa,hellaswag,mmlu,gsm8k" +TASKS="piqa,hellaswag,mmlu_llama,gsm8k_llama" BATCH_SIZE=64 GPU_MEMORY_UTILIZATION=0.8 @@ -83,39 +83,64 @@ run_evaluation() { --tasks $tasks \ --batch_size $BATCH_SIZE \ $extra_args - + if [[ $? -ne 0 ]]; then echo "Error: Evaluation failed for tasks: $tasks" return 1 fi } -# Check if tasks contain gsm8k -if [[ "$TASKS" == *"gsm8k"* ]]; then - # If gsm8k is the only task - if [[ "$TASKS" == "gsm8k" ]]; then - run_evaluation "$TASKS" true "--apply_chat_template --fewshot_as_multiturn" + +# Check if tasks contain gsm8k_llama or mmlu_llama +NEED_SPLIT=false +OTHER_TASKS="$TASKS" +SPECIAL_TASKS="" + +if [[ "$TASKS" == *"gsm8k_llama"* ]]; then + SPECIAL_TASKS="gsm8k_llama" + OTHER_TASKS=$(echo "$OTHER_TASKS" | sed 's/,*gsm8k_llama,*//' | sed 's/^,//' | sed 's/,$//') + NEED_SPLIT=true +fi +if [[ "$TASKS" == *"mmlu_llama"* ]]; then + if [[ -n "$SPECIAL_TASKS" ]]; then + SPECIAL_TASKS="$SPECIAL_TASKS,mmlu_llama" else - # Split tasks: run gsm8k separately - OTHER_TASKS=$(echo "$TASKS" | sed 's/,*gsm8k,*//' | sed 's/^,//' | sed 's/,$//') - - if [[ -n "$OTHER_TASKS" ]]; then - echo "Running general tasks" - run_evaluation "$OTHER_TASKS" true "" - - if [[ $? 
-eq 0 ]]; then - echo "Running GSM8K with chat template" - run_evaluation "gsm8k" true "--apply_chat_template --fewshot_as_multiturn" - else - echo "Skipping GSM8K due to previous failure" - exit 1 - fi + SPECIAL_TASKS="mmlu_llama" + fi + OTHER_TASKS=$(echo "$OTHER_TASKS" | sed 's/,*mmlu_llama,*//' | sed 's/^,//' | sed 's/,$//') + NEED_SPLIT=true +fi + +if [[ "$NEED_SPLIT" == true ]]; then + if [[ -n "$OTHER_TASKS" ]]; then + echo "Running general tasks" + run_evaluation "$OTHER_TASKS" true "" + if [[ $? -eq 0 ]]; then + IFS=',' read -ra SPECIAL_ARRAY <<< "$SPECIAL_TASKS" + for special_task in "${SPECIAL_ARRAY[@]}"; do + echo "Running $special_task with chat template" + run_evaluation "$special_task" true "--apply_chat_template --fewshot_as_multiturn" + if [[ $? -ne 0 ]]; then + echo "Benchmark failed on $special_task!" + exit 1 + fi + done else - run_evaluation "gsm8k" true "--apply_chat_template --fewshot_as_multiturn" + echo "Skipping special tasks due to previous failure" + exit 1 fi + else + IFS=',' read -ra SPECIAL_ARRAY <<< "$SPECIAL_TASKS" + for special_task in "${SPECIAL_ARRAY[@]}"; do + echo "Running $special_task with chat template" + run_evaluation "$special_task" true "--apply_chat_template --fewshot_as_multiturn" + if [[ $? -ne 0 ]]; then + echo "Benchmark failed on $special_task!" + exit 1 + fi + done fi else - # No gsm8k task run_evaluation "$TASKS" true "" fi @@ -124,4 +149,4 @@ if [[ $? -eq 0 ]]; then else echo "Benchmark failed!" exit 1 -fi \ No newline at end of file +fi From e1c57467366b0ed392d283640ac0165fd8b71a9f Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Thu, 18 Dec 2025 21:19:48 -0500 Subject: [PATCH 12/15] recover bits=5.8 and rtn for nvfp4 Signed-off-by: He, Xin3 --- .../quantization/auto_round/llama3/README.md | 4 ++-- .../quantization/auto_round/llama3/run_quant.sh | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md index a2b91e4a83b..c90a77b1ad2 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md @@ -135,9 +135,9 @@ CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=Llama-3.3-70B --dtype=mxfp8 > Note: Within the accuracy threshold, lm_head quantization is acceptable, but this feature is not enabled here to support vLLM inference. -#### Llama 3.3 70B MXFP4 (Mixed with MXFP8, Target_bits=6.0) +#### Llama 3.3 70B MXFP4 (Mixed with MXFP8, Target_bits=5.8) -`Target_bits=6.0` is an empirical value. +`Target_bits=5.8` is an empirical value. 
```bash CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=Llama-3.3-70B --dtype=mxfp4_mixed --input_model=/models/Llama-3.3-70B-Instruct/ --output_model=Llama-3.3-70B-MXFP4-MXFP8 diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh index ad6ec55354c..46d7581033b 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh @@ -117,12 +117,12 @@ case "$TOPOLOGY" in ;; "mxfp4_mixed") echo "Running Llama 3.3 70B MXFP4 (Mixed with MXFP8) quantization..." - CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --target_bits 6.0 --options \"MXFP4\" \"MXFP8\" --shared_layers \"k_proj\" \"v_proj\" \"q_proj\" --shared_layers \"gate_proj\" \"up_proj\" --export_path \"$OUTPUT_MODEL\"" + CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --target_bits 5.8 --options \"MXFP4\" \"MXFP8\" --shared_layers \"k_proj\" \"v_proj\" \"q_proj\" --shared_layers \"gate_proj\" \"up_proj\" --export_path \"$OUTPUT_MODEL\"" echo "Executing command: $CMD" python quantize.py \ --model_name_or_path "$INPUT_MODEL" \ $COMMON_ARGS \ - --target_bits 6.0 \ + --target_bits 5.8 \ --options "MXFP4" "MXFP8" \ --shared_layers "k_proj" "v_proj" "q_proj" \ --shared_layers "gate_proj" "up_proj" \ @@ -150,12 +150,13 @@ case "$TOPOLOGY" in ;; "nvfp4") echo "Running Llama 3.1 70B NVFP4 quantization..." - CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --dtype NVFP4 --export_format llm_compressor --export_path \"$OUTPUT_MODEL\"" + CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --dtype NVFP4 --iters 0 --export_format llm_compressor --export_path \"$OUTPUT_MODEL\"" echo "Executing command: $CMD" python quantize.py \ --model_name_or_path "$INPUT_MODEL" \ $COMMON_ARGS \ --dtype NVFP4 \ + --iters 0 \ --export_format llm_compressor \ --export_path "$OUTPUT_MODEL" ;; From 906c259fd89e95e866aaef76cedd95f0c89292b7 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Fri, 19 Dec 2025 00:51:07 -0500 Subject: [PATCH 13/15] add autoround tuning Signed-off-by: He, Xin3 --- .../quantization/auto_round/llama3/README.md | 4 ++-- .../quantization/auto_round/llama3/run_quant.sh | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md index c90a77b1ad2..da07f4176ae 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md @@ -155,10 +155,10 @@ CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=Llama-3.1-70B --dtype=mxfp8 #### Llama 3.1 70B NVFP4 -RTN (Round-to-Nearest) is enough to keep accuracy. +AutoRound tuning helps improve the accuracy. 
```bash -CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_quant.sh --topology=Llama-3.1-70B --dtype=nvfp4 --input_model=/models/Llama-3.1-70B-Instruct/ --output_model=Llama-3.1-70B-NVFP4 +CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=Llama-3.1-70B --dtype=nvfp4 --input_model=/models/Llama-3.1-70B-Instruct/ --output_model=Llama-3.1-70B-NVFP4 ``` > Note: Within the accuracy threshold, lm_head quantization is acceptable, but this feature is not enabled here to support vLLM inference. diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh index 46d7581033b..fa02b476739 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh @@ -150,13 +150,12 @@ case "$TOPOLOGY" in ;; "nvfp4") echo "Running Llama 3.1 70B NVFP4 quantization..." - CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --dtype NVFP4 --iters 0 --export_format llm_compressor --export_path \"$OUTPUT_MODEL\"" + CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --dtype NVFP4 --export_format llm_compressor --export_path \"$OUTPUT_MODEL\"" echo "Executing command: $CMD" python quantize.py \ --model_name_or_path "$INPUT_MODEL" \ $COMMON_ARGS \ --dtype NVFP4 \ - --iters 0 \ --export_format llm_compressor \ --export_path "$OUTPUT_MODEL" ;; From c198b0b2757e3f45f1796432a5bc3ec27d5dd193 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Sun, 21 Dec 2025 06:13:40 -0500 Subject: [PATCH 14/15] remove torch_compile for nvfp4 Signed-off-by: He, Xin3 --- .../quantization/auto_round/llama3/run_quant.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh index fa02b476739..5ac00da274a 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh @@ -150,11 +150,12 @@ case "$TOPOLOGY" in ;; "nvfp4") echo "Running Llama 3.1 70B NVFP4 quantization..." 
- CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --dtype NVFP4 --export_format llm_compressor --export_path \"$OUTPUT_MODEL\"" + CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" --quantize --low_gpu_mem_usage --dtype NVFP4 --export_format llm_compressor --export_path \"$OUTPUT_MODEL\"" echo "Executing command: $CMD" python quantize.py \ --model_name_or_path "$INPUT_MODEL" \ - $COMMON_ARGS \ + --quantize \ + --low_gpu_mem_usage \ --dtype NVFP4 \ --export_format llm_compressor \ --export_path "$OUTPUT_MODEL" From 100743a1b027cd2d5a6d163562d5848e9c57a4a1 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Sun, 21 Dec 2025 06:15:37 -0500 Subject: [PATCH 15/15] add 1 more card for nvfp4 quant Signed-off-by: He, Xin3 --- .../language-modeling/quantization/auto_round/llama3/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md index da07f4176ae..a7872952696 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md @@ -158,7 +158,7 @@ CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=Llama-3.1-70B --dtype=mxfp8 AutoRound tuning helps improve the accuracy. ```bash -CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=Llama-3.1-70B --dtype=nvfp4 --input_model=/models/Llama-3.1-70B-Instruct/ --output_model=Llama-3.1-70B-NVFP4 +CUDA_VISIBLE_DEVICES=0,1 bash run_quant.sh --topology=Llama-3.1-70B --dtype=nvfp4 --input_model=/models/Llama-3.1-70B-Instruct/ --output_model=Llama-3.1-70B-NVFP4 ``` > Note: Within the accuracy threshold, lm_head quantization is acceptable, but this feature is not enabled here to support vLLM inference.