fix llama3 OOM issue and unsupported lm_head quantization issue (#2360)

xin3he · web-flow · commit 997f7ed4f93f · 2025-12-23T14:17:48.000+08:00
Signed-off-by: He, Xin3 <xin3.he@intel.com>
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md
@@ -88,6 +88,8 @@ Notes:
 
 ### Llama3 Quantization Recipes
 
+Here we provide several recipes for Llama3 models. The relative accuracy loss of the quantized models should be less than 1%.
+
 #### Llama 3.1 8B MXFP8
 
 AutoRound tuning helps improve the accuracy, `iters` and `nsamples` is higher than default.
@@ -131,6 +133,8 @@ RTN (Round-to-Nearest) is enough to keep accuracy.
 CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=Llama-3.3-70B --dtype=mxfp8 --input_model=/models/Llama-3.3-70B-Instruct/ --output_model=Llama-3.3-70B-MXFP8
 ```
 
+> Note: Quantizing lm_head keeps accuracy within the threshold, but it is not enabled here so that the quantized model remains compatible with vLLM inference.
+
 #### Llama 3.3 70B MXFP4 (Mixed with MXFP8, Target_bits=5.8)
 
 `Target_bits=5.8` is an empirical value.
@@ -147,14 +151,18 @@ RTN (Round-to-Nearest) is enough to keep accuracy.
 CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=Llama-3.1-70B --dtype=mxfp8 --input_model=/models/Llama-3.1-70B-Instruct/ --output_model=Llama-3.1-70B-MXFP8
 ```
 
+> Note: Quantizing lm_head keeps accuracy within the threshold, but it is not enabled here so that the quantized model remains compatible with vLLM inference.
+
 #### Llama 3.1 70B NVFP4
 
-RTN (Round-to-Nearest) is enough to keep accuracy.
+AutoRound tuning helps improve the accuracy.
 
 ```bash
-CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_quant.sh --topology=Llama-3.1-70B --dtype=nvfp4 --input_model=/models/Llama-3.1-70B-Instruct/ --output_model=Llama-3.1-70B-NVFP4
+CUDA_VISIBLE_DEVICES=0,1 bash run_quant.sh --topology=Llama-3.1-70B --dtype=nvfp4 --input_model=/models/Llama-3.1-70B-Instruct/ --output_model=Llama-3.1-70B-NVFP4
 ```
 
+> Note: Quantizing lm_head keeps accuracy within the threshold, but it is not enabled here so that the quantized model remains compatible with vLLM inference.
+
 #### Llama 3.1 70B uNVFP4
 
 RTN (Round-to-Nearest) is enough to keep accuracy.
@@ -186,27 +194,27 @@ For convenience, we provide a benchmark script that automatically handles GPU de
 
 1. **Llama 3.1 8B MXFP8** (1 GPU):
 ```bash
-CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh --model_path=Llama-3.1-8B-MXFP8
+CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh --model_path=Llama-3.1-8B-MXFP8 --gpu_memory_utilization=0.8
 ```
 
 2. **Llama 3.1 8B MXFP4 Mixed** (1 GPU):
 ```bash
-CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh --model_path=Llama-3.1-8B-MXFP4-MXFP8
+CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh --model_path=Llama-3.1-8B-MXFP4-MXFP8  --gpu_memory_utilization=0.6
 ```
 
-3. **Llama 3.3 70B MXFP8** (4 GPU):
+3. **Llama 3.3 70B MXFP8** (2 GPU):
 ```bash
-CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh --model_path=Llama-3.3-70B-MXFP8
+CUDA_VISIBLE_DEVICES=0,1 bash run_benchmark.sh --model_path=Llama-3.3-70B-MXFP8  --gpu_memory_utilization=0.8
 ```
 
-4. **Llama 3.3 70B MXFP4 Mixed** (4 GPU):
+4. **Llama 3.3 70B MXFP4 Mixed** (2 GPU):
 ```bash
-CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh --model_path=Llama-3.3-70B-MXFP4-MXFP8
+CUDA_VISIBLE_DEVICES=0,1 bash run_benchmark.sh --model_path=Llama-3.3-70B-MXFP4-MXFP8  --gpu_memory_utilization=0.6
 ```
 
-5. **Llama 3.1 70B MXFP8** (4 GPU):
+5. **Llama 3.1 70B MXFP8** (2 GPU):
 ```bash
-CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh --model_path=Llama-3.1-70B-MXFP8
+CUDA_VISIBLE_DEVICES=0,1 bash run_benchmark.sh --model_path=Llama-3.1-70B-MXFP8   --gpu_memory_utilization=0.8
 ```
 
 The script automatically:
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py
@@ -65,52 +65,57 @@ def dispatch_model_on_devices(model):
     return model
 
 
+
 @torch.no_grad()
-def get_accuracy(model_name_or_path, tokenizer=None, tasks="mmlu", limit=None):
+def get_accuracy(model_name_or_path, tokenizer=None, eval_tasks="mmlu", limit=None):
     os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
-    eval_tasks = copy.deepcopy(tasks)  # avoid removing gsm8k from original list
     all_accuracy = {}
-    test_gsm8k = False
-    test_normal = False
-    if "gsm8k" in eval_tasks:
-        test_gsm8k = True
-        eval_tasks.remove("gsm8k")
-    if eval_tasks:
-        test_normal = True
+    special_tasks = []
+    normal_tasks = []
+    # Identify special tasks
+    for t in eval_tasks:
+        if t in ["gsm8k_llama", "mmlu_llama"]:
+            special_tasks.append(t)
+        else:
+            normal_tasks.append(t)
     import lm_eval
     from lm_eval.models.huggingface import HFLM
 
-    ########################## gms8k (ahead of normal tasks) #########################
-    if test_gsm8k:
-        lm = HFLM(
-            pretrained=model_name_or_path,
-            tokenizer=tokenizer,
-            add_bos_token=False,
-            batch_size=args.eval_batch_size,
-        )
-        results_gsm8k = lm_eval.simple_evaluate(
+    lm = HFLM(
+        pretrained=model_name_or_path,
+        tokenizer=tokenizer,
+        add_bos_token=True,
+        batch_size=args.eval_batch_size,
+    )
+    # Run special tasks with chat template
+    for special_task in special_tasks:
+        results_special = lm_eval.simple_evaluate(
             lm,
-            tasks=["gsm8k"],
+            tasks=[special_task],
+            apply_chat_template=True,
+            fewshot_as_multiturn=True,
             limit=args.limit if limit is None else limit,
         )
-        for task_name, task_results in results_gsm8k["results"].items():
-            accu = task_results["exact_match,strict-match"]
-            all_accuracy[task_name] = accu
-    ########################## gms8k end #########################
-    if test_normal:
-        lm = HFLM(
-            pretrained=model_name_or_path,
-            tokenizer=tokenizer,
-            add_bos_token=True,
-            batch_size=args.eval_batch_size,
-        )
+        for task_name, task_results in results_special["results"].items():
+            # gsm8k_llama uses exact_match,strict-match, mmlu_llama may use acc,none
+            if task_name in special_tasks:
+                if "exact_match,strict_match" in task_results:
+                    accu = task_results["exact_match,strict_match"]
+                elif "acc,none" in task_results:
+                    accu = task_results["acc,none"]
+                else:
+                    accu = list(task_results.values())[0]
+                all_accuracy[task_name] = accu
+
+    # Run normal tasks without chat template
+    if normal_tasks:
         results = lm_eval.simple_evaluate(
             lm,
-            tasks=eval_tasks,
+            tasks=normal_tasks,
             limit=args.limit if limit is None else limit,
         )
         for task_name, task_results in results["results"].items():
-            if "acc,none" in task_results and task_name in eval_tasks:
+            if "acc,none" in task_results and task_name in normal_tasks:
                 accu = task_results["acc,none"]
                 all_accuracy[task_name] = accu
     for task_name, accu in all_accuracy.items():
@@ -150,7 +155,7 @@ def get_accuracy(model_name_or_path, tokenizer=None, tasks="mmlu", limit=None):
         help="options for mix precision"
     )
     parser.add_argument(
-        "--shared_layer",
+        "--shared_layers",
         type=str,
         nargs="+",
         action='append',
@@ -185,8 +190,8 @@ def get_accuracy(model_name_or_path, tokenizer=None, tasks="mmlu", limit=None):
         default=[
             "piqa",
             "hellaswag",
-            "mmlu",
-            "gsm8k",
+            "mmlu_llama",
+            "gsm8k_llama",
         ],
         help="tasks for accuracy validation, text-generation and code-generation tasks are different.",
     )
@@ -198,7 +203,7 @@ def get_accuracy(model_name_or_path, tokenizer=None, tasks="mmlu", limit=None):
         print("Target data type:", args.dtype)
     else:
         print("Target data type for mix precision:", args.options)
-        print("Layers sharing the same data type:", args.shared_layer)
+        print("Layers sharing the same data type:", args.shared_layers)
     model, tokenizer = initialize_model_and_tokenizer(args.model_name_or_path)
 
     if args.quantize:
@@ -242,7 +247,7 @@ def load_recipe_results(file_path):
             scheme=args.dtype,
             target_bits=args.target_bits,
             options=args.options,
-            shared_layers=args.shared_layer,
+            shared_layers=args.shared_layers,
             enable_torch_compile=args.enable_torch_compile,
             low_gpu_mem_usage=args.low_gpu_mem_usage,
             export_format=args.export_format,
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh
@@ -3,8 +3,9 @@
 # Usage: CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh --model_path=<path_to_quantized_model> [--tasks=<tasks>] [--batch_size=<size>]
 
 # Parse command line arguments
-TASKS="piqa,hellaswag,mmlu,gsm8k"
-BATCH_SIZE=8
+TASKS="piqa,hellaswag,mmlu_llama,gsm8k_llama"
+BATCH_SIZE=64
+GPU_MEMORY_UTILIZATION=0.8
 
 while [[ $# -gt 0 ]]; do
     case $1 in
@@ -20,6 +21,10 @@ while [[ $# -gt 0 ]]; do
             BATCH_SIZE="${1#*=}"
             shift
             ;;
+        --gpu_memory_utilization=*)
+            GPU_MEMORY_UTILIZATION="${1#*=}"
+            shift
+            ;;
         *)
             echo "Unknown parameter: $1"
             exit 1
@@ -48,6 +53,7 @@ echo "  Model Path: $MODEL_PATH"
 echo "  Tasks: $TASKS"
 echo "  Batch Size: $BATCH_SIZE"
 echo "  Tensor Parallel Size: $TENSOR_PARALLEL_SIZE"
+echo "  GPU Memory Utilization: $GPU_MEMORY_UTILIZATION"
 echo "  CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES"
 
 # Check if the model exists
@@ -64,56 +70,83 @@ export TORCH_COMPILE_DISABLE=1
 run_evaluation() {
     local tasks=$1
     local add_bos_token=$2
+    local extra_args=$3
     
     echo "Running evaluation for tasks: $tasks (add_bos_token=$add_bos_token)"
     
     # Print the command being executed
-    local cmd="lm_eval --model vllm --model_args pretrained=\"$MODEL_PATH\",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,data_parallel_size=1 --tasks $tasks --batch_size $BATCH_SIZE"
+    local cmd="lm_eval --model vllm --model_args pretrained=\"$MODEL_PATH\",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=$GPU_MEMORY_UTILIZATION,data_parallel_size=1,max_model_len=8192 --tasks $tasks --batch_size $BATCH_SIZE $extra_args"
     echo "Executing command: $cmd"
     
     lm_eval --model vllm \
-        --model_args pretrained="$MODEL_PATH",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,data_parallel_size=1 \
+        --model_args pretrained="$MODEL_PATH",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=$GPU_MEMORY_UTILIZATION,data_parallel_size=1,max_model_len=8192 \
         --tasks $tasks \
-        --batch_size $BATCH_SIZE
-        
+        --batch_size $BATCH_SIZE \
+        $extra_args
+
     if [[ $? -ne 0 ]]; then
         echo "Error: Evaluation failed for tasks: $tasks"
         return 1
     fi
 }
 
-# Check if tasks contain gsm8k (requires add_bos_token=False)
-if [[ "$TASKS" == *"gsm8k"* ]]; then
-    # If gsm8k is the only task
-    if [[ "$TASKS" == "gsm8k" ]]; then
-        run_evaluation "$TASKS" false
+
+# Check if tasks contain gsm8k_llama or mmlu_llama
+NEED_SPLIT=false
+OTHER_TASKS="$TASKS"
+SPECIAL_TASKS=""
+
+if [[ "$TASKS" == *"gsm8k_llama"* ]]; then
+    SPECIAL_TASKS="gsm8k_llama"
+    OTHER_TASKS=$(echo "$OTHER_TASKS" | sed 's/,*gsm8k_llama,*//' | sed 's/^,//' | sed 's/,$//')
+    NEED_SPLIT=true
+fi
+if [[ "$TASKS" == *"mmlu_llama"* ]]; then
+    if [[ -n "$SPECIAL_TASKS" ]]; then
+        SPECIAL_TASKS="$SPECIAL_TASKS,mmlu_llama"
     else
-        # Split tasks: run gsm8k separately with add_bos_token=False
-        OTHER_TASKS=$(echo "$TASKS" | sed 's/,*gsm8k,*//' | sed 's/^,//' | sed 's/,$//')
-        
-        if [[ -n "$OTHER_TASKS" ]]; then
-            echo "Running general tasks with add_bos_token=True"
-            run_evaluation "$OTHER_TASKS" true
-            
-            if [[ $? -eq 0 ]]; then
-                echo "Running GSM8K with add_bos_token=False"
-                run_evaluation "gsm8k" false
-            else
-                echo "Skipping GSM8K due to previous failure"
-                exit 1
-            fi
+        SPECIAL_TASKS="mmlu_llama"
+    fi
+    OTHER_TASKS=$(echo "$OTHER_TASKS" | sed 's/,*mmlu_llama,*//' | sed 's/^,//' | sed 's/,$//')
+    NEED_SPLIT=true
+fi
+
+if [[ "$NEED_SPLIT" == true ]]; then
+    if [[ -n "$OTHER_TASKS" ]]; then
+        echo "Running general tasks"
+        run_evaluation "$OTHER_TASKS" true ""
+        if [[ $? -eq 0 ]]; then
+            IFS=',' read -ra SPECIAL_ARRAY <<< "$SPECIAL_TASKS"
+            for special_task in "${SPECIAL_ARRAY[@]}"; do
+                echo "Running $special_task with chat template"
+                run_evaluation "$special_task" true "--apply_chat_template --fewshot_as_multiturn"
+                if [[ $? -ne 0 ]]; then
+                    echo "Benchmark failed on $special_task!"
+                    exit 1
+                fi
+            done
         else
-            run_evaluation "gsm8k" false
+            echo "Skipping special tasks due to previous failure"
+            exit 1
         fi
+    else
+        IFS=',' read -ra SPECIAL_ARRAY <<< "$SPECIAL_TASKS"
+        for special_task in "${SPECIAL_ARRAY[@]}"; do
+            echo "Running $special_task with chat template"
+            run_evaluation "$special_task" true "--apply_chat_template --fewshot_as_multiturn"
+            if [[ $? -ne 0 ]]; then
+                echo "Benchmark failed on $special_task!"
+                exit 1
+            fi
+        done
     fi
 else
-    # No gsm8k task, use add_bos_token=True for all tasks
-    run_evaluation "$TASKS" true
+    run_evaluation "$TASKS" true ""
 fi
 
 if [[ $? -eq 0 ]]; then
     echo "Benchmark completed successfully!"
 else
     echo "Benchmark failed!"
     exit 1
-fi
+fi
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh