diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md
index 1b3a01172ee..a7872952696 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md
@@ -88,6 +88,8 @@ Notes:
 
 ### Llama3 Quantization Recipes
 
+Here we provide several quantization recipes for Llama3 models. For each recipe, the relative accuracy loss of the quantized model should be less than 1%.
+
 #### Llama 3.1 8B MXFP8
 
 AutoRound tuning helps improve the accuracy, `iters` and `nsamples` is higher than default.
@@ -131,6 +133,8 @@ RTN (Round-to-Nearest) is enough to keep accuracy.
 CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=Llama-3.3-70B --dtype=mxfp8 --input_model=/models/Llama-3.3-70B-Instruct/ --output_model=Llama-3.3-70B-MXFP8
 ```
 
+> Note: Quantizing `lm_head` also stays within the accuracy threshold, but it is left disabled here so that the exported model remains compatible with vLLM inference.
+
 #### Llama 3.3 70B MXFP4 (Mixed with MXFP8, Target_bits=5.8)
 
 `Target_bits=5.8` is an empirical value.
@@ -147,14 +151,18 @@ RTN (Round-to-Nearest) is enough to keep accuracy.
 CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=Llama-3.1-70B --dtype=mxfp8 --input_model=/models/Llama-3.1-70B-Instruct/ --output_model=Llama-3.1-70B-MXFP8
 ```
 
+> Note: Quantizing `lm_head` also stays within the accuracy threshold, but it is left disabled here so that the exported model remains compatible with vLLM inference.
+
 #### Llama 3.1 70B NVFP4
 
-RTN (Round-to-Nearest) is enough to keep accuracy.
+AutoRound tuning helps improve the accuracy.
 
 ```bash
-CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_quant.sh --topology=Llama-3.1-70B --dtype=nvfp4 --input_model=/models/Llama-3.1-70B-Instruct/ --output_model=Llama-3.1-70B-NVFP4
+CUDA_VISIBLE_DEVICES=0,1 bash run_quant.sh --topology=Llama-3.1-70B --dtype=nvfp4 --input_model=/models/Llama-3.1-70B-Instruct/ --output_model=Llama-3.1-70B-NVFP4
 ```
 
+> Note: Quantizing `lm_head` also stays within the accuracy threshold, but it is left disabled here so that the exported model remains compatible with vLLM inference.
+
 #### Llama 3.1 70B uNVFP4
 
 RTN (Round-to-Nearest) is enough to keep accuracy.
@@ -186,27 +194,27 @@ For convenience, we provide a benchmark script that automatically handles GPU de
 
 1. **Llama 3.1 8B MXFP8** (1 GPU):
 ```bash
-CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh --model_path=Llama-3.1-8B-MXFP8
+CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh --model_path=Llama-3.1-8B-MXFP8 --gpu_memory_utilization=0.8
 ```
 
 2. **Llama 3.1 8B MXFP4 Mixed** (1 GPU):
 ```bash
-CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh --model_path=Llama-3.1-8B-MXFP4-MXFP8
+CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh --model_path=Llama-3.1-8B-MXFP4-MXFP8 --gpu_memory_utilization=0.6
 ```
 
-3. **Llama 3.3 70B MXFP8** (4 GPU):
+3. **Llama 3.3 70B MXFP8** (2 GPU):
 ```bash
-CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh --model_path=Llama-3.3-70B-MXFP8
+CUDA_VISIBLE_DEVICES=0,1 bash run_benchmark.sh --model_path=Llama-3.3-70B-MXFP8 --gpu_memory_utilization=0.8
 ```
 
-4. **Llama 3.3 70B MXFP4 Mixed** (4 GPU):
+4. **Llama 3.3 70B MXFP4 Mixed** (2 GPU):
 ```bash
-CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh --model_path=Llama-3.3-70B-MXFP4-MXFP8
+CUDA_VISIBLE_DEVICES=0,1 bash run_benchmark.sh --model_path=Llama-3.3-70B-MXFP4-MXFP8 --gpu_memory_utilization=0.6
 ```
 
-5. **Llama 3.1 70B MXFP8** (4 GPU):
+5. **Llama 3.1 70B MXFP8** (2 GPU):
 ```bash
-CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh --model_path=Llama-3.1-70B-MXFP8
+CUDA_VISIBLE_DEVICES=0,1 bash run_benchmark.sh --model_path=Llama-3.1-70B-MXFP8 --gpu_memory_utilization=0.8
 ```
 
 The script automatically:
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py
index 7824425bb0e..f51fb19a8c6 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py
@@ -65,52 +65,57 @@ def dispatch_model_on_devices(model):
     return model
 
+
 @torch.no_grad()
-def get_accuracy(model_name_or_path, tokenizer=None, tasks="mmlu", limit=None):
+def get_accuracy(model_name_or_path, tokenizer=None, eval_tasks="mmlu", limit=None):
     os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
-    eval_tasks = copy.deepcopy(tasks)  # avoid removing gsm8k from original list
     all_accuracy = {}
-    test_gsm8k = False
-    test_normal = False
-    if "gsm8k" in eval_tasks:
-        test_gsm8k = True
-        eval_tasks.remove("gsm8k")
-    if eval_tasks:
-        test_normal = True
+    special_tasks = []
+    normal_tasks = []
+    # Identify special tasks
+    for t in eval_tasks:
+        if t in ["gsm8k_llama", "mmlu_llama"]:
+            special_tasks.append(t)
+        else:
+            normal_tasks.append(t)
 
     import lm_eval
     from lm_eval.models.huggingface import HFLM
 
-    ########################## gms8k (ahead of normal tasks) #########################
-    if test_gsm8k:
-        lm = HFLM(
-            pretrained=model_name_or_path,
-            tokenizer=tokenizer,
-            add_bos_token=False,
-            batch_size=args.eval_batch_size,
-        )
-        results_gsm8k = lm_eval.simple_evaluate(
+    lm = HFLM(
+        pretrained=model_name_or_path,
+        tokenizer=tokenizer,
+        add_bos_token=True,
+        batch_size=args.eval_batch_size,
+    )
+    # Run special tasks with chat template
+    for special_task in special_tasks:
+        results_special = lm_eval.simple_evaluate(
             lm,
-            tasks=["gsm8k"],
+            tasks=[special_task],
+            apply_chat_template=True,
+            fewshot_as_multiturn=True,
             limit=args.limit if limit is None else limit,
         )
-        for task_name, task_results in results_gsm8k["results"].items():
-            accu = task_results["exact_match,strict-match"]
-            all_accuracy[task_name] = accu
-    ########################## gms8k end #########################
-    if test_normal:
-        lm = HFLM(
-            pretrained=model_name_or_path,
-            tokenizer=tokenizer,
-            add_bos_token=True,
-            batch_size=args.eval_batch_size,
-        )
+        for task_name, task_results in results_special["results"].items():
+            # gsm8k_llama uses exact_match,strict-match; mmlu_llama may use acc,none
+            if task_name in special_tasks:
+                if "exact_match,strict-match" in task_results:
+                    accu = task_results["exact_match,strict-match"]
+                elif "acc,none" in task_results:
+                    accu = task_results["acc,none"]
+                else:
+                    accu = list(task_results.values())[0]
+                all_accuracy[task_name] = accu
+
+    # Run normal tasks without chat template
+    if normal_tasks:
         results = lm_eval.simple_evaluate(
             lm,
-            tasks=eval_tasks,
+            tasks=normal_tasks,
             limit=args.limit if limit is None else limit,
         )
         for task_name, task_results in results["results"].items():
-            if "acc,none" in task_results and task_name in eval_tasks:
+            if "acc,none" in task_results and task_name in normal_tasks:
                 accu = task_results["acc,none"]
                 all_accuracy[task_name] = accu
     for task_name, accu in all_accuracy.items():
@@ -150,7 +155,7 @@ def
get_accuracy(model_name_or_path, tokenizer=None, tasks="mmlu", limit=None): help="options for mix precision" ) parser.add_argument( - "--shared_layer", + "--shared_layers", type=str, nargs="+", action='append', @@ -185,8 +190,8 @@ def get_accuracy(model_name_or_path, tokenizer=None, tasks="mmlu", limit=None): default=[ "piqa", "hellaswag", - "mmlu", - "gsm8k", + "mmlu_llama", + "gsm8k_llama", ], help="tasks for accuracy validation, text-generation and code-generation tasks are different.", ) @@ -198,7 +203,7 @@ def get_accuracy(model_name_or_path, tokenizer=None, tasks="mmlu", limit=None): print("Target data type:", args.dtype) else: print("Target data type for mix precision:", args.options) - print("Layers sharing the same data type:", args.shared_layer) + print("Layers sharing the same data type:", args.shared_layers) model, tokenizer = initialize_model_and_tokenizer(args.model_name_or_path) if args.quantize: @@ -242,7 +247,7 @@ def load_recipe_results(file_path): scheme=args.dtype, target_bits=args.target_bits, options=args.options, - shared_layers=args.shared_layer, + shared_layers=args.shared_layers, enable_torch_compile=args.enable_torch_compile, low_gpu_mem_usage=args.low_gpu_mem_usage, export_format=args.export_format, diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh index 87b635be52f..6a07fbd9991 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh @@ -3,8 +3,9 @@ # Usage: CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh --model_path= [--tasks=] [--batch_size=] # Parse command line arguments -TASKS="piqa,hellaswag,mmlu,gsm8k" -BATCH_SIZE=8 +TASKS="piqa,hellaswag,mmlu_llama,gsm8k_llama" +BATCH_SIZE=64 +GPU_MEMORY_UTILIZATION=0.8 while [[ $# -gt 0 ]]; do case $1 in @@ -20,6 +21,10 @@ while [[ $# -gt 0 ]]; do BATCH_SIZE="${1#*=}" shift ;; + --gpu_memory_utilization=*) + GPU_MEMORY_UTILIZATION="${1#*=}" + shift + ;; *) echo "Unknown parameter: $1" exit 1 @@ -48,6 +53,7 @@ echo " Model Path: $MODEL_PATH" echo " Tasks: $TASKS" echo " Batch Size: $BATCH_SIZE" echo " Tensor Parallel Size: $TENSOR_PARALLEL_SIZE" +echo " GPU Memory Utilization: $GPU_MEMORY_UTILIZATION" echo " CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" # Check if the model exists @@ -64,51 +70,78 @@ export TORCH_COMPILE_DISABLE=1 run_evaluation() { local tasks=$1 local add_bos_token=$2 + local extra_args=$3 echo "Running evaluation for tasks: $tasks (add_bos_token=$add_bos_token)" # Print the command being executed - local cmd="lm_eval --model vllm --model_args pretrained=\"$MODEL_PATH\",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,data_parallel_size=1 --tasks $tasks --batch_size $BATCH_SIZE" + local cmd="lm_eval --model vllm --model_args pretrained=\"$MODEL_PATH\",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=$GPU_MEMORY_UTILIZATION,data_parallel_size=1,max_model_len=8192 --tasks $tasks --batch_size $BATCH_SIZE $extra_args" echo "Executing command: $cmd" lm_eval --model vllm \ - --model_args pretrained="$MODEL_PATH",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,data_parallel_size=1 \ + --model_args 
pretrained="$MODEL_PATH",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=$GPU_MEMORY_UTILIZATION,data_parallel_size=1,max_model_len=8192 \ --tasks $tasks \ - --batch_size $BATCH_SIZE - + --batch_size $BATCH_SIZE \ + $extra_args + if [[ $? -ne 0 ]]; then echo "Error: Evaluation failed for tasks: $tasks" return 1 fi } -# Check if tasks contain gsm8k (requires add_bos_token=False) -if [[ "$TASKS" == *"gsm8k"* ]]; then - # If gsm8k is the only task - if [[ "$TASKS" == "gsm8k" ]]; then - run_evaluation "$TASKS" false + +# Check if tasks contain gsm8k_llama or mmlu_llama +NEED_SPLIT=false +OTHER_TASKS="$TASKS" +SPECIAL_TASKS="" + +if [[ "$TASKS" == *"gsm8k_llama"* ]]; then + SPECIAL_TASKS="gsm8k_llama" + OTHER_TASKS=$(echo "$OTHER_TASKS" | sed 's/,*gsm8k_llama,*//' | sed 's/^,//' | sed 's/,$//') + NEED_SPLIT=true +fi +if [[ "$TASKS" == *"mmlu_llama"* ]]; then + if [[ -n "$SPECIAL_TASKS" ]]; then + SPECIAL_TASKS="$SPECIAL_TASKS,mmlu_llama" else - # Split tasks: run gsm8k separately with add_bos_token=False - OTHER_TASKS=$(echo "$TASKS" | sed 's/,*gsm8k,*//' | sed 's/^,//' | sed 's/,$//') - - if [[ -n "$OTHER_TASKS" ]]; then - echo "Running general tasks with add_bos_token=True" - run_evaluation "$OTHER_TASKS" true - - if [[ $? -eq 0 ]]; then - echo "Running GSM8K with add_bos_token=False" - run_evaluation "gsm8k" false - else - echo "Skipping GSM8K due to previous failure" - exit 1 - fi + SPECIAL_TASKS="mmlu_llama" + fi + OTHER_TASKS=$(echo "$OTHER_TASKS" | sed 's/,*mmlu_llama,*//' | sed 's/^,//' | sed 's/,$//') + NEED_SPLIT=true +fi + +if [[ "$NEED_SPLIT" == true ]]; then + if [[ -n "$OTHER_TASKS" ]]; then + echo "Running general tasks" + run_evaluation "$OTHER_TASKS" true "" + if [[ $? -eq 0 ]]; then + IFS=',' read -ra SPECIAL_ARRAY <<< "$SPECIAL_TASKS" + for special_task in "${SPECIAL_ARRAY[@]}"; do + echo "Running $special_task with chat template" + run_evaluation "$special_task" true "--apply_chat_template --fewshot_as_multiturn" + if [[ $? -ne 0 ]]; then + echo "Benchmark failed on $special_task!" + exit 1 + fi + done else - run_evaluation "gsm8k" false + echo "Skipping special tasks due to previous failure" + exit 1 fi + else + IFS=',' read -ra SPECIAL_ARRAY <<< "$SPECIAL_TASKS" + for special_task in "${SPECIAL_ARRAY[@]}"; do + echo "Running $special_task with chat template" + run_evaluation "$special_task" true "--apply_chat_template --fewshot_as_multiturn" + if [[ $? -ne 0 ]]; then + echo "Benchmark failed on $special_task!" + exit 1 + fi + done fi else - # No gsm8k task, use add_bos_token=True for all tasks - run_evaluation "$TASKS" true + run_evaluation "$TASKS" true "" fi if [[ $? -eq 0 ]]; then @@ -116,4 +149,4 @@ if [[ $? -eq 0 ]]; then else echo "Benchmark failed!" exit 1 -fi \ No newline at end of file +fi diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh index d50deaf6b3c..5ac00da274a 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh @@ -73,15 +73,15 @@ case "$TOPOLOGY" in ;; "mxfp4_mixed") echo "Running Llama 3.1 8B MXFP4 (Mixed with MXFP8) quantization..." 
- CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --target_bits 7.8 --options \"MXFP4\" \"MXFP8\" --shared_layer \"k_proj\" \"v_proj\" \"q_proj\" --shared_layer \"gate_proj\" \"up_proj\" --export_path \"$OUTPUT_MODEL\"" + CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --target_bits 7.8 --options \"MXFP4\" \"MXFP8\" --shared_layers \"k_proj\" \"v_proj\" \"q_proj\" --shared_layers \"gate_proj\" \"up_proj\" --export_path \"$OUTPUT_MODEL\"" echo "Executing command: $CMD" python quantize.py \ --model_name_or_path "$INPUT_MODEL" \ $COMMON_ARGS \ --target_bits 7.8 \ --options "MXFP4" "MXFP8" \ - --shared_layer "k_proj" "v_proj" "q_proj" \ - --shared_layer "gate_proj" "up_proj" \ + --shared_layers "k_proj" "v_proj" "q_proj" \ + --shared_layers "gate_proj" "up_proj" \ --export_path "$OUTPUT_MODEL" ;; *) @@ -95,13 +95,12 @@ case "$TOPOLOGY" in case "$DTYPE" in "mxfp8") echo "Running Llama 3.3 70B MXFP8 quantization..." - CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --dtype MXFP8 --quant_lm_head --iters 0 --export_path \"$OUTPUT_MODEL\"" + CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --dtype MXFP8 --iters 0 --export_path \"$OUTPUT_MODEL\"" echo "Executing command: $CMD" python quantize.py \ --model_name_or_path "$INPUT_MODEL" \ $COMMON_ARGS \ --dtype MXFP8 \ - --quant_lm_head \ --iters 0 \ --export_path "$OUTPUT_MODEL" ;; @@ -118,15 +117,15 @@ case "$TOPOLOGY" in ;; "mxfp4_mixed") echo "Running Llama 3.3 70B MXFP4 (Mixed with MXFP8) quantization..." - CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --target_bits 5.8 --options \"MXFP4\" \"MXFP8\" --shared_layer \"k_proj\" \"v_proj\" \"q_proj\" --shared_layer \"gate_proj\" \"up_proj\" --export_path \"$OUTPUT_MODEL\"" + CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --target_bits 5.8 --options \"MXFP4\" \"MXFP8\" --shared_layers \"k_proj\" \"v_proj\" \"q_proj\" --shared_layers \"gate_proj\" \"up_proj\" --export_path \"$OUTPUT_MODEL\"" echo "Executing command: $CMD" python quantize.py \ --model_name_or_path "$INPUT_MODEL" \ $COMMON_ARGS \ --target_bits 5.8 \ --options "MXFP4" "MXFP8" \ - --shared_layer "k_proj" "v_proj" "q_proj" \ - --shared_layer "gate_proj" "up_proj" \ + --shared_layers "k_proj" "v_proj" "q_proj" \ + --shared_layers "gate_proj" "up_proj" \ --export_path "$OUTPUT_MODEL" ;; *) @@ -140,26 +139,24 @@ case "$TOPOLOGY" in case "$DTYPE" in "mxfp8") echo "Running Llama 3.1 70B MXFP8 quantization..." - CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --dtype MXFP8 --quant_lm_head --iters 0 --export_path \"$OUTPUT_MODEL\"" + CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --dtype MXFP8 --iters 0 --export_path \"$OUTPUT_MODEL\"" echo "Executing command: $CMD" python quantize.py \ --model_name_or_path "$INPUT_MODEL" \ $COMMON_ARGS \ --dtype MXFP8 \ - --quant_lm_head \ --iters 0 \ --export_path "$OUTPUT_MODEL" ;; "nvfp4") echo "Running Llama 3.1 70B NVFP4 quantization..." 
- CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --dtype NVFP4 --quant_lm_head --iters 0 --export_format llm_compressor --export_path \"$OUTPUT_MODEL\"" + CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" --quantize --low_gpu_mem_usage --dtype NVFP4 --export_format llm_compressor --export_path \"$OUTPUT_MODEL\"" echo "Executing command: $CMD" python quantize.py \ --model_name_or_path "$INPUT_MODEL" \ - $COMMON_ARGS \ + --quantize \ + --low_gpu_mem_usage \ --dtype NVFP4 \ - --quant_lm_head \ - --iters 0 \ --export_format llm_compressor \ --export_path "$OUTPUT_MODEL" ;; @@ -199,4 +196,4 @@ if [[ $? -eq 0 ]]; then else echo "Quantization failed!" exit 1 -fi \ No newline at end of file +fi
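
Below is a minimal, standalone sketch of the chat-template evaluation path that the updated `get_accuracy()` and `run_benchmark.sh` use for the `gsm8k_llama` / `mmlu_llama` tasks. It is illustrative only: the model path, batch size, and `limit` are placeholders, not values taken from this patch.

```python
# Illustrative sketch (not part of the patch): evaluate a *_llama task the same
# way the updated get_accuracy() does, i.e. with the chat template applied and
# few-shot examples formatted as multi-turn dialogue.
import lm_eval
from lm_eval.models.huggingface import HFLM

model_path = "Llama-3.1-8B-MXFP8"  # placeholder: path to an exported quantized model

lm = HFLM(pretrained=model_path, add_bos_token=True, batch_size=8)

results = lm_eval.simple_evaluate(
    lm,
    tasks=["gsm8k_llama"],
    apply_chat_template=True,
    fewshot_as_multiturn=True,
    limit=10,  # small limit, only for a quick smoke test
)

task_results = results["results"]["gsm8k_llama"]
# gsm8k_llama reports exact_match under the strict-match filter.
print(task_results.get("exact_match,strict-match", task_results))
```

This mirrors the `--apply_chat_template --fewshot_as_multiturn` flags that `run_benchmark.sh` now passes to `lm_eval` for the special tasks.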