diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md
index 2faac68dabe..0e6db647413 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md
@@ -79,6 +79,8 @@ Notes:
 Here we provide several recipes for Llama3 models. The relative accuracy loss of quantized model should be less than 1%.
 
 > Note: You can also enable static quantization for KV cache by adding `--static_kv_dtype fp8` argument to `quantize.py`, or `--static_kv_dtype=fp8` argument to `run_quant.sh` and `run_benchmark.sh`.
+>
+> You can also enable static quantization for attention by adding the `--static_attention_dtype fp8` argument to `quantize.py`, or `--static_attention_dtype=fp8` to `run_quant.sh` and `run_benchmark.sh`. When enabled, it automatically sets the KV cache dtype to fp8 as well.
 
 #### Llama 3.1 8B MXFP8
 
@@ -210,8 +212,10 @@ CUDA_VISIBLE_DEVICES=0,1 bash run_benchmark.sh --model_path=Llama-3.1-70B-MXFP8
 
 The script automatically:
 - Detects available GPUs from `CUDA_VISIBLE_DEVICES` and sets `tensor_parallel_size` accordingly
-- Runs default tasks: `piqa,hellaswag,mmlu_llama,gsm8k_llama` with batch size 8
+- Runs default tasks: `piqa,hellaswag,mmlu_llama,gsm8k_llama` with batch size 64
 - Supports custom task selection and batch size adjustment
+- Handles special tasks like `mmlu_llama`, `gsm8k_llama` (with chat template) and `longbench` (with extended context length) automatically
+- For longbench dataset evaluation, use the `--tasks=longbench` parameter
 
 ### NVFP4
 
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py
index feff13f9d20..158cde59ddd 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py
@@ -169,6 +169,13 @@ def get_accuracy(model_name_or_path, tokenizer=None, eval_tasks="mmlu", limit=No
         choices=["fp8", "float8_e4m3fn"],
         help="Data type for static quantize key and value.",
     )
+    parser.add_argument(
+        "--static_attention_dtype",
+        default=None,
+        type=str,
+        choices=["fp8", "float8_e4m3fn"],
+        help="Data type for static quantization of attention.",
+    )
     parser.add_argument("--use_recipe", action="store_true", help="whether to use recipe to quantize model")
     parser.add_argument("--recipe_file", type=str, default="recipes/Meta-Llama-3.1-8B-Instruct_6bits.json", help="path of recipe file")
     parser.add_argument("--iters", default=200, type=int, help="iters for autoround.")
@@ -256,6 +263,7 @@ def load_recipe_results(file_path):
         options=args.options,
         shared_layers=args.shared_layers,
         static_kv_dtype=args.static_kv_dtype,
+        static_attention_dtype=args.static_attention_dtype,
         enable_torch_compile=args.enable_torch_compile,
         low_gpu_mem_usage=args.low_gpu_mem_usage,
         export_format=args.export_format,
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/requirements.txt b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/requirements.txt
index 842aeea062d..8f2564787b4 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/requirements.txt
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/requirements.txt
@@ -1,7 +1,10 @@
 transformers==4.57.3
 torch==2.9.0
 torchvision==0.24.0
-lm_eval==0.4.9.2
+lm_eval>=0.4.9.2
 datasets==4.4.2
 auto-round==0.9.3
 neural-compressor-pt>=3.7
+jieba
+fuzzywuzzy
+rouge
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh
index 6aaa3a4da56..6127f968733 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh
@@ -7,6 +7,7 @@ TASKS="piqa,hellaswag,mmlu_llama,gsm8k_llama"
 BATCH_SIZE=64
 GPU_MEMORY_UTILIZATION=0.8
 KV_CACHE_DTYPE="auto"
+ATTN_DTYPE="auto"
 
 while [[ $# -gt 0 ]]; do
     case $1 in
@@ -30,6 +31,10 @@ while [[ $# -gt 0 ]]; do
             KV_CACHE_DTYPE="${1#*=}"
             shift
             ;;
+        --static_attention_dtype=*)
+            ATTN_DTYPE="${1#*=}"
+            shift
+            ;;
         *)
             echo "Unknown parameter: $1"
             exit 1
@@ -44,6 +49,14 @@ if [[ "$KV_CACHE_DTYPE" == "fp8" ]]; then
     echo "Using FP8 for KV cache"
 fi
+
+# for fp8 attention cache
+if [[ "$ATTN_DTYPE" == "fp8" ]]; then
+    export VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION=0
+    export VLLM_ATTENTION_BACKEND="FLASHINFER"
+    KV_CACHE_DTYPE="fp8"
+    echo "Using FP8 Attention"
+fi
+
 # Validate required parameters
 if [[ -z "$MODEL_PATH" ]]; then
     echo "Usage: bash run_benchmark.sh --model_path= [--tasks=] [--batch_size=]"
@@ -103,10 +116,11 @@ run_evaluation() {
 }
 
-# Check if tasks contain gsm8k_llama or mmlu_llama
+# Check if tasks contain gsm8k_llama, mmlu_llama, or longbench
 NEED_SPLIT=false
 OTHER_TASKS="$TASKS"
 SPECIAL_TASKS=""
+LONGBENCH_TASK=""
 
 if [[ "$TASKS" == *"gsm8k_llama"* ]]; then
     SPECIAL_TASKS="gsm8k_llama"
 
@@ -122,26 +136,24 @@ if [[ "$TASKS" == *"mmlu_llama"* ]]; then
     OTHER_TASKS=$(echo "$OTHER_TASKS" | sed 's/,*mmlu_llama,*//' | sed 's/^,//' | sed 's/,$//')
     NEED_SPLIT=true
 fi
+if [[ "$TASKS" == *"longbench"* ]]; then
+    LONGBENCH_TASK="longbench"
+    OTHER_TASKS=$(echo "$OTHER_TASKS" | sed 's/,*longbench,*//' | sed 's/^,//' | sed 's/,$//')
+    NEED_SPLIT=true
+fi
 
 if [[ "$NEED_SPLIT" == true ]]; then
     if [[ -n "$OTHER_TASKS" ]]; then
         echo "Running general tasks"
         run_evaluation "$OTHER_TASKS" true ""
-        if [[ $? -eq 0 ]]; then
-            IFS=',' read -ra SPECIAL_ARRAY <<< "$SPECIAL_TASKS"
-            for special_task in "${SPECIAL_ARRAY[@]}"; do
-                echo "Running $special_task with chat template"
-                run_evaluation "$special_task" true "--apply_chat_template --fewshot_as_multiturn"
-                if [[ $? -ne 0 ]]; then
-                    echo "Benchmark failed on $special_task!"
-                    exit 1
-                fi
-            done
-        else
+        if [[ $? -ne 0 ]]; then
             echo "Skipping special tasks due to previous failure"
             exit 1
         fi
-    else
+    fi
+
+    # Run special tasks (gsm8k_llama, mmlu_llama)
+    if [[ -n "$SPECIAL_TASKS" ]]; then
         IFS=',' read -ra SPECIAL_ARRAY <<< "$SPECIAL_TASKS"
         for special_task in "${SPECIAL_ARRAY[@]}"; do
             echo "Running $special_task with chat template"
@@ -152,6 +164,26 @@ if [[ "$NEED_SPLIT" == true ]]; then
             fi
         done
     fi
+
+    # Run longbench task with special configuration
+    if [[ -n "$LONGBENCH_TASK" ]]; then
+        echo "Running longbench with special configuration"
+        longbench_cmd="lm_eval --model vllm --model_args pretrained=\"$MODEL_PATH\",trust_remote_code=True,dtype=bfloat16,max_model_len=66000,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=$GPU_MEMORY_UTILIZATION,enable_prefix_caching=False --tasks longbench --seed 42 --batch_size $BATCH_SIZE --apply_chat_template --gen_kwargs '{\"temperature\":0.0}'"
+        echo "Executing command: $longbench_cmd"
+
+        lm_eval --model vllm \
+            --model_args pretrained="$MODEL_PATH",trust_remote_code=True,dtype=bfloat16,max_model_len=66000,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=$GPU_MEMORY_UTILIZATION,enable_prefix_caching=False \
+            --tasks longbench \
+            --seed 42 \
+            --batch_size $BATCH_SIZE \
+            --apply_chat_template \
+            --gen_kwargs '{"temperature":0.0}'
+
+        if [[ $? -ne 0 ]]; then
+            echo "Benchmark failed on longbench!"
+            exit 1
+        fi
+    fi
 else
     run_evaluation "$TASKS" true ""
 fi
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh
index a53443831f0..9a18c274716 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh
@@ -4,6 +4,7 @@
 
 # Parse command line arguments
 KV_CACHE_DTYPE="auto"
+STATIC_ATTENTION_DTYPE="auto"
 while [[ $# -gt 0 ]]; do
     case $1 in
         --topology=*)
@@ -26,6 +27,10 @@
             KV_CACHE_DTYPE="${1#*=}"
             shift
             ;;
+        --static_attention_dtype=*)
+            STATIC_ATTENTION_DTYPE="${1#*=}"
+            shift
+            ;;
         *)
             echo "Unknown parameter: $1"
             exit 1
@@ -48,10 +53,12 @@ echo " Input Model: $INPUT_MODEL"
 echo " Output Model: $OUTPUT_MODEL"
 
 # Set common parameters
-if [ "$KV_CACHE_DTYPE" = "auto" ]; then
-    COMMON_ARGS="--quantize --enable_torch_compile --low_gpu_mem_usage --export_format auto_round"
-else
-    COMMON_ARGS="--quantize --enable_torch_compile --low_gpu_mem_usage --export_format auto_round --static_kv_dtype $KV_CACHE_DTYPE"
+COMMON_ARGS="--quantize --enable_torch_compile --low_gpu_mem_usage --export_format auto_round"
+if [ "$KV_CACHE_DTYPE" != "auto" ]; then
+    COMMON_ARGS="$COMMON_ARGS --static_kv_dtype $KV_CACHE_DTYPE"
+fi
+if [ "$STATIC_ATTENTION_DTYPE" != "auto" ]; then
+    COMMON_ARGS="$COMMON_ARGS --static_attention_dtype $STATIC_ATTENTION_DTYPE"
 fi
 
 case "$TOPOLOGY" in
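
A minimal usage sketch for the new flag, not part of the patch above: it only combines options that the modified scripts already parse, while the topology name, model paths, task list, and GPU ids are illustrative placeholders.

# Quantize with static FP8 attention; per the README note this also enables FP8 KV cache.
bash run_quant.sh --topology=<topology> --input_model=<input_model_dir> --output_model=<output_dir> --static_attention_dtype=fp8

# Benchmark the result; with --static_attention_dtype=fp8 run_benchmark.sh exports
# VLLM_ATTENTION_BACKEND=FLASHINFER and forces the vLLM KV cache dtype to fp8.
CUDA_VISIBLE_DEVICES=0,1 bash run_benchmark.sh --model_path=<output_dir> --static_attention_dtype=fp8 --tasks=piqa,hellaswag,mmlu_llama,gsm8k_llama,longbench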