Commit 0341e30

add fp8 attention and longbench task for llama4
Signed-off-by: Mengni Wang <mengni.wang@intel.com>
1 parent 1b867f0 commit 0341e30

5 files changed: +37 -1 lines changed

examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/README.md

Lines changed: 2 additions & 0 deletions
@@ -33,6 +33,8 @@ CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=llama4_mxfp4 --input_model=L
 ```
 
 > Note: You can also enable static quantization for KV cache by adding `--static_kv_dtype fp8` argument to `main.py`, or `--static_kv_dtype=fp8` argument to `run_quant.sh` and `run_benchmark.sh`.
+>
+> You can also enable static quantization for attention by adding `--static_attention_dtype fp8` argument to `main.py`, or `--static_attention_dtype=fp8` argument to `run_quant.sh` and `run_benchmark.sh`. When enabled, it automatically sets KV cache dtype to fp8 as well.
 
 ## 2. Benchmark
 
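For example, extending the README's existing quantization command, fp8 attention can be enabled with something along these lines (illustrative; the model path is a placeholder):

```bash
CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=llama4_mxfp4 --input_model=<path-to-llama4-model> --static_attention_dtype=fp8
```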

examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/main.py

Lines changed: 8 additions & 1 deletion
@@ -80,7 +80,13 @@ def setup_parser():
         choices=["fp8", "float8_e4m3fn"],
         help="Data type for static quantize key and value."
     )
-
+    parser.add_argument(
+        "--static_attention_dtype",
+        default=None,
+        type=str,
+        choices=["fp8", "float8_e4m3fn"],
+        help="Data type for static quantize query, key and value."
+    )
     parser.add_argument(
         "--iters",
         "--iter",
@@ -122,6 +128,7 @@ def tune(args):
         output_dir=args.output_dir,
         processor=processor,
         static_kv_dtype=args.static_kv_dtype,
+        static_attention_dtype=args.static_attention_dtype,
         reloading=False,
     )
     model = prepare(model, qconfig)
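Since `--static_attention_dtype` is a plain argparse option, a quick way to confirm it is wired in is the help output (an illustrative command, run from the llama4 example directory):

```bash
python3 main.py --help | grep -A 1 static_attention_dtype
# should show --static_attention_dtype with choices {fp8,float8_e4m3fn}
```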

examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/requirements.txt

Lines changed: 3 additions & 0 deletions
@@ -2,3 +2,6 @@ lm-eval==0.4.9.1
 setuptools_scm
 torchao==0.12.0
 triton==3.3.1
+jieba
+fuzzywuzzy
+rouge
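The three new packages are presumably needed for longbench metric computation; after pulling this change, refresh the environment with the usual command:

```bash
pip install -r requirements.txt
```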

examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_benchmark.sh

Lines changed: 15 additions & 0 deletions
@@ -30,6 +30,9 @@ function init_params {
       --static_kv_dtype=*)
           kv_cache_dtype=$(echo $var |cut -f2 -d=)
       ;;
+      --static_attention_dtype=*)
+          attention_dtype=$(echo $var |cut -f2 -d=)
+      ;;
     esac
   done
 
@@ -41,6 +44,7 @@ function run_benchmark {
     extra_model_args=""
     extra_cmd=""
     kv_cache_dtype=${kv_cache_dtype:="auto"}
+    attention_dtype=${attention_dtype:="auto"}
     batch_size=${batch_size:=1}
 
     if [ "${topology}" = "llama4_mxfp4" ]; then
@@ -57,6 +61,10 @@ function run_benchmark {
     if [[ "${tasks}" == *"chartqa"* || "${tasks}" == *"mmmu_val"* ]]; then
         model="vllm-vlm"
         extra_cmd=${extra_cmd}" --apply_chat_template"
+    elif [[ "${tasks}" == *"longbench"* ]]; then
+        model="vllm"
+        extra_cmd="--seed 42 --apply_chat_template --gen_kwargs {\"temperature\":0.0} "
+        extra_model_args="max_model_len=66000,gpu_memory_utilization=0.7"
     else
         model="vllm"
     fi
@@ -67,6 +75,13 @@ function run_benchmark {
         echo "Using FP8 for KV cache"
     fi
 
+    if [[ "${attention_dtype}" == "fp8" ]]; then
+        export VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION=0
+        export VLLM_ATTENTION_BACKEND="FLASHINFER"
+        kv_cache_dtype="fp8"
+        echo "Using FP8 Attention"
+    fi
+
     NCCL_NVLS_ENABLE=0 VLLM_USE_STANDALONE_COMPILE=0 VLLM_WORKER_MULTIPROC_METHOD=spawn \
     lm_eval --model ${model} \
         --model_args pretrained=${input_model},tensor_parallel_size=${tp_size},${extra_model_args},enable_expert_parallel=True,kv_cache_dtype=${kv_cache_dtype} \
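Taken together, a longbench run with fp8 attention switches vLLM to the FlashInfer backend, leaves FlashInfer query quantization enabled, forces the fp8 KV cache, and caps context length and GPU memory utilization. Hand-expanding the script's variables gives roughly the invocation below (an illustrative sketch; `${input_model}` and `${tp_size}` are user inputs, the task name is whatever longbench task the user passes, and any trailing lm_eval flags from the script are omitted):

```bash
export VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION=0
export VLLM_ATTENTION_BACKEND="FLASHINFER"

NCCL_NVLS_ENABLE=0 VLLM_USE_STANDALONE_COMPILE=0 VLLM_WORKER_MULTIPROC_METHOD=spawn \
lm_eval --model vllm \
    --model_args pretrained=${input_model},tensor_parallel_size=${tp_size},max_model_len=66000,gpu_memory_utilization=0.7,enable_expert_parallel=True,kv_cache_dtype=fp8 \
    --tasks longbench \
    --seed 42 --apply_chat_template --gen_kwargs '{"temperature":0.0}'
```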

examples/pytorch/multimodal-modeling/quantization/auto_round/llama4/run_quant.sh

Lines changed: 9 additions & 0 deletions
@@ -31,6 +31,9 @@ function init_params {
       --static_kv_dtype=*)
           kv_cache_dtype=$(echo $var |cut -f2 -d=)
       ;;
+      --static_attention_dtype=*)
+          attention_dtype=$(echo $var |cut -f2 -d=)
+      ;;
       *)
           echo "Error: No such parameter: ${var}"
           exit 1
@@ -46,6 +49,7 @@ function run_tuning {
     tuned_checkpoint=${tuned_checkpoint:="saved_results"}
     iters=${iters:=0}
     kv_cache_dtype=${kv_cache_dtype:="auto"}
+    attention_dtype=${attention_dtype:="auto"}
 
     if [ "${topology}" = "llama4_mxfp4" ]; then
         extra_cmd="--fp_layers lm-head,self_attn,router,vision_model,multi_modal_projector,shared_expert --scheme MXFP4 --export_format auto_round"
@@ -55,6 +59,11 @@ function run_tuning {
         extra_cmd=${extra_cmd}" --static_kv_dtype ${kv_cache_dtype}"
     fi
 
+    if [[ ! "${attention_dtype}" = "auto" ]]; then
+        extra_cmd=${extra_cmd}" --static_attention_dtype ${attention_dtype}"
+    fi
+
+
     python3 main.py \
         --model ${input_model} \
         --iters ${iters} \
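With `--topology=llama4_mxfp4 --static_attention_dtype=fp8`, run_tuning therefore ends up calling main.py with roughly the flags below (a hand-expanded sketch; iters defaults to 0, and any parts of the `python3 main.py` call not visible in this diff are omitted):

```bash
python3 main.py \
    --model ${input_model} \
    --iters 0 \
    --fp_layers lm-head,self_attn,router,vision_model,multi_modal_projector,shared_expert \
    --scheme MXFP4 \
    --export_format auto_round \
    --static_attention_dtype fp8
```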
