diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md
index 2faac68dabe..0e6db647413 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md
@@ -79,6 +79,8 @@ Notes:
 Here we provide several recipes for Llama3 models. The relative accuracy loss of quantized model should be less than 1%.
 
 > Note: You can also enable static quantization for KV cache by adding `--static_kv_dtype fp8` argument to `quantize.py`, or `--static_kv_dtype=fp8` argument to `run_quant.sh` and `run_benchmark.sh`.
+>
+> You can also enable static quantization for attention by adding the `--static_attention_dtype fp8` argument to `quantize.py`, or `--static_attention_dtype=fp8` to `run_quant.sh` and `run_benchmark.sh`. When enabled, it automatically sets the KV cache dtype to fp8 as well.
 
 #### Llama 3.1 8B MXFP8
 
@@ -210,8 +212,10 @@ CUDA_VISIBLE_DEVICES=0,1 bash run_benchmark.sh --model_path=Llama-3.1-70B-MXFP8
 
 The script automatically:
 - Detects available GPUs from `CUDA_VISIBLE_DEVICES` and sets `tensor_parallel_size` accordingly
-- Runs default tasks: `piqa,hellaswag,mmlu_llama,gsm8k_llama` with batch size 8
+- Runs default tasks: `piqa,hellaswag,mmlu_llama,gsm8k_llama` with batch size 64
 - Supports custom task selection and batch size adjustment
+- Handles special tasks like `mmlu_llama`, `gsm8k_llama` (with chat template) and `longbench` (with extended context length) automatically
+- For longbench dataset evaluation, use the `--tasks=longbench` parameter
 
 ### NVFP4
 
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py
index feff13f9d20..158cde59ddd 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py
@@ -169,6 +169,13 @@ def get_accuracy(model_name_or_path, tokenizer=None, eval_tasks="mmlu", limit=No
         choices=["fp8", "float8_e4m3fn"],
         help="Data type for static quantize key and value.",
     )
+    parser.add_argument(
+        "--static_attention_dtype",
+        default=None,
+        type=str,
+        choices=["fp8", "float8_e4m3fn"],
+        help="Data type for static quantization of attention.",
+    )
     parser.add_argument("--use_recipe", action="store_true", help="whether to use recipe to quantize model")
     parser.add_argument("--recipe_file", type=str, default="recipes/Meta-Llama-3.1-8B-Instruct_6bits.json", help="path of recipe file")
     parser.add_argument("--iters", default=200, type=int, help="iters for autoround.")
@@ -256,6 +263,7 @@ def load_recipe_results(file_path):
         options=args.options,
         shared_layers=args.shared_layers,
         static_kv_dtype=args.static_kv_dtype,
+        static_attention_dtype=args.static_attention_dtype,
         enable_torch_compile=args.enable_torch_compile,
         low_gpu_mem_usage=args.low_gpu_mem_usage,
         export_format=args.export_format,
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/requirements.txt b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/requirements.txt
index 842aeea062d..8f2564787b4 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/requirements.txt
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/requirements.txt
@@ -1,7 +1,10 @@
 transformers==4.57.3
 torch==2.9.0
 torchvision==0.24.0
-lm_eval==0.4.9.2
+lm_eval>=0.4.9.2
 datasets==4.4.2
 auto-round==0.9.3
 neural-compressor-pt>=3.7
+jieba
+fuzzywuzzy
+rouge
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh
index 6aaa3a4da56..6127f968733 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh
@@ -7,6 +7,7 @@ TASKS="piqa,hellaswag,mmlu_llama,gsm8k_llama"
 BATCH_SIZE=64
 GPU_MEMORY_UTILIZATION=0.8
 KV_CACHE_DTYPE="auto"
+ATTN_DTYPE="auto"
 
 while [[ $# -gt 0 ]]; do
     case $1 in
@@ -30,6 +31,10 @@ while [[ $# -gt 0 ]]; do
             KV_CACHE_DTYPE="${1#*=}"
             shift
             ;;
+        --static_attention_dtype=*)
+            ATTN_DTYPE="${1#*=}"
+            shift
+            ;;
         *)
             echo "Unknown parameter: $1"
             exit 1
@@ -44,6 +49,14 @@ if [[ "$KV_CACHE_DTYPE" == "fp8" ]]; then
     echo "Using FP8 for KV cache"
 fi
+
+# for fp8 attention cache
+if [[ "$ATTN_DTYPE" == "fp8" ]]; then
+    export VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION=0
+    export VLLM_ATTENTION_BACKEND="FLASHINFER"
+    KV_CACHE_DTYPE="fp8"
+    echo "Using FP8 Attention"
+fi
+
 # Validate required parameters
 if [[ -z "$MODEL_PATH" ]]; then
     echo "Usage: bash run_benchmark.sh --model_path= [--tasks=] [--batch_size=]"
@@ -103,10 +116,11 @@ run_evaluation() {
 }
 
-# Check if tasks contain gsm8k_llama or mmlu_llama
+# Check if tasks contain gsm8k_llama, mmlu_llama, or longbench
 NEED_SPLIT=false
 OTHER_TASKS="$TASKS"
 SPECIAL_TASKS=""
+LONGBENCH_TASK=""
 
 if [[ "$TASKS" == *"gsm8k_llama"* ]]; then
     SPECIAL_TASKS="gsm8k_llama"
 
@@ -122,26 +136,24 @@ if [[ "$TASKS" == *"mmlu_llama"* ]]; then
     OTHER_TASKS=$(echo "$OTHER_TASKS" | sed 's/,*mmlu_llama,*//' | sed 's/^,//' | sed 's/,$//')
     NEED_SPLIT=true
 fi
+if [[ "$TASKS" == *"longbench"* ]]; then
+    LONGBENCH_TASK="longbench"
+    OTHER_TASKS=$(echo "$OTHER_TASKS" | sed 's/,*longbench,*//' | sed 's/^,//' | sed 's/,$//')
+    NEED_SPLIT=true
+fi
 
 if [[ "$NEED_SPLIT" == true ]]; then
     if [[ -n "$OTHER_TASKS" ]]; then
         echo "Running general tasks"
         run_evaluation "$OTHER_TASKS" true ""
-        if [[ $? -eq 0 ]]; then
-            IFS=',' read -ra SPECIAL_ARRAY <<< "$SPECIAL_TASKS"
-            for special_task in "${SPECIAL_ARRAY[@]}"; do
-                echo "Running $special_task with chat template"
-                run_evaluation "$special_task" true "--apply_chat_template --fewshot_as_multiturn"
-                if [[ $? -ne 0 ]]; then
-                    echo "Benchmark failed on $special_task!"
-                    exit 1
-                fi
-            done
-        else
+        if [[ $? -ne 0 ]]; then
             echo "Skipping special tasks due to previous failure"
             exit 1
         fi
-    else
+    fi
+
+    # Run special tasks (gsm8k_llama, mmlu_llama)
+    if [[ -n "$SPECIAL_TASKS" ]]; then
         IFS=',' read -ra SPECIAL_ARRAY <<< "$SPECIAL_TASKS"
         for special_task in "${SPECIAL_ARRAY[@]}"; do
             echo "Running $special_task with chat template"
@@ -152,6 +164,26 @@ if [[ "$NEED_SPLIT" == true ]]; then
             fi
         done
     fi
+
+    # Run longbench task with special configuration
+    if [[ -n "$LONGBENCH_TASK" ]]; then
+        echo "Running longbench with special configuration"
+        longbench_cmd="lm_eval --model vllm --model_args pretrained=\"$MODEL_PATH\",trust_remote_code=True,dtype=bfloat16,max_model_len=66000,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=$GPU_MEMORY_UTILIZATION,enable_prefix_caching=False --tasks longbench --seed 42 --batch_size $BATCH_SIZE --apply_chat_template --gen_kwargs '{\"temperature\":0.0}'"
+        echo "Executing command: $longbench_cmd"
+
+        lm_eval --model vllm \
+            --model_args pretrained="$MODEL_PATH",trust_remote_code=True,dtype=bfloat16,max_model_len=66000,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=$GPU_MEMORY_UTILIZATION,enable_prefix_caching=False \
+            --tasks longbench \
+            --seed 42 \
+            --batch_size $BATCH_SIZE \
+            --apply_chat_template \
+            --gen_kwargs '{"temperature":0.0}'
+
+        if [[ $? -ne 0 ]]; then
+            echo "Benchmark failed on longbench!"
+            exit 1
+        fi
+    fi
 else
     run_evaluation "$TASKS" true ""
 fi
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh
index a53443831f0..9a18c274716 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh
@@ -4,6 +4,7 @@
 
 # Parse command line arguments
 KV_CACHE_DTYPE="auto"
+STATIC_ATTENTION_DTYPE="auto"
 while [[ $# -gt 0 ]]; do
     case $1 in
         --topology=*)
@@ -26,6 +27,10 @@
             KV_CACHE_DTYPE="${1#*=}"
             shift
             ;;
+        --static_attention_dtype=*)
+            STATIC_ATTENTION_DTYPE="${1#*=}"
+            shift
+            ;;
         *)
             echo "Unknown parameter: $1"
             exit 1
@@ -48,10 +53,12 @@ echo " Input Model: $INPUT_MODEL"
 echo " Output Model: $OUTPUT_MODEL"
 
 # Set common parameters
-if [ "$KV_CACHE_DTYPE" = "auto" ]; then
-    COMMON_ARGS="--quantize --enable_torch_compile --low_gpu_mem_usage --export_format auto_round"
-else
-    COMMON_ARGS="--quantize --enable_torch_compile --low_gpu_mem_usage --export_format auto_round --static_kv_dtype $KV_CACHE_DTYPE"
+COMMON_ARGS="--quantize --enable_torch_compile --low_gpu_mem_usage --export_format auto_round"
+if [ "$KV_CACHE_DTYPE" != "auto" ]; then
+    COMMON_ARGS="$COMMON_ARGS --static_kv_dtype $KV_CACHE_DTYPE"
+fi
+if [ "$STATIC_ATTENTION_DTYPE" != "auto" ]; then
+    COMMON_ARGS="$COMMON_ARGS --static_attention_dtype $STATIC_ATTENTION_DTYPE"
 fi
 
 case "$TOPOLOGY" in
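
A minimal usage sketch for the new flag, not part of the patch above: it only combines options that the modified scripts already parse, while the topology name, model paths, task list, and GPU ids are illustrative placeholders.

# Quantize with static FP8 attention; per the README note this also enables FP8 KV cache.
bash run_quant.sh --topology=<topology> --input_model=<input_model_dir> --output_model=<output_dir> --static_attention_dtype=fp8

# Benchmark the result; with --static_attention_dtype=fp8 run_benchmark.sh exports
# VLLM_ATTENTION_BACKEND=FLASHINFER and forces the vLLM KV cache dtype to fp8.
CUDA_VISIBLE_DEVICES=0,1 bash run_benchmark.sh --model_path=<output_dir> --static_attention_dtype=fp8 --tasks=piqa,hellaswag,mmlu_llama,gsm8k_llama,longbench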