@@ -79,6 +79,8 @@ Notes:
Here we provide several recipes for Llama3 models. The relative accuracy loss of the quantized model should be less than 1%.

> Note: You can also enable static quantization for the KV cache by adding the `--static_kv_dtype fp8` argument to `quantize.py`, or the `--static_kv_dtype=fp8` argument to `run_quant.sh` and `run_benchmark.sh`.
>
> You can also enable static quantization for attention by adding the `--static_attention_dtype fp8` argument to `quantize.py`, or the `--static_attention_dtype=fp8` argument to `run_quant.sh` and `run_benchmark.sh`. When enabled, the KV cache dtype is automatically set to fp8 as well (see the example below).
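
A minimal sketch of passing these flags through the helper scripts (the topology name and model path are placeholders, and any other required arguments are omitted):

```bash
# Quantize with FP8 attention; the KV cache dtype is set to fp8 as well
bash run_quant.sh --topology=<topology_name> --static_attention_dtype=fp8

# Benchmark the quantized model with FP8 attention enabled
CUDA_VISIBLE_DEVICES=0,1 bash run_benchmark.sh --model_path=<quantized_model_dir> --static_attention_dtype=fp8
```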

#### Llama 3.1 8B MXFP8

@@ -210,8 +212,10 @@ CUDA_VISIBLE_DEVICES=0,1 bash run_benchmark.sh --model_path=Llama-3.1-70B-MXFP8

The script automatically:
- Detects available GPUs from `CUDA_VISIBLE_DEVICES` and sets `tensor_parallel_size` accordingly
- - Runs default tasks: `piqa,hellaswag,mmlu_llama,gsm8k_llama` with batch size 8
+ - Runs default tasks: `piqa,hellaswag,mmlu_llama,gsm8k_llama` with batch size 64
- Supports custom task selection and batch size adjustment
- Handles special tasks such as `mmlu_llama` and `gsm8k_llama` (with chat template) and `longbench` (with extended context length) automatically
- For longbench dataset evaluation, use the `--tasks=longbench` parameter (see the example below)
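
For example, a run that overrides the default task list and batch size might look like the following (values are illustrative):

```bash
CUDA_VISIBLE_DEVICES=0,1 bash run_benchmark.sh --model_path=Llama-3.1-70B-MXFP8 --tasks=longbench --batch_size=16
```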


### NVFP4
@@ -169,6 +169,13 @@ def get_accuracy(model_name_or_path, tokenizer=None, eval_tasks="mmlu", limit=No
choices=["fp8", "float8_e4m3fn"],
help="Data type for static quantize key and value.",
)
parser.add_argument(
"--static_attention_dtype",
default=None,
type=str,
choices=["fp8", "float8_e4m3fn"],
help="Data type for static quantize key and value.",
)
parser.add_argument("--use_recipe", action="store_true", help="whether to use recipe to quantize model")
parser.add_argument("--recipe_file", type=str, default="recipes/Meta-Llama-3.1-8B-Instruct_6bits.json", help="path of recipe file")
parser.add_argument("--iters", default=200, type=int, help="iters for autoround.")
@@ -256,6 +263,7 @@ def load_recipe_results(file_path):
options=args.options,
shared_layers=args.shared_layers,
static_kv_dtype=args.static_kv_dtype,
static_attention_dtype=args.static_attention_dtype,
enable_torch_compile=args.enable_torch_compile,
low_gpu_mem_usage=args.low_gpu_mem_usage,
export_format=args.export_format,
@@ -1,7 +1,10 @@
transformers==4.57.3
torch==2.9.0
torchvision==0.24.0
- lm_eval==0.4.9.2
+ lm_eval>=0.4.9.2
datasets==4.4.2
auto-round==0.9.3
neural-compressor-pt>=3.7
jieba
fuzzywuzzy
rouge
@@ -7,6 +7,7 @@ TASKS="piqa,hellaswag,mmlu_llama,gsm8k_llama"
BATCH_SIZE=64
GPU_MEMORY_UTILIZATION=0.8
KV_CACHE_DTYPE="auto"
ATTN_DTYPE="auto"

while [[ $# -gt 0 ]]; do
case $1 in
@@ -30,6 +31,10 @@ while [[ $# -gt 0 ]]; do
KV_CACHE_DTYPE="${1#*=}"
shift
;;
--static_attention_dtype=*)
ATTN_DTYPE="${1#*=}"
shift
;;
*)
echo "Unknown parameter: $1"
exit 1
@@ -44,6 +49,14 @@ if [[ "$KV_CACHE_DTYPE" == "fp8" ]]; then
echo "Using FP8 for KV cache"
fi

# for fp8 attention (also forces the KV cache dtype to fp8)
if [[ "$ATTN_DTYPE" == "fp8" ]]; then
export VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION=0
export VLLM_ATTENTION_BACKEND="FLASHINFER"
KV_CACHE_DTYPE="fp8"
echo "Using FP8 Attention"
fi

# Validate required parameters
if [[ -z "$MODEL_PATH" ]]; then
echo "Usage: bash run_benchmark.sh --model_path=<path_to_quantized_model> [--tasks=<tasks>] [--batch_size=<size>]"
@@ -103,10 +116,11 @@ run_evaluation() {
}


- # Check if tasks contain gsm8k_llama or mmlu_llama
+ # Check if tasks contain gsm8k_llama, mmlu_llama, or longbench
NEED_SPLIT=false
OTHER_TASKS="$TASKS"
SPECIAL_TASKS=""
LONGBENCH_TASK=""

if [[ "$TASKS" == *"gsm8k_llama"* ]]; then
SPECIAL_TASKS="gsm8k_llama"
@@ -122,26 +136,24 @@ if [[ "$TASKS" == *"mmlu_llama"* ]]; then
OTHER_TASKS=$(echo "$OTHER_TASKS" | sed 's/,*mmlu_llama,*//' | sed 's/^,//' | sed 's/,$//')
NEED_SPLIT=true
fi
if [[ "$TASKS" == *"longbench"* ]]; then
LONGBENCH_TASK="longbench"
OTHER_TASKS=$(echo "$OTHER_TASKS" | sed 's/,*longbench,*//' | sed 's/^,//' | sed 's/,$//')
NEED_SPLIT=true
fi

if [[ "$NEED_SPLIT" == true ]]; then
if [[ -n "$OTHER_TASKS" ]]; then
echo "Running general tasks"
run_evaluation "$OTHER_TASKS" true ""
- if [[ $? -eq 0 ]]; then
- IFS=',' read -ra SPECIAL_ARRAY <<< "$SPECIAL_TASKS"
- for special_task in "${SPECIAL_ARRAY[@]}"; do
- echo "Running $special_task with chat template"
- run_evaluation "$special_task" true "--apply_chat_template --fewshot_as_multiturn"
- if [[ $? -ne 0 ]]; then
- echo "Benchmark failed on $special_task!"
- exit 1
- fi
- done
- else
+ if [[ $? -ne 0 ]]; then
echo "Skipping special tasks due to previous failure"
exit 1
fi
- else
+ fi

# Run special tasks (gsm8k_llama, mmlu_llama)
if [[ -n "$SPECIAL_TASKS" ]]; then
IFS=',' read -ra SPECIAL_ARRAY <<< "$SPECIAL_TASKS"
for special_task in "${SPECIAL_ARRAY[@]}"; do
echo "Running $special_task with chat template"
@@ -152,6 +164,26 @@ if [[ "$NEED_SPLIT" == true ]]; then
fi
done
fi

# Run longbench task with special configuration
if [[ -n "$LONGBENCH_TASK" ]]; then
echo "Running longbench with special configuration"
longbench_cmd="lm_eval --model vllm --model_args pretrained=\"$MODEL_PATH\",trust_remote_code=True,dtype=bfloat16,max_model_len=66000,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=$GPU_MEMORY_UTILIZATION,enable_prefix_caching=False --tasks longbench --seed 42 --batch_size $BATCH_SIZE --apply_chat_template --gen_kwargs '{\"temperature\":0.0}'"
echo "Executing command: $longbench_cmd"

lm_eval --model vllm \
--model_args pretrained="$MODEL_PATH",trust_remote_code=True,dtype=bfloat16,max_model_len=66000,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=$GPU_MEMORY_UTILIZATION,enable_prefix_caching=False \
--tasks longbench \
--seed 42 \
--batch_size $BATCH_SIZE \
--apply_chat_template \
--gen_kwargs '{"temperature":0.0}'

if [[ $? -ne 0 ]]; then
echo "Benchmark failed on longbench!"
exit 1
fi
fi
else
run_evaluation "$TASKS" true ""
fi
@@ -4,6 +4,7 @@

# Parse command line arguments
KV_CACHE_DTYPE="auto"
STATIC_ATTENTION_DTYPE="auto"
while [[ $# -gt 0 ]]; do
case $1 in
--topology=*)
@@ -26,6 +27,10 @@ while [[ $# -gt 0 ]]; do
KV_CACHE_DTYPE="${1#*=}"
shift
;;
--static_attention_dtype=*)
STATIC_ATTENTION_DTYPE="${1#*=}"
shift
;;
*)
echo "Unknown parameter: $1"
exit 1
@@ -48,10 +53,12 @@ echo " Input Model: $INPUT_MODEL"
echo " Output Model: $OUTPUT_MODEL"

# Set common parameters
if [ "$KV_CACHE_DTYPE" = "auto" ]; then
COMMON_ARGS="--quantize --enable_torch_compile --low_gpu_mem_usage --export_format auto_round"
else
COMMON_ARGS="--quantize --enable_torch_compile --low_gpu_mem_usage --export_format auto_round --static_kv_dtype $KV_CACHE_DTYPE"
COMMON_ARGS="--quantize --enable_torch_compile --low_gpu_mem_usage --export_format auto_round"
if [ "$KV_CACHE_DTYPE" != "auto" ]; then
COMMON_ARGS="$COMMON_ARGS --static_kv_dtype $KV_CACHE_DTYPE"
fi
if [ "$STATIC_ATTENTION_DTYPE" != "auto" ]; then
COMMON_ARGS="$COMMON_ARGS --static_attention_dtype $STATIC_ATTENTION_DTYPE"
fi

case "$TOPOLOGY" in