4 changes: 2 additions & 2 deletions .github/workflows/pr-code-sync.yml
@@ -33,8 +33,8 @@ jobs:
else
echo "SHAs are not aligned. Internal SHA: $internal_sha, External SHA: $external_sha"
fi
git clone --depth=1 -b $EXTERNAL_BRANCH https://${{ secrets.REPO_SYNC_USER_NAME }}:${{ secrets.REPO_SYNC_SSH_PRIVATE_KEY }}@github.com/${EXTERNAL_REPO}.git neural-compressor

git clone --depth=1 -b $EXTERNAL_BRANCH https://${{ secrets.REPO_SYNC_USER_NAME }}:${{ secrets.REPO_SYNC_SSH_PRIVATE_KEY }}@github.com/${EXTERNAL_REPO}.git neural-compressor
cd neural-compressor
git remote add innersource https://${{ secrets.REPO_SYNC_USER_NAME }}:${{ secrets.REPO_SYNC_SSH_PRIVATE_KEY }}@github.com/${INNERSOURCE_REPO}.git
git fetch innersource ${INNERSOURCE_BRANCH} --depth=100
@@ -0,0 +1,77 @@

## Support Matrix

| Model Family | MXFP4 | MXFP8 |
| ----------------------- | ----- | ----- |
| Qwen/Qwen3-235B-A22B | ✅ | ✅ |
| deepseek-ai/DeepSeek-R1 | ✅ | ✅ |

### Quantize Model
- Export the model paths
```bash
export QWEN_MODEL=Qwen/Qwen3-235B-A22B
export DS_MODEL=deepseek-ai/DeepSeek-R1
```

- MXFP8
```bash
python quantize.py --model $QWEN_MODEL -t qwen_mxfp8 --use_autoround_format --output_dir ./qmodels
python quantize.py --model $DS_MODEL -t ds_mxfp8 --use_autoround_format --output_dir ./qmodels
```

- MXFP4
```bash
python quantize.py --model $QWEN_MODEL -t qwen_mxfp4 --use_autoround_format --output_dir ./qmodels
python quantize.py --model $DS_MODEL -t ds_mxfp4 --use_autoround_format --output_dir ./qmodels
```
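
Under the hood, `quantize.py` (included alongside this README) drives AutoRound; the sketch below shows the equivalent direct API call for the Qwen MXFP8 topology, with illustrative values mirroring `topologies_config` in the script:
```python
from auto_round import AutoRound

# Values mirror the "qwen_mxfp8" entry in quantize.py; adjust for other topologies.
autoround = AutoRound(
    model="Qwen/Qwen3-235B-A22B",      # model id or local path
    scheme="MXFP8",                    # use "MXFP4" for the 4-bit variants
    iters=0,                           # 0 = no tuning iterations; >0 enables AutoRound tuning
    fp_layers="lm_head,mlp.gate",      # layers kept unquantized
)

# Export in the auto_round format consumed by the vLLM extension
# (quantize.py falls back to "llm_compressor" when --use_autoround_format is not set).
autoround.quantize_and_save(
    format="auto_round",
    output_dir="./qmodels/quantized_model_qwen_mxfp8",
)
```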

## Evaluation
Install the vLLM fork used for these examples:
```bash
git clone https://github.com/yiliu30/vllm-fork.git
cd vllm-fork
git checkout fused-moe-ar
VLLM_USE_PRECOMPILED=1 pip install --editable . -vvv
```

### Prompt Tests

Usage:
```bash
bash ./run_generate.sh -s [mxfp4|mxfp8] -tp [tensor_parallel_size] -m [model_path]
```

- MXFP8
```bash
bash ./run_generate.sh -s mxfp8 -tp 4 -m /path/to/qwen_mxfp8
bash ./run_generate.sh -s mxfp8 -tp 8 -m /path/to/ds_mxfp8
```
- MXFP4
```bash
bash ./run_generate.sh -s mxfp4 -tp 4 -m /path/to/qwen_mxfp4
bash ./run_generate.sh -s mxfp4 -tp 8 -m /path/to/ds_mxfp4
```
### Accuracy Evaluation

Usage:
```bash
bash run_evaluation.sh -m [model_path] -s [mxfp4|mxfp8] -t [task_name] -tp [tensor_parallel_size] -b [batch_size]
```
- MXFP8
```bash
bash run_evaluation.sh -s mxfp8 -t piqa,hellaswag,mmlu -tp 4 -b 512 -m /path/to/qwen_mxfp8
bash run_evaluation.sh -s mxfp8 -t gsm8k -tp 4 -b 256 -m /path/to/qwen_mxfp8
bash run_evaluation.sh -s mxfp8 -t piqa,hellaswag,mmlu -tp 8 -b 512 -m /path/to/ds_mxfp8
bash run_evaluation.sh -s mxfp8 -t gsm8k -tp 8 -b 256 -m /path/to/ds_mxfp8
```
- MXFP4
```bash
bash run_evaluation.sh -s mxfp4 -t piqa,hellaswag,mmlu -tp 4 -b 512 -m /path/to/qwen_mxfp4
bash run_evaluation.sh -s mxfp4 -t gsm8k -tp 4 -b 256 -m /path/to/qwen_mxfp4
bash run_evaluation.sh -s mxfp4 -t piqa,hellaswag,mmlu -tp 8 -b 512 -m /path/to/ds_mxfp4
bash run_evaluation.sh -s mxfp4 -t gsm8k -tp 8 -b 256 -m /path/to/ds_mxfp4
```
@@ -0,0 +1,73 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copied from https://github.com/vllm-project/vllm/

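# Apply the AutoRound vLLM extension when it is installed; the script still runs without it.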
try:
from auto_round_extension.vllm_ext import apply as apply_auto_round_extension
apply_auto_round_extension()
except ImportError:
print("auto_round_extension.vllm_ext not found, proceeding without auto-round extension.")

from vllm import LLM, EngineArgs
from vllm.utils.argparse_utils import FlexibleArgumentParser



def create_parser():
parser = FlexibleArgumentParser()
# Add engine args
EngineArgs.add_cli_args(parser)
parser.set_defaults(model="meta-llama/Llama-3.2-1B-Instruct")
# Add sampling params
sampling_group = parser.add_argument_group("Sampling parameters")
sampling_group.add_argument("--max-tokens", type=int)
sampling_group.add_argument("--temperature", type=float)
sampling_group.add_argument("--top-p", type=float)
sampling_group.add_argument("--top-k", type=int)

return parser


def main(args: dict):
# Pop arguments not used by LLM
max_tokens = args.pop("max_tokens")
temperature = args.pop("temperature")
top_p = args.pop("top_p")
top_k = args.pop("top_k")

# Create an LLM
llm = LLM(**args)

# Create a sampling params object
sampling_params = llm.get_default_sampling_params()
if max_tokens is not None:
sampling_params.max_tokens = max_tokens
if temperature is not None:
sampling_params.temperature = temperature
if top_p is not None:
sampling_params.top_p = top_p
if top_k is not None:
sampling_params.top_k = top_k

# Generate texts from the prompts. The output is a list of RequestOutput
# objects that contain the prompt, generated text, and other information.
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
print("-" * 50)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
print("-" * 50)


if __name__ == "__main__":
parser = create_parser()
args: dict = vars(parser.parse_args())
main(args)
@@ -0,0 +1,149 @@
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import transformers
import logging
from auto_round import AutoRound

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


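# Per-topology quantization settings: the scheme (MXFP8 or MXFP4), the layers kept
# in the original precision (fp_layers), and the number of AutoRound tuning iterations.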
topologies_config = {
"ds_mxfp8": {
"scheme": "MXFP8",
"fp_layers": "lm_head",
"iters": 0,
},
"ds_mxfp4": {
"scheme": "MXFP4",
"fp_layers": "lm_head,self_attn",
"iters": 0,
},
"qwen_mxfp8": {
"scheme": "MXFP8",
"fp_layers": "lm_head,mlp.gate",
"iters": 0,
},
"qwen_mxfp4": {
"scheme": "MXFP4",
"fp_layers": "lm_head,mlp.gate,self_attn",
"iters": 0, # TODO: set to 200 before merge
},
}


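# Quantize by calling the AutoRound API directly. Note: the CLI entry point below
# currently calls quant_model() (the Neural Compressor path), not this function.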
def quant_model_ar(args):
config = topologies_config[args.t]

logger.info(f"Using fp_layers: {config['fp_layers']}")
autoround = AutoRound(
model=args.model,
scheme=config["scheme"],
enable_torch_compile=args.enable_torch_compile,
iters=config["iters"],
fp_layers=config["fp_layers"],
)
logger.info(f"Save quantized model to {args.output_dir}")
format_type = "auto_round" if args.use_autoround_format else "llm_compressor"
autoround.quantize_and_save(
format=format_type,
output_dir=f"{args.output_dir}/quantized_model_{args.t}",
)


def get_model_and_tokenizer(model_name):
# Load model and tokenizer
fp32_model = AutoModelForCausalLM.from_pretrained(
model_name,
device_map="cpu",
trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(
model_name,
trust_remote_code=True,
)
return fp32_model, tokenizer


def quant_model(args):
from neural_compressor.torch.quantization import (
AutoRoundConfig,
convert,
prepare,
)

config = topologies_config[args.t]
export_format = "auto_round" if args.use_autoround_format else "llm_compressor"
output_dir = f"{args.output_dir}/quantized_model_{args.t}"
fp32_model, tokenizer = get_model_and_tokenizer(args.model)
quant_config = AutoRoundConfig(
tokenizer=tokenizer,
# nsamples=32,
# seqlen=10,
# iters=1,
# amp=False,
# scale_dtype="fp16",
scheme=config["scheme"],
enable_torch_compile=args.enable_torch_compile,
iters=config["iters"],
fp_layers=config["fp_layers"],
export_format=export_format,
output_dir=output_dir,
)

    # Run the quantization flow: prepare wraps the model, then convert performs
    # AutoRound quantization and exports the result to output_dir.
model = prepare(model=fp32_model, quant_config=quant_config)
inc_model = convert(model)
logger.info(f"Quantized model saved to {output_dir}")


if __name__ == "__main__":
import argparse

# Parse command-line arguments
parser = argparse.ArgumentParser(description="Select a quantization scheme.")
parser.add_argument(
"--model",
type=str,
help="Path to the pre-trained model or model identifier from Hugging Face Hub.",
)
parser.add_argument(
"-t",
type=str,
choices=topologies_config.keys(),
default="qwen_mxfp4",
help="Quantization scheme to use. Available options: " + ", ".join(topologies_config.keys()),
)

parser.add_argument(
"--enable_torch_compile",
action="store_true",
help="Enable torch compile for the model.",
)
parser.add_argument(
"--use_autoround_format",
action="store_true",
help="Use AutoRound format for saving the quantized model.",
)

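    # Note: --skip_attn and --iters are parsed but not used by quant_model();
    # fp_layers and iters come from the selected topologies_config entry.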
parser.add_argument(
"--skip_attn",
action="store_true",
help="Skip quantize attention layers.",
)
parser.add_argument(
"--iters",
type=int,
default=0,
help="Number of iterations for quantization.",
)
parser.add_argument(
"--output_dir",
type=str,
default="./",
help="Directory to save the quantized model.",
)

args = parser.parse_args()

quant_model(args)
@@ -0,0 +1,71 @@
#!/bin/bash
# Check if a model name is passed as an argument, otherwise use the default model path
if [ -z "$1" ]; then
    # Default model path; the commented alternatives are other quantized checkpoints.
    # model_path="Meta-Llama-3-8B-Instruct-W4A16-G128-AutoRound"
    # model_path="/storage/yiliu7/quantized_model_ds_mxfp4"
    # model_path="qmodels/quantized_model_ds_mxfp8"
    # model_path="./small-qmodels/quantized_model_qwen_mxfp8/"
    # model_path="/storage/yiliu7/quantized_model_qwen_mxfp4"
    # model_path="/storage/yiliu7/quantized_model_qwen_mxfp8"
    model_path="/storage/yiliu7/quantized_model_ds_mxfp8"
else
    model_path="$1"
fi

tp_size=8
model_name=$(basename ${model_path})
output_dir="${model_name}-tp${tp_size}-gsm8k-acc"
# task_name="gsm8k"
# batch_size=256
batch_size=512
task_name="piqa,hellaswag,mmlu"
# task_name="mmlu_high_school_biology"

echo "Evaluating model: ${model_path} on task: ${task_name}, output dir: ${output_dir}"
# VLLM_ATTENTION_BACKEND=TRITON_ATTN \
mkdir -p ${output_dir}
# VLLM_ATTENTION_BACKEND=FLASHINFER \


# MXFP4 evaluation (the block below is commented out; uncomment it to evaluate MXFP4 models)
# /storage/yiliu7/quantized_model_qwen_mxfp4 4x200
# VLLM_AR_MXFP4_MODULAR_MOE=1 \
# VLLM_MXFP4_PRE_UNPACK_TO_FP8=1 \
# VLLM_ENABLE_STATIC_MOE=0 \
# VLLM_MXFP4_PRE_UNPACK_WEIGHTS=0 \
# VLLM_USE_DEEP_GEMM=0 \
# VLLM_ENABLE_AR_EXT=1 \
# VLLM_ENABLE_V1_MULTIPROCESSING=1 \
# lm_eval --model vllm \
# --model_args "pretrained=${model_path},tensor_parallel_size=${tp_size},max_model_len=8192,max_num_batched_tokens=32768,max_num_seqs=128,add_bos_token=True,gpu_memory_utilization=0.8,dtype=bfloat16,max_gen_toks=2048,enable_prefix_caching=False,enable_expert_parallel=True" \
# --tasks $task_name \
# --batch_size 16 \
# --limit 256 \
# --log_samples \
# --seed 42 \
# --output_path ${output_dir} \
# --show_config 2>&1 | tee ${output_dir}/log.txt

# MXFP8 evaluation
# !!! Set the knobs below exactly as shown for MXFP8 model evaluation !!!
# /storage/yiliu7/quantized_model_qwen_mxfp8 4x200
VLLM_ENABLE_AR_EXT=1 \
VLLM_AR_MXFP4_MODULAR_MOE=0 \
VLLM_MXFP4_PRE_UNPACK_WEIGHTS=0 \
VLLM_MXFP4_PRE_UNPACK_TO_FP8=0 \
VLLM_ENABLE_STATIC_MOE=0 \
VLLM_USE_DEEP_GEMM=0 \
VLLM_ENABLE_V1_MULTIPROCESSING=1 \
lm_eval --model vllm \
--model_args "pretrained=${model_path},tensor_parallel_size=${tp_size},max_model_len=8192,max_num_batched_tokens=32768,max_num_seqs=128,add_bos_token=True,gpu_memory_utilization=0.8,dtype=bfloat16,max_gen_toks=2048,enable_prefix_caching=False" \
--tasks $task_name \
--batch_size $batch_size \
--log_samples \
--seed 42 \
--output_path ${output_dir} \
--show_config 2>&1 | tee ${output_dir}/log.txt
