diff --git a/examples/autoround/quantization_w4a4_fp4/README.md b/examples/autoround/quantization_w4a4_fp4/README.md old mode 100755 new mode 100644 index d0c75adc4f..5aabf9906b --- a/examples/autoround/quantization_w4a4_fp4/README.md +++ b/examples/autoround/quantization_w4a4_fp4/README.md @@ -16,9 +16,7 @@ pip install -e . ## Quickstart -The example includes end-to-end scripts for applying the AutoRound quantization algorithm. - -### Llama 3.1 Example +The example includes an end-to-end script for applying the AutoRound quantization algorithm. ```bash python3 llama3.1_example.py @@ -26,7 +24,7 @@ python3 llama3.1_example.py The resulting model `Meta-Llama-3.1-8B-Instruct-NVFP4-AutoRound` is ready to be loaded into vLLM. -#### Evaluate Accuracy +### Evaluate Accuracy With the model created, we can now load and run in vLLM (after installing). @@ -48,26 +46,26 @@ lm_eval --model vllm \ --batch_size 'auto' ``` -##### meta-llama/Meta-Llama-3.1-8B-Instruct +#### meta-llama/Meta-Llama-3.1-8B-Instruct |Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr| |-----|------:|----------------|-----:|-----------|---|-----:|---|-----:| |gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.7710|± |0.0116| | | |strict-match | 5|exact_match|↑ |0.7043|± |0.0126| -##### Meta-Llama-3.1-8B-Instruct-NVFP4 (QuantizationModifier) +#### Meta-Llama-3.1-8B-Instruct-NVFP4 (QuantizationModifier) |Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr| |-----|------:|----------------|-----:|-----------|---|-----:|---|-----:| |gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.7248|± |0.0123| | | |strict-match | 5|exact_match|↑ |0.6611|± |0.0130| -##### Meta-Llama-3.1-8B-Instruct-NVFP4-AutoRound (AutoRoundModifier, iters=0) +#### Meta-Llama-3.1-8B-Instruct-NVFP4-AutoRound (AutoRoundModifier, iters=0) |Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr| |-----|------:|----------------|-----:|-----------|---|-----:|---|-----:| |gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.7362|± |0.0121| | | |strict-match | 5|exact_match|↑ |0.6702|± |0.0129| -##### Meta-Llama-3.1-8B-Instruct-NVFP4-AutoRound (AutoRoundModifier, iters=200) +#### Meta-Llama-3.1-8B-Instruct-NVFP4-AutoRound (AutoRoundModifier, iters=200) |Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr| |-----|------:|----------------|-----:|-----------|---|-----:|---|-----:| |gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.7210|± |0.0124| @@ -75,41 +73,6 @@ lm_eval --model vllm \ > Note: quantized model accuracy may vary slightly due to nondeterminism. -### Qwen3-VL Example - -```bash -python3 qwen3_vl_example.py -``` - -The resulting model `Qwen3-VL-8B-Instruct-NVFP4-AutoRound` is ready to be loaded into vLLM. - -#### Evaluate Accuracy - -Run the following to test accuracy on GSM-8K: - -```bash -lm_eval --model vllm-vlm \ - --model_args pretrained="./Qwen3-VL-8B-Instruct-NVFP4-AutoRound",add_bos_token=true \ - --tasks gsm8k \ - --num_fewshot 5 \ - --batch_size 'auto' -``` - -##### Qwen3-VL-8B-Instruct (Baseline) -|Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr| -|-----|------:|----------------|-----:|-----------|---|-----:|---|-----:| -|gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.8628|± |0.0095| -| | |strict-match | 5|exact_match|↑ |0.8453|± |0.0100| - - -##### Qwen3-VL-8B-Instruct-NVFP4-AutoRound (AutoRoundModifier, iters=200) -|Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr| -|-----|------:|----------------|-----:|-----------|---|-----:|---|-----:| -|gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.8415|± |0.0101| -| | |strict-match | 5|exact_match|↑ |0.8408|± |0.0101| - -> Note: quantized model accuracy may vary slightly due to nondeterminism. - ### Questions or Feature Request? Please open up an issue on [vllm-project/llm-compressor](https://github.com/vllm-project/llm-compressor) or [intel/auto-round](https://github.com/intel/auto-round). diff --git a/examples/autoround/quantization_w4a4_fp4/qwen3_vl_example.py b/examples/autoround/quantization_w4a4_fp4/qwen3_vl_example.py deleted file mode 100755 index d98efaaad2..0000000000 --- a/examples/autoround/quantization_w4a4_fp4/qwen3_vl_example.py +++ /dev/null @@ -1,60 +0,0 @@ -from auto_round.calib_dataset import get_dataset -from transformers import AutoProcessor, Qwen3VLForConditionalGeneration - -from llmcompressor import oneshot -from llmcompressor.modifiers.autoround import AutoRoundModifier -from llmcompressor.utils import dispatch_for_generation - -# Load model. -MODEL_ID = "Qwen/Qwen3-VL-8B-Instruct" -model = Qwen3VLForConditionalGeneration.from_pretrained(MODEL_ID, torch_dtype="auto") -processor = AutoProcessor.from_pretrained(MODEL_ID) -tokenizer = processor.tokenizer - -# Select calibration dataset. -NUM_CALIBRATION_SAMPLES = 128 -MAX_SEQUENCE_LENGTH = 2048 -# Get aligned calibration dataset. - -ds = get_dataset( - tokenizer=tokenizer, - seqlen=MAX_SEQUENCE_LENGTH, - nsamples=NUM_CALIBRATION_SAMPLES, -) - - -# Configure the quantization algorithm to run. -# * quantize the weights to 4 bit with AutoRound with a group size 128 -recipe = AutoRoundModifier( - targets="Linear", - scheme="NVFP4", - ignore=["re:.*lm_head", "re:.*visual.*"], - iters=200 -) - -# Apply algorithms. -oneshot( - model=model, - dataset=ds, - recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, - # disable shuffling to get slightly better mmlu score - shuffle_calibration_samples=False, -) - -print("\n\n") -print("========== SAMPLE GENERATION ==============") -dispatch_for_generation(model) -input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to( - model.device -) -output = model.generate(input_ids, max_new_tokens=100) -print(tokenizer.decode(output[0])) -print("==========================================\n\n") - - -# Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-NVFP4-AutoRound" -model.save_pretrained(SAVE_DIR, save_compressed=True) -processor.save_pretrained(SAVE_DIR)