From 830b53d13adc633a48c498b5c1c6078187c94a8f Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Wed, 18 Mar 2026 12:09:20 -0400 Subject: [PATCH 1/3] move examples Signed-off-by: Kyle Sayers --- examples/quantization_attention/README.md | 1 + .../quantization_attention}/llama3_attention.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 examples/quantization_attention/README.md rename {experimental/attention => examples/quantization_attention}/llama3_attention.py (100%) diff --git a/examples/quantization_attention/README.md b/examples/quantization_attention/README.md new file mode 100644 index 0000000000..2fcfda5455 --- /dev/null +++ b/examples/quantization_attention/README.md @@ -0,0 +1 @@ +TODO, eval from eldar diff --git a/experimental/attention/llama3_attention.py b/examples/quantization_attention/llama3_attention.py similarity index 100% rename from experimental/attention/llama3_attention.py rename to examples/quantization_attention/llama3_attention.py index b8fe2a5d77..c518c1d2cf 100644 --- a/experimental/attention/llama3_attention.py +++ b/examples/quantization_attention/llama3_attention.py @@ -1,10 +1,10 @@ +from compressed_tensors.offload import dispatch_model from compressed_tensors.quantization import QuantizationArgs, QuantizationScheme from datasets import load_dataset from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier -from compressed_tensors.offload import dispatch_model # Select model and load it. 
model_id = "meta-llama/Meta-Llama-3-8B-Instruct" From 026fd1499b25b8b3062a9ce8ae2159b48ea8ace2 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Wed, 18 Mar 2026 16:29:43 -0400 Subject: [PATCH 2/3] update readmes Signed-off-by: Kyle Sayers --- examples/quantization_attention/README.md | 22 +++++++++++++++++++++- experimental/attention/README.md | 20 +------------------- 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/examples/quantization_attention/README.md b/examples/quantization_attention/README.md index 2fcfda5455..c32cbc6227 100644 --- a/examples/quantization_attention/README.md +++ b/examples/quantization_attention/README.md @@ -1 +1,21 @@ -TODO, eval from eldar +# Attention Quantization in LLM Compressor # +LLM Compressor supports applying static attention quantization to models + +## FP8 Attention Example ## +For an example applying attention quantization, see [llama3_attention.py](/experimental/attention/llama3_attention.py). + +```python +recipe = QuantizationModifier( + config_groups={ + "attention": QuantizationScheme( + targets=["LlamaAttention"], + input_activations=QuantizationArgs( + num_bits=8, type="float", strategy="attn_head" + ), + ) + } +) +``` + +Accuracy should be almost identical to the base model for FP8 attention. +Note that attention quantization also implicitly applies kv cache quantization with the same quantization arguments. diff --git a/experimental/attention/README.md b/experimental/attention/README.md index ca96326375..3e064a79b1 100644 --- a/experimental/attention/README.md +++ b/experimental/attention/README.md @@ -1,23 +1,5 @@ # Attention Quantization in LLM Compressor # -LLM Compressor supports applying static attention quantization to models. Please note that attention quantization support in vLLM is still ongoing and is not fully supported as of this writing. - -## FP8 Attention Example ## -For an example applying attention quantization, see [llama3_attention.py](/experimental/attention/llama3_attention.py). 
- -```python -recipe = QuantizationModifier( - config_groups={ - "attention": QuantizationScheme( - targets=["LlamaAttention"], - input_activations=QuantizationArgs( - num_bits=8, type="float", strategy="attn_head" - ), - ) - } -) -``` - -Note that attention quantization also implicitly applies kv cache quantization with the same quantization arguments. +LLM Compressor supports applying static attention quantization to models. Please note that vLLM support for NVFP4 attention quantization and R3 is still ongoing, and these features are not fully supported as of this writing. ## NVFP4 Attention + R3 Example ## Attention quantization can be improved using the R3 transform, as described by [SpinQuant](https://arxiv.org/abs/2405.16406). This transform reduces the presence of outliers in the attention activation distribution, thereby improving accurcy recovery. From 69e070bd6ab512da9994fcff996420722d12862e Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 19 Mar 2026 10:39:20 -0400 Subject: [PATCH 3/3] Update examples/quantization_attention/README.md Co-authored-by: Brian Dellabetta Signed-off-by: Kyle Sayers --- examples/quantization_attention/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/quantization_attention/README.md b/examples/quantization_attention/README.md index c32cbc6227..e793916f28 100644 --- a/examples/quantization_attention/README.md +++ b/examples/quantization_attention/README.md @@ -2,7 +2,7 @@ LLM Compressor supports applying static attention quantization to models ## FP8 Attention Example ## -For an example applying attention quantization, see [llama3_attention.py](/experimental/attention/llama3_attention.py). +For an example applying attention quantization, see [llama3_attention.py](/examples/quantization_attention/llama3_attention.py). ```python recipe = QuantizationModifier(