From 830b53d13adc633a48c498b5c1c6078187c94a8f Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Wed, 18 Mar 2026 12:09:20 -0400 Subject: [PATCH 1/3] move examples Signed-off-by: Kyle Sayers --- examples/quantization_attention/README.md | 1 + .../quantization_attention}/llama3_attention.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 examples/quantization_attention/README.md rename {experimental/attention => examples/quantization_attention}/llama3_attention.py (100%) diff --git a/examples/quantization_attention/README.md b/examples/quantization_attention/README.md new file mode 100644 index 0000000000..2fcfda5455 --- /dev/null +++ b/examples/quantization_attention/README.md @@ -0,0 +1 @@ +TODO, eval from eldar diff --git a/experimental/attention/llama3_attention.py b/examples/quantization_attention/llama3_attention.py similarity index 100% rename from experimental/attention/llama3_attention.py rename to examples/quantization_attention/llama3_attention.py index b8fe2a5d77..c518c1d2cf 100644 --- a/experimental/attention/llama3_attention.py +++ b/examples/quantization_attention/llama3_attention.py @@ -1,10 +1,10 @@ +from compressed_tensors.offload import dispatch_model from compressed_tensors.quantization import QuantizationArgs, QuantizationScheme from datasets import load_dataset from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier -from compressed_tensors.offload import dispatch_model # Select model and load it. 
model_id = "meta-llama/Meta-Llama-3-8B-Instruct" From 026fd1499b25b8b3062a9ce8ae2159b48ea8ace2 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Wed, 18 Mar 2026 16:29:43 -0400 Subject: [PATCH 2/3] update readmes Signed-off-by: Kyle Sayers --- examples/quantization_attention/README.md | 22 +++++++++++++++++++++- experimental/attention/README.md | 20 +------------------- 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/examples/quantization_attention/README.md b/examples/quantization_attention/README.md index 2fcfda5455..c32cbc6227 100644 --- a/examples/quantization_attention/README.md +++ b/examples/quantization_attention/README.md @@ -1 +1,21 @@ -TODO, eval from eldar +# Attention Quantization in LLM Compressor # +LLM Compressor supports applying static attention quantization to models + +## FP8 Attention Example ## +For an example applying attention quantization, see [llama3_attention.py](/experimental/attention/llama3_attention.py). + +```python +recipe = QuantizationModifier( + config_groups={ + "attention": QuantizationScheme( + targets=["LlamaAttention"], + input_activations=QuantizationArgs( + num_bits=8, type="float", strategy="attn_head" + ), + ) + } +) +``` + +Accuracy should be almost identical to the base model for FP8 attention. +Note that attention quantization also implicitly applies kv cache quantization with the same quantization arguments. diff --git a/experimental/attention/README.md b/experimental/attention/README.md index ca96326375..3e064a79b1 100644 --- a/experimental/attention/README.md +++ b/experimental/attention/README.md @@ -1,23 +1,5 @@ # Attention Quantization in LLM Compressor # -LLM Compressor supports applying static attention quantization to models. Please note that attention quantization support in vLLM is still ongoing and is not fully supported as of this writing. - -## FP8 Attention Example ## -For an example applying attention quantization, see [llama3_attention.py](/experimental/attention/llama3_attention.py). 
- -```python -recipe = QuantizationModifier( - config_groups={ - "attention": QuantizationScheme( - targets=["LlamaAttention"], - input_activations=QuantizationArgs( - num_bits=8, type="float", strategy="attn_head" - ), - ) - } -) -``` - -Note that attention quantization also implicitly applies kv cache quantization with the same quantization arguments. +LLM Compressor supports applying static attention quantization to models. Please note that vLLM support for NVFP4 attention quantization and R3 is still ongoing, and these features are not fully supported as of this writing. ## NVFP4 Attention + R3 Example ## Attention quantization can be improved using the R3 transform, as described by [SpinQuant](https://arxiv.org/abs/2405.16406). This transform reduces the presence of outliers in the attention activation distribution, thereby improving accurcy recovery. From 69e070bd6ab512da9994fcff996420722d12862e Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 19 Mar 2026 10:39:20 -0400 Subject: [PATCH 3/3] Update examples/quantization_attention/README.md Co-authored-by: Brian Dellabetta Signed-off-by: Kyle Sayers --- examples/quantization_attention/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/quantization_attention/README.md b/examples/quantization_attention/README.md index c32cbc6227..e793916f28 100644 --- a/examples/quantization_attention/README.md +++ b/examples/quantization_attention/README.md @@ -2,7 +2,7 @@ LLM Compressor supports applying static attention quantization to models ## FP8 Attention Example ## -For an example applying attention quantization, see [llama3_attention.py](/experimental/attention/llama3_attention.py). +For an example applying attention quantization, see [llama3_attention.py](/examples/quantization_attention/llama3_attention.py). ```python recipe = QuantizationModifier(