From e1d1915bfa8d4c27ffb21a8a567a57868daf3c7e Mon Sep 17 00:00:00 2001
From: wangwenming <295323587@qq.com>
Date: Tue, 2 Dec 2025 19:03:53 +0800
Subject: [PATCH 1/2] feat: add importance-aware mixed-precision quantization

---
 .../quantization_importance_aware.py | 94 +++++++++++++++++++
 1 file changed, 94 insertions(+)
 create mode 100644 examples/quantization_non_uniform/quantization_importance_aware.py

diff --git a/examples/quantization_non_uniform/quantization_importance_aware.py b/examples/quantization_non_uniform/quantization_importance_aware.py
new file mode 100644
index 0000000000..69b7e1dea2
--- /dev/null
+++ b/examples/quantization_non_uniform/quantization_importance_aware.py
@@ -0,0 +1,94 @@
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from llmcompressor import oneshot
+from llmcompressor.utils import dispatch_for_generation
+
+MODEL_ID = "Qwen/Qwen3-32B"
+
+# Load model.
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+
+# Configure the importance-aware mixed-precision quantization recipe.
+# In this case, we:
+# * Keep the first 3 layers (0,1,2) and last 3 layers (62,63,51) in full precision
+# (not quantized) due to their high sensitivity.
+# * Exclude lm_head from quantization to preserve output quality.
+# * Quantize weights of specific middle layers' self-attention and MLP blocks to fp4:
+# - Layers: 15-24, 31, 46-48, 50, 56-60
+# - Modules: k_proj, o_proj, q_proj, v_proj, down_proj, gate_proj, up_proj
+# - Scheme: fp4, symmetric, per-group (group_size=16), static (PTQ)
+# * Quantize weights of other intermediate layers to fp8:
+# - Layers: 3-14, 25-30, 32-36, 37-55, 61 (excluding already covered or ignored)
+# - Same modules as above
+# - Scheme: fp8, symmetric, per-channel, static (PTQ)
+# * Additionally, dynamically quantize input activations for fp8-weighted layers:
+# - Activations quantized to fp8, symmetric, per-token, dynamic range
+
+# Define layer groups for readability and line-length compliance
+
+fp4_group = "15|16|17|18|19|20|21|22|23|24|31|46|47|48|50|56|57|58|59|60"
+fp8_group = (
+    "7|37|11|35|3|4|5|6|8|9|10|12|13|14|25|26|27|28|29|30|"
+    "32|33|34|36|38|39|40|41|42|43|44|45|49|52|53|54|55|61"
+)
+
+recipe = f"""
+quant_stage:
+  quant_modifiers:
+    QuantizationModifier:
+      ignore:
+        - "lm_head"
+        - 're:.*layers\\.0\\..*'
+        - 're:.*layers\\.1\\..*'
+        - 're:.*layers\\.2\\..*'
+        - 're:.*layers\\.51\\..*'
+        - 're:.*layers\\.62\\..*'
+        - 're:.*layers\\.63\\..*'
+      config_groups:
+        group_0:
+          weights:
+            num_bits: 4
+            type: float
+            strategy: tensor_group
+            dynamic: false
+            symmetric: true
+            group_size: 16
+          targets:
+            - 're:.*layers\\.({fp4_group})\\.self_attn\\.[kqov]_proj'
+            - 're:.*layers\\.({fp4_group})\\.mlp\\.(gate|up|down)_proj'
+        group_1:
+          weights:
+            num_bits: 8
+            type: float
+            strategy: channel
+            dynamic: false
+            symmetric: true
+          input_activations:
+            num_bits: 8
+            type: float
+            strategy: token
+            dynamic: true
+            symmetric: true
+          targets:
+            - 're:.*layers\\.({fp8_group})\\.self_attn\\.[kqov]_proj'
+            - 're:.*layers\\.({fp8_group})\\.mlp\\.(gate|up|down)_proj'
+"""
+
+# Apply quantization.
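+# Note: no calibration dataset is passed to `oneshot` here. Both weight schemes are
+# static, with scales derived directly from the weights themselves, and the fp8
+# input activations use dynamic per-token scales computed at runtime, so this
+# recipe does not require calibration data.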
+oneshot(model=model, recipe=recipe)
+
+print("\n\n")
+print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
+input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
+    model.device
+)
+output = model.generate(input_ids, max_new_tokens=100)
+print(tokenizer.decode(output[0]))
+print("==========================================\n\n")
+
+# Save to disk in compressed-tensors format.
+SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-Importance-Aware-Mix-Quantization"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)

From 1a1a55e11270aeabf5c502db4b4e2a416bdcabd3 Mon Sep 17 00:00:00 2001
From: wangwenming <295323587@qq.com>
Date: Tue, 2 Dec 2025 19:52:39 +0800
Subject: [PATCH 2/2] feat: add importance-aware mixed-precision quantization

---
 .../quantization_importance_aware.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/quantization_non_uniform/quantization_importance_aware.py b/examples/quantization_non_uniform/quantization_importance_aware.py
index 69b7e1dea2..f415aa6c1b 100644
--- a/examples/quantization_non_uniform/quantization_importance_aware.py
+++ b/examples/quantization_non_uniform/quantization_importance_aware.py
@@ -11,7 +11,7 @@
 
 # Configure the importance-aware mixed-precision quantization recipe.
 # In this case, we:
-# * Keep the first 3 layers (0,1,2) and last 3 layers (62,63,51) in full precision
+# * Keep the first 3 layers (0,1,2) and last 3 layers (51,62,63) in full precision
 # (not quantized) due to their high sensitivity.
 # * Exclude lm_head from quantization to preserve output quality.
 # * Quantize weights of specific middle layers' self-attention and MLP blocks to fp4:
@@ -19,7 +19,7 @@
 # - Modules: k_proj, o_proj, q_proj, v_proj, down_proj, gate_proj, up_proj
 # - Scheme: fp4, symmetric, per-group (group_size=16), static (PTQ)
 # * Quantize weights of other intermediate layers to fp8:
-# - Layers: 3-14, 25-30, 32-36, 37-55, 61 (excluding already covered or ignored)
+# - Layers: 3-14, 25-30, 32-45, 49, 52-55, 61
 # - Same modules as above
 # - Scheme: fp8, symmetric, per-channel, static (PTQ)
 # * Additionally, dynamically quantize input activations for fp8-weighted layers:
@@ -29,8 +29,8 @@
 
 fp4_group = "15|16|17|18|19|20|21|22|23|24|31|46|47|48|50|56|57|58|59|60"
 fp8_group = (
-    "7|37|11|35|3|4|5|6|8|9|10|12|13|14|25|26|27|28|29|30|"
-    "32|33|34|36|38|39|40|41|42|43|44|45|49|52|53|54|55|61"
+    "3|4|5|6|7|8|9|10|11|12|13|14|25|26|27|28|29|30|"
+    "32|33|34|35|36|37|38|39|40|41|42|43|44|45|49|52|53|54|55|61"
 )
 
 recipe = f"""
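
A minimal sketch of how the compressed checkpoint written by this example could be
loaded back for inference. It assumes a vLLM installation whose compressed-tensors
support covers the fp8/fp4 schemes used above, and it reuses the directory name
that SAVE_DIR resolves to for Qwen/Qwen3-32B; adjust both to your environment.

    from vllm import LLM, SamplingParams

    # Directory produced by model.save_pretrained(..., save_compressed=True).
    llm = LLM(model="Qwen3-32B-Importance-Aware-Mix-Quantization")
    sampling = SamplingParams(temperature=0.0, max_tokens=64)
    outputs = llm.generate(["Hello my name is"], sampling)
    print(outputs[0].outputs[0].text)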