Adding q_norm, k_norm support for quantized models (microsoft#1483)
This PR adds support for q_norm and k_norm layers in quantized models
within the OGA framework.
Specifically, it introduces the following enhancements to
**quantized_model.py**:
- Initializes `q_norm` and `k_norm` as Tensor modules within the
`QuantizedAttention` and `QuantizedDecoder` classes.
- Maps the corresponding weights and biases for `q_norm` and `k_norm` to
the initialized tensor modules during model loading.
This enables accurate handling of models that include `q_norm` and
`k_norm` as part of their quantized attention mechanisms, improving
compatibility with newer quantized LLMs.
**Changes Made:**
- Added initialization of `q_norm` and `k_norm` as `Tensor` modules in:
- `QuantizedAttention` class
- `QuantizedDecoder` class
- Mapped corresponding weights and biases from the model to these tensor
modules during model loading
- Ensured consistency with the existing quantized attention
initialization flow (see the sketch below)
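
For reference, here is a minimal sketch of the initialization side of the change. `TensorModule` and the bare constructors are placeholders invented for illustration; only the `q_norm`/`k_norm` attribute names and the `QuantizedAttention`/`QuantizedDecoder` class names come from this PR description.

```python
class TensorModule:
    """Hypothetical holder for a plain (non-quantized) weight/bias pair."""
    def __init__(self):
        self.weight = None
        self.bias = None


class QuantizedAttention:
    def __init__(self):
        # Existing projection modules (q_proj, k_proj, v_proj, o_proj) omitted.
        # New in this PR: query/key norm layers used by models such as Qwen3.
        self.q_norm = TensorModule()
        self.k_norm = TensorModule()


class QuantizedDecoder:
    def __init__(self):
        # Existing layer norms and sub-modules omitted.
        self.self_attn = QuantizedAttention()
        self.q_norm = TensorModule()
        self.k_norm = TensorModule()
```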
**Reviewer Notes:**
- Please verify:
- The initialization logic aligns with the handling of other norm layers
(e.g., `qkv_norm`)
- No side effects are introduced for models that do not contain `q_norm`
or `k_norm`
- Tested locally with quantized Qwen3 models containing
`q_norm`/`k_norm`; additional validation with other architectures is
welcome
---------
Co-authored-by: Sumedha Atreysa <[email protected]>
Co-authored-by: kunal-vaishnavi <[email protected]>
The relevant comments in **quantized_model.py** (the `q_norm`/`k_norm` graph-order note is the line added by this PR):

```python
# Map weights and biases of norm, attention, and feed-forward network
# Graph order is input_layernorm --> q_proj/k_proj/v_proj --> o_proj --> post_attention_layernorm --> gate_proj/up_proj --> down_proj
# If model uses q_norm and k_norm, graph order is input_layernorm --> q_norm/q_proj/k_norm/k_proj/v_proj --> o_proj --> post_attention_layernorm --> gate_proj/up_proj --> down_proj
```
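
A hedged sketch of how the weight/bias mapping could key off these names during loading follows. The suffix-based dispatch and the helper name `map_norm_tensors` are assumptions for illustration, not the actual loader code.

```python
def map_norm_tensors(attn, name, tensor):
    """Route q_norm/k_norm weights and biases to the attention module.

    Illustrative only: the real loader's tensor-name scheme and module
    layout in quantized_model.py may differ.
    """
    if name.endswith("q_norm.weight"):
        attn.q_norm.weight = tensor
    elif name.endswith("q_norm.bias"):
        attn.q_norm.bias = tensor
    elif name.endswith("k_norm.weight"):
        attn.k_norm.weight = tensor
    elif name.endswith("k_norm.bias"):
        attn.k_norm.bias = tensor
    else:
        return False  # Not a q_norm/k_norm tensor; fall through to existing mapping.
    return True
```

Because the dispatch only fires on `q_norm`/`k_norm` tensor names, models without these layers fall through to the existing mapping untouched, which is the no-side-effects property called out in the reviewer notes.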