
Commit 0432fe2

sxu authored and facebook-github-bot committed
Static attention: support overriding RMSNorm class
Summary: The CoreML backend should not use the default torch.nn.RMSNorm because it leads to worse precision, whereas the QNN backend should, since it lowers to a single, more efficient operator. Support overriding the normalization layer class. Differential Revision: D78926603
1 parent 7e82d00 commit 0432fe2

File tree

1 file changed: +9, -3 lines


examples/models/llama/static_attention.py

Lines changed: 9 additions & 3 deletions
@@ -840,7 +840,9 @@ def _forward_mha(
 
         return y.transpose(1, 2).contiguous().view(bsz, seq_len, -1), out_cache_state
 
-    def load_weights_from_attention_mha(self, other: AttentionMHA):
+    def load_weights_from_attention_mha(
+        self, other: AttentionMHA, rms_norm_class=torch.nn.RMSNorm
+    ):
         if self.split_mha:
             for i in range(self.n_heads):
                 self.wqs[i].weight.data.copy_(
@@ -864,9 +866,13 @@ def load_weights_from_attention_mha(self, other: AttentionMHA):
         if other.use_qk_norm:
             self.use_qk_norm = True
             self.qk_norm_before_rope = other.qk_norm_before_rope
-            self.q_norm = torch.nn.RMSNorm(other.q_norm_fn.dim, other.q_norm_fn.eps)
+            self.q_norm = rms_norm_class(other.q_norm_fn.dim, other.q_norm_fn.eps).to(
+                other.q_norm_fn.weight.dtype
+            )
             self.q_norm.load_state_dict(other.q_norm_fn.state_dict())
-            self.k_norm = torch.nn.RMSNorm(other.k_norm_fn.dim, other.k_norm_fn.eps)
+            self.k_norm = rms_norm_class(other.k_norm_fn.dim, other.k_norm_fn.eps).to(
+                other.k_norm_fn.weight.dtype
+            )
             self.k_norm.load_state_dict(other.k_norm_fn.state_dict())
 
     def adopt_hf_rope(self):
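
For context, a backend that wants to avoid the fused torch.nn.RMSNorm can pass its own module class through the new rms_norm_class argument. The sketch below is illustrative only: DecomposedRMSNorm and the surrounding names are hypothetical and not part of this commit. The only constraints taken from the diff are that the class is constructed as rms_norm_class(dim, eps) and then populated via load_state_dict() from the original q/k norm, so a drop-in class needs the same constructor signature and a matching weight parameter.

import torch


class DecomposedRMSNorm(torch.nn.Module):
    """Hypothetical RMSNorm variant built from basic elementwise ops.

    A backend that lowers the fused torch.nn.RMSNorm poorly (e.g. for
    precision reasons) could substitute a class like this one. It keeps the
    same (dim, eps) constructor and a `weight` parameter so that
    load_state_dict() from the original torch.nn.RMSNorm still works.
    """

    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = torch.nn.Parameter(torch.ones(dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Normalize in fp32 and cast back, trading the fused op for
        # explicit control over where the reduction happens.
        x_fp32 = x.float()
        rms = torch.rsqrt(x_fp32.pow(2).mean(-1, keepdim=True) + self.eps)
        return (x_fp32 * rms).to(x.dtype) * self.weight


# Usage with the new keyword argument (static_attn and attn_mha are assumed
# to be an existing StaticAttention / AttentionMHA pair from the Llama example):
#
#     static_attn.load_weights_from_attention_mha(
#         attn_mha, rms_norm_class=DecomposedRMSNorm
#     )
#
# Omitting rms_norm_class keeps the previous behavior (torch.nn.RMSNorm).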
