Commit 10ad5df

sxu authored and facebook-github-bot committed
Fix static attention RoPE implementation
Differential Revision: D76951243
1 parent a1dec07 commit 10ad5df

File tree: 2 files changed (+9, -1 lines)

examples/models/llama/static_attention.py

Lines changed: 1 addition & 1 deletion
@@ -352,7 +352,7 @@ def forward(
         x_r, x_i = x[..., ::2], x[..., 1::2]
         x_out_r = x_r * freqs_cos - x_i * freqs_sin
         x_out_i = x_r * freqs_sin + x_i * freqs_cos
-        x_out = torch.cat([x_out_r, x_out_i], dim=-1)
+        x_out = torch.stack([x_out_r, x_out_i], dim=-1).flatten(2)
         return x_out


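The one-line fix changes how the rotated halves are recombined. Since forward() reads the real and imaginary components from the even and odd positions of the last dimension (x[..., ::2] and x[..., 1::2]), the output has to interleave them again; torch.cat instead placed all real parts before all imaginary parts. A minimal sketch (not part of the commit; shapes are illustrative) using an identity rotation, where the output should reproduce the input exactly:

    import torch

    # (batch, seq, head_dim) with interleaved (real, imag) pairs along the last dim
    x = torch.arange(12, dtype=torch.float32).reshape(1, 3, 4)
    x_r, x_i = x[..., ::2], x[..., 1::2]

    # Identity rotation (cos = 1, sin = 0), so x_out_r == x_r and x_out_i == x_i.
    cat_out = torch.cat([x_r, x_i], dim=-1)                 # [r0, r1, i0, i1]: pairs split apart
    stack_out = torch.stack([x_r, x_i], dim=-1).flatten(2)  # [r0, i0, r1, i1]: pairs re-interleaved

    print(torch.equal(cat_out, x))    # False: layout no longer matches the input
    print(torch.equal(stack_out, x))  # True: matches, which is what the fix restores
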
examples/models/llama/tests/test_static_attention.py

Lines changed: 8 additions & 0 deletions
@@ -30,6 +30,14 @@ def test(use_qk_norm, use_conv2d):
             rope = Rope(config)
             attn_mha = AttentionMHA(config, layer_id, rope).eval()
             static_attn = StaticAttention(config, layer_id, rope).eval()
+            if use_qk_norm:
+                with torch.no_grad():
+                    attn_mha.q_norm_fn.weight.copy_(
+                        torch.rand(config.head_dim) * 0.2 + 0.9
+                    )
+                    attn_mha.k_norm_fn.weight.copy_(
+                        torch.rand(config.head_dim) * 0.2 + 0.9
+                    )
             static_attn.load_weights_from_attention_mha(attn_mha)
             if use_conv2d:
                 static_attn.linear_to_conv2d()
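
The test change perturbs the q/k RMSNorm weights away from their default all-ones initialization before they are copied into StaticAttention. The likely motivation (not stated in the commit message): with default weights, a path that fails to transfer or apply the norm weights would still produce identical outputs, so the qk_norm comparison could pass vacuously. A rough sketch of that effect, using a hand-rolled RMS norm as a stand-in for the module used in the model:

    import torch

    def rms_norm(x, weight, eps=1e-6):
        # Simplified stand-in for the model's q/k norm; not the actual implementation.
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps) * weight

    x = torch.randn(2, 8)
    default_w = torch.ones(8)                # default initialization
    perturbed_w = torch.rand(8) * 0.2 + 0.9  # the recipe used in the test

    # A buggy path that drops the learned weight is equivalent to using all-ones:
    buggy = rms_norm(x, torch.ones(8))
    print(torch.allclose(buggy, rms_norm(x, default_w)))    # True: bug invisible with default weights
    print(torch.allclose(buggy, rms_norm(x, perturbed_w)))  # False: perturbed weights expose it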
