@@ -11,22 +11,28 @@ class L2Wrap(torch.autograd.Function):
     This version is memory-optimized by not storing the full logits tensor.
     """
     @staticmethod
-    def forward(ctx, loss, logits):
-        ctx.save_for_backward(logits)
+    def forward(ctx, loss, logits, l2_penalty_factor=1e-4):
+        """
+        Forward pass for the L2 penalty.
+        Args:
+            loss (torch.Tensor): The loss tensor.
+            logits (torch.Tensor): Shape [B, T, V]. The logits tensor.
+            l2_penalty_factor (float): The factor for the L2 penalty.
+        """
+        maxx, ids = torch.max(logits, dim=-1, keepdim=True)
+        ctx.logits_shape = logits.shape
+        factor = l2_penalty_factor / (logits.shape[0] * logits.shape[1])
+        maxx = maxx * factor
+        ctx.save_for_backward(maxx, ids)
         return loss
 
     @staticmethod
     def backward(ctx, grad_output):
-        logits = ctx.saved_tensors[0]
-
-        factor = 1e-4 / (logits.shape[0] * logits.shape[1])
-        maxx, ids = torch.max(logits, -1, keepdim=True)
-
-        glogits = torch.zeros_like(logits)
-        penalty_grad = maxx * factor
-        glogits.scatter_(-1, ids, penalty_grad)
-
-        return grad_output, glogits
+        maxx, ids = ctx.saved_tensors
+        glogits = torch.zeros(ctx.logits_shape, device=grad_output.device,
+                              dtype=grad_output.dtype)
+        glogits.scatter_(-1, ids, maxx)
+        return grad_output, glogits, None
 
 
 l2_warp = L2Wrap.apply
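
For context, a minimal sketch of how the memory-optimized penalty might be used in a training step. The shapes, the cross-entropy loss, and the variable names below are illustrative assumptions, not part of this commit; only l2_warp / L2Wrap come from the code above.

import torch
import torch.nn.functional as F

# Hypothetical shapes: batch B=2, sequence length T=8, vocab size V=100.
logits = torch.randn(2, 8, 100, requires_grad=True)
targets = torch.randint(0, 100, (2, 8))

loss = F.cross_entropy(logits.view(-1, logits.shape[-1]), targets.view(-1))

# Forward is a no-op on the loss value; backward adds a gradient that pushes
# down the max logit at each position, scaled by l2_penalty_factor / (B * T).
# Only the scaled max values and their indices are saved for backward, so the
# full [B, T, V] logits tensor is not retained. The extra None returned by
# backward corresponds to l2_penalty_factor, which needs no gradient.
loss = l2_warp(loss, logits)
loss.backward()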