Commit b708f79
[NPU]: update the native KLDivLoss implementation for comparison (e.g. test_jsd.py) (#1032)
## Summary

This PR modifies the NPU test reference for KLDivLoss. The native NPU KLDivLoss operator does not support gradients w.r.t. the target ([#1021](#1021)), which caused failures in test_jsd.py (where input and target are swapped when beta != 0). To resolve this, I replaced the native operator with a custom implementation built from basic math operations. This allows correct gradient computation for the target and aligns the x1.grad results with the Triton kernel implementation.

## Testing Done

I ran test_jsd and test_fused_linear_jsd with the commands below, and all cases passed:

```shell
pytest -v test/transformers/test_jsd.py
pytest -v test/transformers/test_fused_linear_jsd.py
```

Hardware Type: Ascend NPU 910B3

- [x] run `make test` to ensure correctness
- [x] run `make checkstyle` to ensure code style
- [ ] run `make test-convergence` to ensure convergence
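The "basic math operations" replacement relies on the identity PyTorch documents for `log_target=True`: the elementwise loss is `exp(target) * (target - input)`. A minimal CPU sketch (names here are illustrative, not from the patch) confirming the manual formula matches `torch.nn.KLDivLoss`:

```python
import torch

torch.manual_seed(0)

# Both arguments are log-probabilities, matching log_target=True usage.
inp = torch.randn(4, 5).log_softmax(dim=-1)
tgt = torch.randn(4, 5).log_softmax(dim=-1)

# Manual elementwise KL divergence, as used by the custom NPU reference.
manual = torch.exp(tgt) * (tgt - inp)

# Native operator with the same settings as the test reference.
ref = torch.nn.KLDivLoss(reduction="none", log_target=True)(inp, tgt)

print(torch.allclose(manual, ref, atol=1e-6))  # True
```

Because the manual version is plain autograd-traceable math, gradients propagate through both arguments, which is exactly what the native NPU operator fails to do for the target.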
1 parent 559e9a1 commit b708f79

File tree

1 file changed (+29 −1 lines)


test/transformers/test_jsd.py

Lines changed: 29 additions & 1 deletion
```diff
@@ -18,6 +18,31 @@
 set_seed(42)


+class NPUKLDivLoss(torch.nn.Module):
+    """
+    A custom KLDivLoss for NPU.
+
+    On NPU devices, torch.nn.KLDivLoss does not compute gradients with respect to the target.
+    This leads to incorrect gradient computation when the target depends on the input,
+    such as in JSD or reverse KLDiv.
+    See https://github.com/linkedin/Liger-Kernel/issues/1021 for more details.
+    """
+
+    def __init__(self, reduction="none", log_target=True):
+        super().__init__()
+
+    def forward(self, input, target):
+        original_dtype = input.dtype
+
+        if input.dtype in [torch.float16, torch.bfloat16]:
+            input = input.float()
+            target = target.float()
+
+        loss = torch.exp(target) * (target - input)
+
+        return loss.to(original_dtype)
+
+
 class JSD(torch.nn.Module):
     def __init__(
         self,
@@ -26,7 +51,10 @@ def __init__(
         dtype: torch.dtype = torch.float,
     ):
         super(JSD, self).__init__()
-        self.kl = KLDivLoss(reduction="none", log_target=True)
+        if device == "npu":
+            self.kl = NPUKLDivLoss(reduction="none", log_target=True)
+        else:
+            self.kl = KLDivLoss(reduction="none", log_target=True)
         self.beta = beta
         self.ignore_index = ignore_index
         self.dtype = dtype
```
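As a sanity check of the patch's intent, the sketch below restates `NPUKLDivLoss` from the diff on CPU and confirms that gradients flow back to the tensor the target was built from — the situation test_jsd.py exercises when input and target are swapped for beta != 0 (the surrounding setup is illustrative, not taken from the test file):

```python
import torch

class NPUKLDivLoss(torch.nn.Module):
    """CPU restatement of the patched class: KLDiv via basic math ops."""
    def __init__(self, reduction="none", log_target=True):
        super().__init__()

    def forward(self, input, target):
        original_dtype = input.dtype
        if input.dtype in [torch.float16, torch.bfloat16]:
            # Upcast for numerical stability, then cast the loss back.
            input = input.float()
            target = target.float()
        loss = torch.exp(target) * (target - input)
        return loss.to(original_dtype)

# The target depends on a leaf tensor, as in JSD when beta != 0.
x = torch.randn(2, 3, requires_grad=True)
target = x.log_softmax(dim=-1)
inp = torch.randn(2, 3).log_softmax(dim=-1)

loss = NPUKLDivLoss()(inp, target)
loss.sum().backward()
print(x.grad is not None)  # True: the target received a gradient
```

With the native NPU operator, `x.grad` would be missing or wrong in this scenario, which is precisely why the test reference switches to the manual formula on `device == "npu"`.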
