
Commit 4980c3f

kashif and lancerts authored
[cross-entropy-loss] Added support for DFT flag (#860)
## Summary

Added support for a flag that turns on the DFT cross-entropy loss from the paper https://arxiv.org/abs/2508.05629

- Hardware Type: cuda
- [x] run `make test` to ensure correctness
- [x] run `make checkstyle` to ensure code style
- [x] run `make test-convergence` to ensure convergence

Co-authored-by: Shao Tang <[email protected]>
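Concretely, with the new `use_token_scaling` flag enabled, each token's cross-entropy loss is multiplied by the model's detached predicted probability of that token's true class. In the notation below (ours, not the diff's), for token $i$ with logits $z_i$ and label $y_i$:

$$\ell_i = \operatorname{sg}\big(p_{i,y_i}\big)\cdot \mathrm{CE}(z_i, y_i), \qquad \frac{\partial \ell_i}{\partial z_i} = \operatorname{sg}\big(p_{i,y_i}\big)\cdot \frac{\partial\, \mathrm{CE}(z_i, y_i)}{\partial z_i},$$

where $p_{i,y_i} = \operatorname{softmax}(z_i)_{y_i}$ and $\operatorname{sg}(\cdot)$ is stop-gradient (`detach()`). This matches the scaling applied to both the loss and the logits gradient in the kernel changes below.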
1 parent 77a4c1a commit 4980c3f

4 files changed, +272 −1 lines changed


src/liger_kernel/ops/fused_linear_cross_entropy.py

Lines changed: 41 additions & 1 deletion
```diff
@@ -26,6 +26,7 @@ def fused_linear_cross_entropy_forward(
     softcap=None,
     return_z_loss=False,
     accum_dtype=None,
+    use_token_scaling=False,
 ):
     assert isinstance(return_z_loss, bool), f"return_z_loss must be True or False. Got: {return_z_loss}"
     device = _input.device
@@ -89,6 +90,23 @@ def fused_linear_cross_entropy_forward(

         n_rows = logits_chunk.shape[0]

+        # Compute predicted probabilities for token scaling if needed
+        if use_token_scaling:
+            # Compute softmax probabilities for scaling
+            # We need to compute this before the cross entropy kernel modifies logits_chunk
+            logits_for_softmax = logits_chunk.detach().clone()  # Detach to avoid gradient flow
+            if softcap is not None:
+                logits_for_softmax = softcap * torch.tanh(logits_for_softmax / softcap)
+
+            # Compute softmax to get predicted probabilities
+            probs = torch.softmax(logits_for_softmax, dim=-1)
+
+            # Get the predicted probability for each target token
+            pred_probs = torch.gather(probs, -1, target_chunk.unsqueeze(-1)).squeeze(-1)
+
+            # Store the scaling factors
+            scaling_factors = pred_probs.detach()  # Detach to ensure no gradient flow
+
         # unreduced loss
         loss_1d_slice = loss_1d[start_idx:end_idx]  # chunk_size,
         z_loss_1d_slice = z_loss_1d[start_idx:end_idx] if return_z_loss else None
@@ -123,11 +141,23 @@ def fused_linear_cross_entropy_forward(
             num_warps=32 if not is_hip() else 16,
         )

+        # Apply token scaling if requested
+        if use_token_scaling:
+            loss_1d_slice = loss_1d_slice * scaling_factors
+            if return_z_loss:
+                z_loss_1d_slice = z_loss_1d_slice * scaling_factors
+
         loss_1d[start_idx:end_idx] = loss_1d_slice
         if return_z_loss:
             z_loss_1d[start_idx:end_idx] = z_loss_1d_slice
         grad_logits_chunk = logits_chunk  # chunk_size x V

+        # Apply token scaling to gradients if requested
+        if use_token_scaling:
+            # Expand scaling factors to match gradient dimensions
+            scaling_factors_expanded = scaling_factors.unsqueeze(-1)  # chunk_size x 1
+            grad_logits_chunk = grad_logits_chunk * scaling_factors_expanded
+
         grad_input[start_idx:end_idx] = grad_logits_chunk @ weight

         if grad_weight is not None:
@@ -136,7 +166,7 @@ def fused_linear_cross_entropy_forward(
         if bias is not None:
             torch.add(
                 input=grad_bias,
-                other=logits_chunk.sum(dim=0),
+                other=grad_logits_chunk.sum(dim=0),
                 out=grad_bias,
                 alpha=1.0,
             )
@@ -146,6 +176,10 @@ def fused_linear_cross_entropy_forward(
     # loss = loss_1d
    # z_loss = z_loss_1d if return_z_loss else None

+    if reduction == "none":
+        # Return per-token losses
+        loss = loss_1d
+        z_loss = z_loss_1d if return_z_loss else None
     else:
         loss = torch.sum(loss_1d)
         z_loss = torch.sum(z_loss_1d) if return_z_loss else None
@@ -221,6 +255,7 @@ def forward(
         softcap=None,
         return_z_loss: bool = False,
         accum_dtype=None,
+        use_token_scaling: bool = False,
     ):
         """
         Fusing the last linear layer with cross-entropy loss
@@ -241,6 +276,9 @@ def forward(
         reduction: reduction to apply
         accum_dtype (torch.dtype): the dtype of intermediate result buffers for weight and bias gradient accumulations.
            Recommended to set `accum_dtype` to higher precision, e.g. `torch.float32`, if the training is unstable with original dtype. Default: `None`, performing accumulations in original dtype
+        use_token_scaling (bool): whether to scale each token's loss by its predicted probability (detached).
+            When True, each token's loss is multiplied by the model's predicted probability for that token's true class.
+            Default: False.
         """

         loss, z_loss, grad_input, grad_weight, grad_bias = fused_linear_cross_entropy_forward(
@@ -256,6 +294,7 @@ def forward(
             softcap=softcap,
             return_z_loss=return_z_loss,
             accum_dtype=accum_dtype,
+            use_token_scaling=use_token_scaling,
         )
         # downcast to dtype and store for backward
         ctx.save_for_backward(
@@ -288,4 +327,5 @@ def backward(ctx, grad_output, grad_output2):
             None,
             None,
             None,
+            None,  # use_token_scaling
         )
```
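A note on the bias-gradient line changed above: because the scaling factor is detached, scaling the per-token loss is equivalent to scaling the cross-entropy gradient by the same factor, which is why the kernel multiplies `grad_logits_chunk` by `scaling_factors` and now sums the scaled gradient into `grad_bias`. A minimal eager-mode sanity check of that identity (not part of the diff, plain PyTorch on any device):

```python
import torch

torch.manual_seed(0)
N, V = 8, 16
logits = torch.randn(N, V, requires_grad=True)
target = torch.randint(0, V, (N,))

# Detached predicted probability of the true class, mirroring `scaling_factors` above
pred_probs = torch.softmax(logits.detach(), dim=-1).gather(1, target.unsqueeze(-1)).squeeze(-1)

# Gradient of the scaled loss ...
scaled_loss = (torch.nn.functional.cross_entropy(logits, target, reduction="none") * pred_probs).sum()
(grad_scaled,) = torch.autograd.grad(scaled_loss, logits)

# ... equals the per-token scaled gradient of the unscaled loss
plain_loss = torch.nn.functional.cross_entropy(logits, target, reduction="none").sum()
(grad_plain,) = torch.autograd.grad(plain_loss, logits)
assert torch.allclose(grad_scaled, grad_plain * pred_probs.unsqueeze(-1), atol=1e-6)
```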

src/liger_kernel/transformers/functional.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -65,6 +65,7 @@ def liger_fused_linear_cross_entropy(
     softcap: Optional[float] = None,
     return_z_loss: bool = False,
     accum_dtype=None,
+    use_token_scaling: bool = False,
 ):
     loss, z_loss = LigerFusedLinearCrossEntropyFunction.apply(
         input,
@@ -79,6 +80,7 @@ def liger_fused_linear_cross_entropy(
         softcap,
         return_z_loss,
         accum_dtype,
+        use_token_scaling,
     )
     if not return_z_loss:
         return loss
```
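For context, a minimal call through this functional wrapper with the new flag could look like the sketch below. The device and shapes are illustrative, and the import path is inferred from the file path above; the argument names mirror the tests added in this commit.

```python
import torch
from liger_kernel.transformers.functional import liger_fused_linear_cross_entropy

device = "cuda"  # illustrative; the PR was tested on CUDA hardware
B, T, H, V = 2, 4, 8, 16

_input = torch.randn(B * T, H, device=device, requires_grad=True)
weight = torch.randn(V, H, device=device)
bias = torch.randn(V, device=device)
target = torch.randint(0, V, (B * T,), device=device)

# Per-token DFT-style losses: each token's CE scaled by its detached predicted probability
loss = liger_fused_linear_cross_entropy(
    input=_input,
    weight=weight,
    target=target,
    bias=bias,
    reduction="none",
    use_token_scaling=True,
)
loss.sum().backward()
```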

src/liger_kernel/transformers/fused_linear_cross_entropy.py

Lines changed: 3 additions & 0 deletions
```diff
@@ -16,6 +16,7 @@ def __init__(
         softcap: Optional[float] = None,
         return_z_loss: bool = False,
         accum_dtype: Optional[torch.dtype] = None,
+        use_token_scaling: bool = False,
     ):
         super().__init__()
         assert (label_smoothing >= 0) and (label_smoothing <= 1), (
@@ -34,6 +35,7 @@ def __init__(
         self.softcap = softcap
         self.return_z_loss = return_z_loss
         self.accum_dtype = accum_dtype
+        self.use_token_scaling = use_token_scaling

     def forward(self, lin_weight, _input, target, bias=None):
         loss, z_loss = LigerFusedLinearCrossEntropyFunction.apply(
@@ -49,6 +51,7 @@ def forward(self, lin_weight, _input, target, bias=None):
             self.softcap,
             self.return_z_loss,
             self.accum_dtype,
+            self.use_token_scaling,
         )
         if not self.return_z_loss:
             return loss
```
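Similarly, a short sketch of the module API with the new switch; device and shapes are illustrative, and the positional argument order `(lin_weight, _input, target, bias)` follows `forward` above, as in the new tests.

```python
import torch
from liger_kernel.transformers.fused_linear_cross_entropy import LigerFusedLinearCrossEntropyLoss

device = "cuda"  # illustrative
B, T, H, V = 2, 4, 8, 16

flce = LigerFusedLinearCrossEntropyLoss(reduction="sum", use_token_scaling=True)

_input = torch.randn(B * T, H, device=device, requires_grad=True)
weight = torch.randn(V, H, device=device)
bias = torch.randn(V, device=device)
target = torch.randint(0, V, (B * T,), device=device)

# Note the argument order: (lin_weight, _input, target, bias)
loss = flce(weight, _input, target, bias)
loss.backward()
```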

test/transformers/test_fused_linear_cross_entropy.py

Lines changed: 226 additions & 0 deletions
```diff
@@ -352,3 +352,229 @@ def test_amp(B, T, H, V, bias, cast_dtype, accum_dtype, atol, rtol):
         atol=atol,
         rtol=rtol,
     )
+
+
+def test_correctness_token_scaling():
+    """Test that token scaling produces the correct loss values and gradients."""
+    B, T, H, V = 2, 4, 8, 16
+    dtype = torch.float32
+
+    # Create inputs
+    _input = torch.randn(B * T, H, device=device, dtype=dtype, requires_grad=True)
+    target = torch.randint(0, V, (B * T,), device=device, dtype=torch.long)
+
+    # Create weights
+    weight = torch.randn(V, H, device=device, dtype=dtype)
+    bias = torch.randn(V, device=device, dtype=dtype)
+
+    # Test using functional API with token scaling
+    loss_scaled = liger_fused_linear_cross_entropy(
+        input=_input,
+        weight=weight,
+        target=target,
+        bias=bias,
+        ignore_index=-100,
+        reduction="none",  # Use "none" to get per-token losses
+        use_token_scaling=True,
+    )
+
+    # Compare with manual implementation
+    # Compute logits
+    logits = _input @ weight.t()
+    if bias is not None:
+        logits = logits + bias
+
+    # Compute standard cross entropy loss per token
+    ce_loss = torch.nn.functional.cross_entropy(logits, target, ignore_index=-100, reduction="none")
+
+    # Compute predicted probabilities for target tokens
+    pred_probs = torch.softmax(logits, dim=-1).gather(1, target.unsqueeze(-1)).squeeze(-1).detach()
+
+    # Scale by predicted probabilities
+    expected_loss = ce_loss * pred_probs
+
+    # Check that losses are close
+    assert torch.allclose(loss_scaled, expected_loss, atol=1e-4, rtol=1e-4)
+
+    # Test gradients
+    loss_scaled.sum().backward(retain_graph=True)
+    grad_scaled = _input.grad.clone()
+    _input.grad.zero_()
+
+    expected_loss.sum().backward(retain_graph=True)
+    grad_expected = _input.grad.clone()
+    _input.grad.zero_()
+
+    # Check that gradients are close
+    assert torch.allclose(grad_scaled, grad_expected, atol=1e-4, rtol=1e-4)
+
+
+def test_correctness_token_scaling_consistency():
+    """Test that token scaling is consistent between functional and module APIs."""
+    B, T, H, V = 2, 4, 8, 16
+    dtype = torch.float32
+
+    # Create inputs
+    _input = torch.randn(B * T, H, device=device, dtype=dtype, requires_grad=True)
+    target = torch.randint(0, V, (B * T,), device=device, dtype=torch.long)
+
+    # Create weights
+    weight = torch.randn(V, H, device=device, dtype=dtype)
+    bias = torch.randn(V, device=device, dtype=dtype)
+
+    # Test functional API
+    loss_functional = liger_fused_linear_cross_entropy(
+        input=_input,
+        weight=weight,
+        target=target,
+        bias=bias,
+        ignore_index=-100,
+        reduction="sum",
+        use_token_scaling=True,
+    )
+
+    # Test module API
+    ce_loss_module = LigerFusedLinearCrossEntropyLoss(
+        ignore_index=-100,
+        reduction="sum",
+        use_token_scaling=True,
+    )
+
+    loss_module = ce_loss_module(weight, _input, target, bias)
+
+    # Check that losses are identical
+    assert torch.allclose(loss_functional, loss_module, atol=1e-6, rtol=1e-6)
+
+    # Test gradients
+    loss_functional.backward(retain_graph=True)
+    grad_functional = _input.grad.clone()
+    _input.grad.zero_()
+
+    loss_module.backward(retain_graph=True)
+    grad_module = _input.grad.clone()
+    _input.grad.zero_()
+
+    # Check that gradients are identical
+    assert torch.allclose(grad_functional, grad_module, atol=1e-6, rtol=1e-6)
+
+
+def test_correctness_token_scaling_functional():
+    """Test token scaling using the functional API."""
+    B, T, H, V = 2, 4, 8, 16
+    dtype = torch.float32
+
+    # Create inputs
+    _input = torch.randn(B * T, H, device=device, dtype=dtype)
+    x1 = _input.detach().clone().requires_grad_(True)
+    x2 = _input.detach().clone().requires_grad_(True)
+
+    target = torch.randint(0, V, (B * T,), device=device, dtype=torch.long)
+
+    # Create weights
+    weight = torch.randn(V, H, device=device, dtype=dtype)
+    bias = torch.randn(V, device=device, dtype=dtype)
+
+    # Test using functional API with token scaling
+    y1 = liger_fused_linear_cross_entropy(
+        input=x1,
+        weight=weight,
+        target=target,
+        bias=bias,
+        ignore_index=-100,
+        lse_square_scale=0.0,
+        label_smoothing=0.0,
+        reduction="sum",  # Use sum for easier verification
+        softcap=None,
+        return_z_loss=False,
+        accum_dtype=None,
+        use_token_scaling=True,
+    )
+
+    # Compare with manual implementation
+    # Compute logits
+    logits = x2 @ weight.t()
+    if bias is not None:
+        logits = logits + bias
+
+    # Compute softmax probabilities
+    probs = torch.softmax(logits.detach(), dim=-1)  # Detach to avoid gradient flow
+
+    # Get predicted probabilities for target tokens
+    pred_probs = torch.gather(probs, -1, target.unsqueeze(-1)).squeeze(-1)
+
+    # Compute standard cross entropy loss
+    ce_loss = torch.nn.functional.cross_entropy(logits, target, ignore_index=-100, reduction="none")
+
+    # Scale by predicted probabilities
+    scaled_loss = ce_loss * pred_probs
+
+    # Sum over all tokens
+    y2 = scaled_loss.sum()
+
+    # Check that losses are close
+    assert torch.allclose(y1, y2, atol=1e-5, rtol=1e-5)
+
+    # Test gradients
+    y1.backward()
+    y2.backward()
+
+    # Check that gradients are close
+    assert torch.allclose(x1.grad, x2.grad, atol=1e-5, rtol=1e-5)
+
+
+def test_correctness_token_scaling_module():
+    """Test token scaling using the module API."""
+    B, T, H, V = 2, 4, 8, 16
+    dtype = torch.float32
+
+    # Create inputs
+    _input = torch.randn(B * T, H, device=device, dtype=dtype)
+    x1 = _input.detach().clone().requires_grad_(True)
+    x2 = _input.detach().clone().requires_grad_(True)
+
+    target = torch.randint(0, V, (B * T,), device=device, dtype=torch.long)
+
+    # Create module with token scaling
+    ce_loss = LigerFusedLinearCrossEntropyLoss(
+        ignore_index=-100,
+        reduction="sum",
+        use_token_scaling=True,
+    )
+
+    # Create weights
+    weight = torch.randn(V, H, device=device, dtype=dtype)
+    bias = torch.randn(V, device=device, dtype=dtype)
+
+    # Test using module API with token scaling
+    y1 = ce_loss(weight, x1, target, bias)
+
+    # Compare with manual implementation
+    # Compute logits
+    logits = x2 @ weight.t()
+    if bias is not None:
+        logits = logits + bias
+
+    # Compute softmax probabilities
+    probs = torch.softmax(logits.detach(), dim=-1)  # Detach to avoid gradient flow
+
+    # Get predicted probabilities for target tokens
+    pred_probs = torch.gather(probs, -1, target.unsqueeze(-1)).squeeze(-1)
+
+    # Compute standard cross entropy loss
+    ce_loss_manual = torch.nn.functional.cross_entropy(logits, target, ignore_index=-100, reduction="none")
+
+    # Scale by predicted probabilities
+    scaled_loss = ce_loss_manual * pred_probs
+
+    # Sum over all tokens
+    y2 = scaled_loss.sum()
+
+    # Check that losses are close
+    assert torch.allclose(y1, y2, atol=1e-5, rtol=1e-5)
+
+    # Test gradients
+    y1.backward()
+    y2.backward()
+
+    # Check that gradients are close
+    assert torch.allclose(x1.grad, x2.grad, atol=1e-5, rtol=1e-5)
```
