Skip to content

Commit fa24166

Browse files
authored
[Cross-entropy] get valid predicted probabilities (#864)
## Summary

Forgot to check for ignored tokens when calculating probabilities in #860

- Hardware Type: cuda
- [x] run `make test` to ensure correctness
- [x] run `make checkstyle` to ensure code style
- [x] run `make test-convergence` to ensure convergence
1 parent a089cd5 commit fa24166

File tree

3 files changed

+54
-2
lines changed

3 files changed

+54
-2
lines changed

src/liger_kernel/ops/fused_linear_cross_entropy.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -101,8 +101,21 @@ def fused_linear_cross_entropy_forward(
101101
# Compute softmax to get predicted probabilities
102102
probs = torch.softmax(logits_for_softmax, dim=-1)
103103

104-
# Get the predicted probability for each target token
105-
pred_probs = torch.gather(probs, -1, target_chunk.unsqueeze(-1)).squeeze(-1)
104+
# Get predicted probabilities for token scaling, handling ignored targets
105+
valid_target_mask = target_chunk != ignore_index
106+
valid_targets = target_chunk[valid_target_mask]
107+
108+
if len(valid_targets) > 0:
109+
# Gather probabilities only for valid targets
110+
valid_probs = probs[valid_target_mask]
111+
pred_probs_valid = torch.gather(valid_probs, -1, valid_targets.unsqueeze(-1)).squeeze(-1)
112+
113+
# Create full tensor with zeros for ignored targets
114+
pred_probs = torch.zeros_like(target_chunk, dtype=probs.dtype, device=probs.device)
115+
pred_probs[valid_target_mask] = pred_probs_valid
116+
else:
117+
# All targets are ignored
118+
pred_probs = torch.zeros_like(target_chunk, dtype=probs.dtype, device=probs.device)
106119

107120
# Store the scaling factors
108121
scaling_factors = pred_probs.detach() # Detach to ensure no gradient flow

src/liger_kernel/transformers/model/loss_utils.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ def fixed_fused_linear_cross_entropy(
2525
ignore_index=ignore_index,
2626
softcap=final_logit_softcapping,
2727
accum_dtype=accum_dtype,
28+
**kwargs,
2829
)
2930
if reduction == "sum":
3031
loss = loss / num_items_in_batch

test/transformers/test_fused_linear_cross_entropy.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -578,3 +578,41 @@ def test_correctness_token_scaling_module():
578578

579579
# Check that gradients are close
580580
assert torch.allclose(x1.grad, x2.grad, atol=1e-5, rtol=1e-5)
581+
582+
583+
def test_token_scaling_with_ignore_index():
584+
"""Test token scaling when some targets have ignore_index values."""
585+
B, T, H, V = 2, 4, 8, 1000
586+
dtype = torch.float32
587+
588+
# Create inputs
589+
_input = torch.randn(B * T, H, device=device, dtype=dtype, requires_grad=True)
590+
591+
# Create targets with some ignore_index values (-100)
592+
target = torch.tensor([0, 100, -100, 500, -100, 999], device=device, dtype=torch.long)
593+
_input = torch.randn(6, H, device=device, dtype=dtype, requires_grad=True) # Adjust input size
594+
595+
# Create weights
596+
weight = torch.randn(V, H, device=device, dtype=dtype)
597+
bias = torch.randn(V, device=device, dtype=dtype)
598+
599+
# Test using functional API with token scaling
600+
loss_scaled = liger_fused_linear_cross_entropy(
601+
input=_input,
602+
weight=weight,
603+
target=target,
604+
bias=bias,
605+
ignore_index=-100,
606+
reduction="sum",
607+
use_token_scaling=True,
608+
)
609+
610+
# This should not raise any CUDA errors
611+
assert loss_scaled.numel() == 1 # Should return a scalar for sum reduction
612+
assert not torch.isnan(loss_scaled) # Should not be NaN
613+
assert not torch.isinf(loss_scaled) # Should not be infinite
614+
615+
# Test gradients
616+
loss_scaled.backward()
617+
assert _input.grad is not None
618+
assert not torch.isnan(_input.grad).any() # Gradients should not be NaN

0 commit comments

Comments (0)