Commit 33f7f48

roycho96 and Sunghyun Cho authored
Fix: fix ignore_index not being applied in JSD distillation loss (#974)
## Summary

Fix the `ignore_index` parameter not being applied in `LigerFusedLinearJSDLoss`. The `ignore_index` parameter was accepted but never used in `distillation_loss_fn`, causing all tokens (including padding/prompt tokens) to be included in the loss computation.

### Changes

- Change `reduction='sum'` to `reduction='none'` for per-token masking
- Use `masked_fill` for dtype preservation (prevents bf16 → fp32 promotion)
- Add `clamp_min(1)` to prevent NaN when all tokens are ignored
- Normalize by `num_valid_tokens` instead of `full_target.shape[0]`
- Add comprehensive `ignore_index` tests

## Testing Done

- Hardware Type: H100
- [x] run `make test` to ensure correctness
- [x] run `make checkstyle` to ensure code style
- [x] run `make test-convergence` to ensure convergence

---------

Co-authored-by: Sunghyun Cho <[email protected]>
1 parent fe1ea95 commit 33f7f48
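For context, a minimal standalone sketch of the masking pattern described in the Changes list above; the tensor names and shapes are illustrative stand-ins, not the kernel's actual internals:

```python
import torch

ignore_index = -100
# Stand-ins: `target` holds labels for a flattened chunk of tokens, and
# `per_token_loss` holds per-token JSD values obtained with reduction="none"
# after summing over the vocab dimension.
target = torch.tensor([3, 7, ignore_index, ignore_index])
per_token_loss = torch.rand(4, dtype=torch.bfloat16)

mask = target != ignore_index
# masked_fill preserves the bf16 dtype; multiplying by mask.float() would promote to fp32
masked_loss = per_token_loss.masked_fill(~mask, 0.0)

# clamp_min(1) avoids 0/0 -> NaN when every token carries ignore_index
num_valid_tokens = mask.sum().clamp_min(1)
loss = masked_loss.sum() / num_valid_tokens
```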

File tree: 4 files changed, +59 -19 lines changed

src/liger_kernel/chunked_loss/fused_linear_distillation.py

Lines changed: 8 additions & 3 deletions
```diff
@@ -132,10 +132,15 @@ def _compute_loss(
             )
             student_logits_chunk = torch.cat([student_logits_chunk, pad_tensor], dim=-1)
 
-        hard_loss /= full_target.shape[0]
+        num_valid_tokens = (full_target != ignore_index).sum()
+        num_valid_tokens = num_valid_tokens.clamp_min(1)  # to avoid division by zero
 
-        soft_loss = distillation_loss_fn(student_logits_chunk, teacher_logits_chunk, **loss_kwargs)
-        soft_loss /= full_target.shape[0]
+        hard_loss /= num_valid_tokens
+
+        soft_loss = distillation_loss_fn(
+            student_logits_chunk, teacher_logits_chunk, target=target_chunk, ignore_index=ignore_index, **loss_kwargs
+        )
+        soft_loss /= num_valid_tokens
 
         loss = weight_hard_loss * hard_loss + weight_soft_loss * soft_loss
         return loss, (soft_loss, hard_loss, student_logits_chunk, teacher_logits_chunk)
```
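To see what the new denominator changes, here is a toy comparison with made-up numbers (not the kernel's real tensors):

```python
import torch

ignore_index = -100
# Assumed toy values: three valid tokens, three ignored tokens.
full_target = torch.tensor([5, 2, ignore_index, ignore_index, 9, ignore_index])
per_token_losses = torch.tensor([0.4, 0.6, 0.0, 0.0, 0.5, 0.0])  # ignored positions already zeroed

old_denominator = full_target.shape[0]                               # 6: counts ignored tokens too
new_denominator = (full_target != ignore_index).sum().clamp_min(1)   # 3: valid tokens only

print(per_token_losses.sum() / old_denominator)  # tensor(0.2500) -- diluted by ignored tokens
print(per_token_losses.sum() / new_denominator)  # tensor(0.5000) -- mean over valid tokens only
```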

src/liger_kernel/chunked_loss/jsd_loss.py

Lines changed: 21 additions & 6 deletions
```diff
@@ -11,35 +11,50 @@
 
 class LigerFusedLinearJSDFunction(LigerFusedLinearDistillationBase):
     @staticmethod
-    def distillation_loss_fn(student_logits, teacher_logits, beta=0.5):
+    def distillation_loss_fn(student_logits, teacher_logits, beta=0.5, target=None, ignore_index=-100):
         """
         Compute JSD loss (Jensen-Shannon Divergence Loss).
         Args:
             student_logits (torch.Tensor): Logits of student tokens. Shape: (batch_size * seq_len,).
             teacher_logits (torch.Tensor): Logits of teacher tokens. Shape: (batch_size * seq_len,).
             beta (float): Coefficient beta of generalized JSD in the interval [0, 1]. Default: `0.5`.
+            target (torch.Tensor): Target labels for masking. Shape: (chunk_size,).
+            ignore_index (int): Index to ignore in loss computation.
         Returns:
             torch.Tensor: Jensen-Shannon Divergence loss
+        Note:
+            - Uses reduction="none" to preserve per-token losses for masking
+            - KL divergence requires summing over vocab dimension (not mean)
+            - Masking excludes padding/prompt tokens from loss computation
         """
         student_log_probs = F.log_softmax(student_logits, dim=-1)
         teacher_log_probs = F.log_softmax(teacher_logits, dim=-1)
 
         if beta == 0:
-            jsd_loss = F.kl_div(student_log_probs, teacher_log_probs, reduction="sum", log_target=True)
+            jsd_loss = F.kl_div(student_log_probs, teacher_log_probs, reduction="none", log_target=True)
         elif beta == 1:
-            jsd_loss = F.kl_div(teacher_log_probs, student_log_probs, reduction="sum", log_target=True)
+            jsd_loss = F.kl_div(teacher_log_probs, student_log_probs, reduction="none", log_target=True)
         else:
             # Compute probabilities (only required for mean calculation)
             log_mean_probs = torch.logsumexp(
                 torch.stack([student_log_probs + math.log(1 - beta), teacher_log_probs + math.log(beta)], dim=0), dim=0
             )
 
-            student_kl = F.kl_div(log_mean_probs, student_log_probs, reduction="sum", log_target=True)
-            teacher_kl = F.kl_div(log_mean_probs, teacher_log_probs, reduction="sum", log_target=True)
+            student_kl = F.kl_div(log_mean_probs, student_log_probs, reduction="none", log_target=True)
+            teacher_kl = F.kl_div(log_mean_probs, teacher_log_probs, reduction="none", log_target=True)
 
             # JSD is the weighted average of the KL divergences
             jsd_loss = beta * teacher_kl + (1 - beta) * student_kl
-        return jsd_loss
+
+        # Sum over vocab dimension (KL divergence definition)
+        jsd_loss = jsd_loss.sum(dim=-1)  # (chunk_size,)
+
+        # Apply ignore_index mask
+        if target is not None:
+            mask = target != ignore_index
+            jsd_loss = jsd_loss.masked_fill(~mask, 0.0)
+
+        return jsd_loss.sum()
 
     @classmethod
     def forward(
```
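As a sanity check on the per-token formulation (a standalone sketch, not the library API), the masked sum of the reduction="none" KL terms equals summing only over the valid rows:

```python
import math
import torch
import torch.nn.functional as F

torch.manual_seed(0)
beta, ignore_index = 0.5, -100
student_logits = torch.randn(4, 8)   # (chunk_size, vocab_size) -- toy sizes
teacher_logits = torch.randn(4, 8)
target = torch.tensor([1, ignore_index, 3, ignore_index])

student_log_probs = F.log_softmax(student_logits, dim=-1)
teacher_log_probs = F.log_softmax(teacher_logits, dim=-1)
log_mean_probs = torch.logsumexp(
    torch.stack([student_log_probs + math.log(1 - beta), teacher_log_probs + math.log(beta)], dim=0), dim=0
)
student_kl = F.kl_div(log_mean_probs, student_log_probs, reduction="none", log_target=True)
teacher_kl = F.kl_div(log_mean_probs, teacher_log_probs, reduction="none", log_target=True)

# Per-token generalized JSD: sum the elementwise KL terms over the vocab dimension.
per_token_jsd = (beta * teacher_kl + (1 - beta) * student_kl).sum(dim=-1)

mask = target != ignore_index
masked_total = per_token_jsd.masked_fill(~mask, 0.0).sum()
valid_only_total = per_token_jsd[mask].sum()
assert torch.allclose(masked_total, valid_only_total)
```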

test/chunked_loss/test_jsd_loss.py

Lines changed: 27 additions & 9 deletions
```diff
@@ -37,12 +37,14 @@ def __init__(
             temperature=temperature,
         )
 
-    def distillation_loss(self, student_logits, teacher_logits, beta=0.5):
+    def distillation_loss(self, student_logits, teacher_logits, target=None, ignore_index=-100, beta=0.5):
         """
         Compute JSD loss (Jensen-Shannon Divergence Loss).
         Args:
-            student_logits (torch.Tensor): Logits of student tokens. Shape: (batch_size * seq_len,).
-            teacher_logits (torch.Tensor): Logits of teacher tokens. Shape: (batch_size * seq_len,).
+            student_logits (torch.Tensor): Logits of student tokens. Shape: (batch_size * seq_len, vocab_size).
+            teacher_logits (torch.Tensor): Logits of teacher tokens. Shape: (batch_size * seq_len, vocab_size).
+            target (torch.Tensor): Target labels for masking. Shape: (batch_size * seq_len,).
+            ignore_index (int): Index to ignore in loss computation.
             beta (float): Coefficient beta of generalized JSD in the interval [0, 1]. Default: `0.5`.
         Returns:
             torch.Tensor: Jensen-Shannon Divergence loss
@@ -55,17 +57,24 @@ def distillation_loss(self, student_logits, teacher_logits, beta=0.5):
         elif beta == 1:
             jsd_loss = F.kl_div(teacher_log_probs, student_log_probs, reduction="none", log_target=True)
         else:
-            # Compute probabilities (only required for mean calculation)
             log_mean_probs = torch.logsumexp(
                 torch.stack([student_log_probs + math.log(1 - beta), teacher_log_probs + math.log(beta)], dim=0), dim=0
             )
+            student_kl = F.kl_div(log_mean_probs, student_log_probs, reduction="none", log_target=True)
+            teacher_kl = F.kl_div(log_mean_probs, teacher_log_probs, reduction="none", log_target=True)
+            jsd_loss = beta * teacher_kl + (1 - beta) * student_kl
 
-            student_kl = F.kl_div(log_mean_probs, student_log_probs, reduction="batchmean", log_target=True)
-            teacher_kl = F.kl_div(log_mean_probs, teacher_log_probs, reduction="batchmean", log_target=True)
+        # Sum over vocab dimension
+        jsd_loss = jsd_loss.sum(dim=-1)
 
-            # JSD is the weighted average of the KL divergences
-            jsd_loss = beta * teacher_kl + (1 - beta) * student_kl
-        return jsd_loss
+        # Apply ignore_index mask
+        if target is not None:
+            mask = target != ignore_index
+            jsd_loss = jsd_loss * mask.float()
+            num_valid_tokens = mask.sum().clamp_min(1)
+            return jsd_loss.sum() / num_valid_tokens
+
+        return jsd_loss.sum()
 
 
 class TorchLMHeadJSD(torch.nn.Module):
@@ -182,6 +191,7 @@ def forward(self, student_input, teacher_input, target):
         (0.5, 1.0, 0.0, 0.2),
     ],
 )
+@pytest.mark.parametrize("ignore_index", [-100, 42])
 def test_correctness(
     B,
     T,
@@ -196,6 +206,7 @@ def test_correctness(
     weight_hard_loss,
     weight_soft_loss,
     beta,
+    ignore_index,
 ):
     torch_lm_head_jsd = TorchLMHeadJSD(
         H=H,
@@ -207,6 +218,7 @@ def test_correctness(
         weight_hard_loss=weight_hard_loss,
         weight_soft_loss=weight_soft_loss,
         beta=beta,
+        ignore_index=ignore_index,
     )
     liger_lm_head_jsd = LigerLMHeadJSD(
         H=H,
@@ -218,6 +230,7 @@ def test_correctness(
         weight_hard_loss=weight_hard_loss,
         weight_soft_loss=weight_soft_loss,
        beta=beta,
+        ignore_index=ignore_index,
     )
 
     torch_lm_head_jsd.student_lin.weight.data = liger_lm_head_jsd.student_lin.weight.data = torch.rand(
@@ -243,6 +256,11 @@ def test_correctness(
 
     target = torch.randint(0, V, (B * T,), device=device, dtype=torch.long)
 
+    # Assign some random number of elements as ignore_index
+    num_elements_to_assign = torch.randint(1, B * T // 2, (1,)).item()
+    indices_to_assign = torch.randperm(B * T)[:num_elements_to_assign]
+    target[indices_to_assign] = ignore_index
+
     loss1 = torch_lm_head_jsd(student_input1, teacher_input, target)
     loss2 = liger_lm_head_jsd(student_input2, teacher_input, target)
     assert_verbose_allclose(loss1, loss2, atol=atol, rtol=rtol)
```
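One edge case worth noting alongside these tests, shown as a hypothetical standalone illustration rather than the test code itself: when a chunk consists entirely of prompt/padding tokens, the valid-token count is zero, which is exactly where the `clamp_min(1)` guard matters:

```python
import torch

ignore_index = -100
target = torch.full((4,), ignore_index)   # every token ignored
per_token_loss = torch.zeros(4)           # masked losses are all zero here

mask = target != ignore_index
print(per_token_loss.sum() / mask.sum())               # tensor(nan): 0 / 0
print(per_token_loss.sum() / mask.sum().clamp_min(1))  # tensor(0.): guarded denominator
```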

test/utils.py

Lines changed: 3 additions & 1 deletion
```diff
@@ -1033,7 +1033,9 @@ def get_batch_loss_metrics(
         student_logits /= self.temperature
         teacher_logits /= self.temperature
 
-        soft_loss = self.distillation_loss(student_logits, teacher_logits, **loss_kwargs)
+        soft_loss = self.distillation_loss(
+            student_logits, teacher_logits, target=target, ignore_index=self.ignore_index, **loss_kwargs
+        )
         # full loss
         loss = self.weight_hard_loss * hard_loss + self.weight_soft_loss * soft_loss
         return loss
```
