
Commit a089cd5

Fix nan loss error for LigerFusedLinearJSDLoss (#862)
## Summary

Fixes #769. As described in the issue, I have updated the code to fix the nan error. Could you please review? cc: @shimizust

## Details

- [x] run `make test` to ensure correctness
- [x] run `make checkstyle` to ensure code style
- [ ] run `make test-convergence` to ensure convergence
Parent: 62a9054
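For context, here is a minimal sketch (not from the repo) of the failure mode the commit fixes: when a token's probability underflows to 0.0 in fp32 under both distributions, the old probability-space mixture becomes exactly zero, its log is `-inf`, and the downstream KL terms turn that into nan via `0 * inf`. The values below are illustrative and not normalized distributions.

```python
import math

import torch

beta = 0.5
# Log-probabilities this negative underflow to 0.0 under .exp() in fp32.
student_log_probs = torch.tensor([-1000.0, -0.5])
teacher_log_probs = torch.tensor([-1200.0, -0.3])

# Old formulation: exp() underflows, the mixture probability is exactly
# 0.0, and log(0.0) = -inf, which later becomes nan in the KL terms.
mean_probs = (1 - beta) * student_log_probs.exp() + beta * teacher_log_probs.exp()
print(mean_probs.log())  # tensor([   -inf, -0.3950])

# New formulation: the mixture is computed entirely in log-space,
# so no intermediate probability ever underflows.
log_mean_probs = torch.logsumexp(
    torch.stack([student_log_probs + math.log(1 - beta), teacher_log_probs + math.log(beta)], dim=0), dim=0
)
print(log_mean_probs)  # tensor([-1000.6931, -0.3950]) -- finite
```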

File tree

2 files changed (+10, −4 lines)

src/liger_kernel/chunked_loss/jsd_loss.py

Lines changed: 5 additions & 2 deletions
@@ -1,3 +1,5 @@
+import math
+
 import torch
 import torch.nn.functional as F
 
@@ -25,8+27,9 @@ def distillation_loss_fn(student_logits, teacher_logits, beta=0.5):
             jsd_loss = F.kl_div(teacher_log_probs, student_log_probs, reduction="sum", log_target=True)
         else:
             # Compute probabilities (only required for mean calculation)
-            mean_probs = (1 - beta) * student_log_probs.exp() + beta * teacher_log_probs.exp()
-            log_mean_probs = mean_probs.log()
+            log_mean_probs = torch.logsumexp(
+                torch.stack([student_log_probs + math.log(1 - beta), teacher_log_probs + math.log(beta)], dim=0), dim=0
+            )
 
             student_kl = F.kl_div(log_mean_probs, student_log_probs, reduction="sum", log_target=True)
             teacher_kl = F.kl_div(log_mean_probs, teacher_log_probs, reduction="sum", log_target=True)
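The rewrite relies on the identity log((1 − beta)·e^a + beta·e^b) = logsumexp(a + log(1 − beta), b + log(beta)), so the mixture never leaves log-space. A quick sketch checking that the loss and its gradients stay finite for extreme logits (the beta-weighting of the two KL terms below follows the generalized-JSD definition and is my assumption, not copied from the repo):

```python
import math

import torch
import torch.nn.functional as F

beta = 0.5
# Extreme logits: softmax puts ~all mass on one class; log-probs near -400.
student_logits = torch.tensor([[200.0, -200.0]], requires_grad=True)
teacher_logits = torch.tensor([[-200.0, 200.0]])

student_log_probs = F.log_softmax(student_logits, dim=-1)
teacher_log_probs = F.log_softmax(teacher_logits, dim=-1)

log_mean_probs = torch.logsumexp(
    torch.stack([student_log_probs + math.log(1 - beta), teacher_log_probs + math.log(beta)], dim=0), dim=0
)
student_kl = F.kl_div(log_mean_probs, student_log_probs, reduction="sum", log_target=True)
teacher_kl = F.kl_div(log_mean_probs, teacher_log_probs, reduction="sum", log_target=True)
# Generalized-JSD weighting (assumed here, not taken from the repo).
loss = beta * teacher_kl + (1 - beta) * student_kl
loss.backward()
print(loss)  # ~ln(2), the JSD maximum for disjoint distributions
print(torch.isfinite(student_logits.grad).all())  # tensor(True)
```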

test/chunked_loss/test_jsd_loss.py

Lines changed: 5 additions & 2 deletions
@@ -1,3 +1,5 @@
+import math
+
 import pytest
 import torch
 import torch.nn.functional as F
@@ -54,8 +56,9 @@ def distillation_loss(self, student_logits, teacher_logits, beta=0.5):
             jsd_loss = F.kl_div(teacher_log_probs, student_log_probs, reduction="none", log_target=True)
         else:
             # Compute probabilities (only required for mean calculation)
-            mean_probs = (1 - beta) * student_log_probs.exp() + beta * teacher_log_probs.exp()
-            log_mean_probs = mean_probs.log()
+            log_mean_probs = torch.logsumexp(
+                torch.stack([student_log_probs + math.log(1 - beta), teacher_log_probs + math.log(beta)], dim=0), dim=0
+            )
 
             student_kl = F.kl_div(log_mean_probs, student_log_probs, reduction="batchmean", log_target=True)
             teacher_kl = F.kl_div(log_mean_probs, teacher_log_probs, reduction="batchmean", log_target=True)
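For completeness, a standalone regression-style check one could run (illustrative only; not part of the commit's test changes) showing the new formulation stays finite exactly where the old one produced nan, namely a wide vocabulary where many classes underflow in both distributions:

```python
import math

import torch
import torch.nn.functional as F

torch.manual_seed(0)
beta = 0.5
# Large-magnitude logits over a wide vocab: most classes underflow to
# probability 0.0 in BOTH distributions -- the case that produced nan.
student_log_probs = F.log_softmax(torch.randn(4, 4096) * 100, dim=-1)
teacher_log_probs = F.log_softmax(torch.randn(4, 4096) * 100, dim=-1)

# New formulation: finite.
log_mean_probs = torch.logsumexp(
    torch.stack([student_log_probs + math.log(1 - beta), teacher_log_probs + math.log(beta)], dim=0), dim=0
)
student_kl = F.kl_div(log_mean_probs, student_log_probs, reduction="batchmean", log_target=True)
teacher_kl = F.kl_div(log_mean_probs, teacher_log_probs, reduction="batchmean", log_target=True)
assert torch.isfinite(student_kl) and torch.isfinite(teacher_kl)

# Old formulation: nan on the same inputs (log(0) = -inf, then 0 * inf).
mean_probs = (1 - beta) * student_log_probs.exp() + beta * teacher_log_probs.exp()
old_kl = F.kl_div(mean_probs.log(), student_log_probs, reduction="batchmean", log_target=True)
assert old_kl.isnan()
```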
