Commit 606ca4e

Option to return hard and soft loss when using distillation (#895)
## Summary

Proposal to add a `return_soft_hard_loss` parameter to enable logging the soft and hard losses separately. Useful for monitoring and analysis during training.

## Testing Done

- [x] test_jsd_loss.py
- [x] test_cosine_loss.py

---------

Co-authored-by: Shao Tang <tangshao28@gmail.com>
1 parent d5648bf commit 606ca4e
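A minimal usage sketch of the new flag (the `LigerFusedLinearJSDLoss` import below, the argument order, and the tensor shapes are assumptions for illustration; only the `return_soft_hard_loss` flag and the tuple return come from this commit):

```python
import torch

from liger_kernel.chunked_loss.jsd_loss import LigerFusedLinearJSDLoss  # assumed export name

# Illustrative shapes: (tokens, hidden) activations and (vocab, hidden) projection weights.
student_input = torch.randn(8, 64, requires_grad=True)
teacher_input = torch.randn(8, 64)
student_weight = torch.randn(128, 64, requires_grad=True)
teacher_weight = torch.randn(128, 64)
true_labels = torch.randint(0, 128, (8,))

loss_fn = LigerFusedLinearJSDLoss(return_soft_hard_loss=True)

# With the flag enabled, forward returns (combined_loss, soft_loss, hard_loss).
loss, soft_loss, hard_loss = loss_fn(
    student_input, student_weight, teacher_input, teacher_weight, true_labels
)
loss.backward()  # optimization still uses the combined loss
print(f"soft={soft_loss.item():.4f} hard={hard_loss.item():.4f}")  # log for monitoring
```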

File tree

3 files changed: +44 -11 lines changed

src/liger_kernel/chunked_loss/cosine_similarity_loss.py

Lines changed: 13 additions & 4 deletions
@@ -1,3 +1,6 @@
+from typing import Tuple
+from typing import Union
+
 import torch
 import torch.nn.functional as F
 
@@ -41,7 +44,8 @@ def forward(
         temperature: float = 1.0,
         compiled: bool = True,
         chunk_size: int = 1024,
-    ):
+        return_soft_hard_loss: bool = False,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]:
         return super().forward(
             cls=cls,
             ctx=ctx,
@@ -59,11 +63,12 @@ def forward(
             ignore_index=ignore_index,
             temperature=temperature,
             compiled=compiled,
+            return_soft_hard_loss=return_soft_hard_loss,
         )
 
     @staticmethod
-    def backward(ctx, grad_output):
-        grads = LigerFusedLinearDistillationBase.backward(ctx, grad_output)[:6]
+    def backward(ctx, grad_output, *args):
+        grads = LigerFusedLinearDistillationBase.backward(ctx, grad_output, *args)[:6]
 
         return (
             *grads,
@@ -75,6 +80,7 @@ def backward(ctx, grad_output):
             None,  # temperature
             None,  # compiled
             None,  # chunk_size
+            None,  # return_soft_hard_loss
         )
 
 
@@ -88,6 +94,7 @@ def __init__(
         temperature: float = 1.0,
         compiled: bool = True,
         chunk_size: int = 1024,
+        return_soft_hard_loss: bool = False,
     ):
         super().__init__()
         assert temperature != 0, "Temperature cannot be 0."
@@ -98,6 +105,7 @@ def __init__(
         self.compiled = compiled
         self.beta = beta
         self.chunk_size = chunk_size
+        self.return_soft_hard_loss = return_soft_hard_loss
 
     def forward(
         self,
@@ -108,7 +116,7 @@ def forward(
         true_labels: torch.LongTensor,
         student_bias: torch.Tensor = None,
         teacher_bias: torch.Tensor = None,
-    ) -> torch.Tensor:
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]:
         return LigerFusedLinearCosineSimilarityFunction.apply(
             student_input,
             student_weight,
@@ -124,4 +132,5 @@ def forward(
             self.temperature,
             self.compiled,
             self.chunk_size,
+            self.return_soft_hard_loss,
         )
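Mirroring the items under Testing Done, a sketch of how the two return modes of the cosine-similarity module could be exercised (the `LigerFusedLinearCosineSimilarityLoss` name, argument order, and shapes are assumptions; only the flag itself comes from this commit):

```python
import torch

from liger_kernel.chunked_loss.cosine_similarity_loss import (
    LigerFusedLinearCosineSimilarityLoss,  # assumed export name
)


def test_return_soft_hard_loss():
    # Illustrative shapes: (tokens, hidden) activations projected to a small vocab.
    student_input = torch.randn(4, 32, requires_grad=True)
    teacher_input = torch.randn(4, 32)
    student_weight = torch.randn(16, 32, requires_grad=True)
    teacher_weight = torch.randn(16, 32)
    true_labels = torch.randint(0, 16, (4,))

    # Default behaviour is unchanged: a single scalar loss.
    loss = LigerFusedLinearCosineSimilarityLoss()(
        student_input, student_weight, teacher_input, teacher_weight, true_labels
    )
    assert isinstance(loss, torch.Tensor) and loss.dim() == 0

    # With the flag: a (combined, soft, hard) tuple of scalar tensors.
    out = LigerFusedLinearCosineSimilarityLoss(return_soft_hard_loss=True)(
        student_input, student_weight, teacher_input, teacher_weight, true_labels
    )
    assert len(out) == 3 and all(t.dim() == 0 for t in out)
```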

src/liger_kernel/chunked_loss/fused_linear_distillation.py

Lines changed: 13 additions & 2 deletions
@@ -1,5 +1,7 @@
 from abc import abstractmethod
 from functools import partial
+from typing import Tuple
+from typing import Union
 
 import torch
 
@@ -157,8 +159,9 @@ def forward(
         compute_ce_loss=True,
         temperature=1.0,
         compiled=True,
+        return_soft_hard_loss=False,
         **loss_kwargs,
-    ):
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]:
         """
         Base class for fused linear layer with distillation loss.
         Only need to compute gradients for student model.
@@ -180,13 +183,16 @@ def forward(
             compute_ce_loss (bool): Whether to compute CE loss.
             temperature (float): Temperature to control the input probability distribution. Default: `1.0` (i.e. no scale)
             compiled (bool): Whether to use torch compile for chunk accumulation.
+            return_soft_hard_loss (bool): Whether to return soft and hard losses separately. Default: False.
             loss_kwargs (dict): Other possible arguments that a loss function might need
         """
         CHUNK_SIZE = chunk_size
         grad_weight = torch.zeros_like(student_weight)
         grad_inputs = []
         grad_bias = torch.zeros_like(student_bias) if student_bias is not None else None
         loss_acc = torch.zeros((), device=student_input.device)
+        soft_loss_acc = torch.zeros((), device=student_input.device) if return_soft_hard_loss else None
+        hard_loss_acc = torch.zeros((), device=student_input.device) if return_soft_hard_loss else None
 
         loss_func_to_call = partial(
             LigerFusedLinearDistillationBase._compute_loss,
@@ -247,6 +253,9 @@ def accumulate_chunk(student_input_chunk, teacher_input_chunk, target_chunk):
             )
             grad_weight.add_(chunk_grad_weight)
             loss_acc.add_(chunk_loss)
+            if return_soft_hard_loss:
+                soft_loss_acc.add_(chunk_soft_loss)
+                hard_loss_acc.add_(chunk_hard_loss)
             return chunk_grad_input
 
         if compiled:
@@ -268,10 +277,12 @@ def accumulate_chunk(student_input_chunk, teacher_input_chunk, target_chunk):
             grad_weight,
             grad_bias,
         )
+        if return_soft_hard_loss:
+            return loss_acc, soft_loss_acc, hard_loss_acc
         return loss_acc
 
     @staticmethod
-    def backward(ctx, grad_output):
+    def backward(ctx, grad_output, *args):
         grad_input, grad_weight, grad_bias = ctx.saved_tensors
         if torch.ne(grad_output, torch.tensor(1.0, device=grad_output.device)):
             grad_input = grad_input * grad_output
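The `*args` added to `backward` follows from the `torch.autograd.Function` contract: backward receives one incoming gradient per forward output, so once forward can return three tensors the two extra gradients must be accepted, and they can be ignored because the soft and hard values are logging-only. A toy Function (unrelated to the Liger code) illustrating that contract:

```python
import torch


class MultiOutputLoss(torch.autograd.Function):
    """Toy Function: a main loss plus two auxiliary outputs used only for logging."""

    @staticmethod
    def forward(ctx, x):
        ctx.save_for_backward(x)
        main = (x**2).sum()
        aux_a = x.sum()   # logging-only output
        aux_b = x.mean()  # logging-only output
        return main, aux_a, aux_b

    @staticmethod
    def backward(ctx, grad_main, *grad_aux):  # one incoming grad per forward output
        (x,) = ctx.saved_tensors
        # Only the main loss drives the gradient; grads for the aux outputs are dropped.
        return 2 * x * grad_main


x = torch.randn(5, requires_grad=True)
main, aux_a, aux_b = MultiOutputLoss.apply(x)
main.backward()  # the extra gradients arrive in *grad_aux and are simply ignored
```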

src/liger_kernel/chunked_loss/jsd_loss.py

Lines changed: 18 additions & 5 deletions
@@ -1,5 +1,8 @@
 import math
 
+from typing import Tuple
+from typing import Union
+
 import torch
 import torch.nn.functional as F
 
@@ -56,6 +59,7 @@ def forward(
         temperature: float = 1.0,
         compiled: bool = True,
         chunk_size: int = 1024,
+        return_soft_hard_loss: bool = False,
     ):
         """
         Fused linear layer with JSD distillation loss.
@@ -72,8 +76,9 @@ def forward(
             temperature (float): Temperature for softening/sharpening distributions
             compiled (bool): Whether to use torch compile
             chunk_size (int): Size of chunks for processing.
+            return_soft_hard_loss (bool): Whether to return soft and hard losses separately. Default: False.
         Returns:
-            torch.Tensor: Computed loss
+            torch.Tensor: Computed loss, or tuple (loss, soft_loss, hard_loss) if return_soft_hard_loss=True
         """
         return super().forward(
             cls=cls,
@@ -92,11 +97,12 @@ def forward(
             ignore_index=ignore_index,
             temperature=temperature,
             compiled=compiled,
+            return_soft_hard_loss=return_soft_hard_loss,
         )
 
     @staticmethod
-    def backward(ctx, grad_output):
-        grads = LigerFusedLinearDistillationBase.backward(ctx, grad_output)[:6]
+    def backward(ctx, grad_output, *args):
+        grads = LigerFusedLinearDistillationBase.backward(ctx, grad_output, *args)[:6]
 
         return (
             *grads,
@@ -108,6 +114,7 @@ def backward(ctx, grad_output):
             None,  # temperature
             None,  # compiled
             None,  # chunk_size
+            None,  # return_soft_hard_loss
         )
 
 
@@ -125,6 +132,7 @@ def __init__(
         temperature: float = 1.0,
         compiled: bool = True,
         chunk_size: int = 1024,
+        return_soft_hard_loss: bool = False,
     ):
         """
         Args:
@@ -135,6 +143,7 @@ def __init__(
             compiled (bool): Whether to use torch compile
             beta (float): Coefficient beta of generalized JSD in the interval [0, 1]. Default: `0.5`.
             chunk_size (int): Size of chunks for processing.
+            return_soft_hard_loss (bool): Whether to return soft and hard losses separately. Default: False.
         """
         super().__init__()
         assert temperature != 0, "Temperature cannot be 0."
@@ -145,6 +154,7 @@ def __init__(
         self.compiled = compiled
         self.beta = beta
         self.chunk_size = chunk_size
+        self.return_soft_hard_loss = return_soft_hard_loss
 
     def forward(
         self,
@@ -155,7 +165,7 @@ def forward(
         true_labels: torch.LongTensor,
         student_bias: torch.Tensor = None,
         teacher_bias: torch.Tensor = None,
-    ) -> torch.Tensor:
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]:
         """
         Compute the JSD distillation loss.
 
@@ -167,7 +177,9 @@ def forward(
             true_labels (torch.LongTensor): Target labels tensor
 
         Returns:
-            torch.Tensor: Computed loss
+            torch.Tensor or Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+                If return_soft_hard_loss is False: Computed combined loss
+                If return_soft_hard_loss is True: Tuple of (combined_loss, soft_loss, hard_loss)
         """
         return LigerFusedLinearJSDFunction.apply(
             student_input,
@@ -184,4 +196,5 @@ def forward(
             self.temperature,
             self.compiled,
             self.chunk_size,
+            self.return_soft_hard_loss,
         )
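Because the return type now depends on the flag, shared training code can branch on the result; a small loss-module-agnostic sketch (the helper name is hypothetical):

```python
def distillation_step(loss_fn, student_input, student_weight, teacher_input, teacher_weight, labels):
    """Hypothetical helper that works whether or not return_soft_hard_loss is enabled on loss_fn."""
    out = loss_fn(student_input, student_weight, teacher_input, teacher_weight, labels)
    if isinstance(out, tuple):  # return_soft_hard_loss=True -> (combined, soft, hard)
        loss, soft_loss, hard_loss = out
    else:  # default: a single combined loss tensor, unchanged behaviour
        loss, soft_loss, hard_loss = out, None, None
    loss.backward()
    return loss, soft_loss, hard_loss
```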
