
Commit 397ab30

[GRPO] add support for dapo loss (#939)
## Summary

Add support for the DAPO loss and make it the default loss type.

## Testing Done

- Hardware Type: <BLANK>
- [x] run `make test` to ensure correctness
- [x] run `make checkstyle` to ensure code style
- [ ] run `make test-convergence` to ensure convergence

---------

Co-authored-by: Vaibhav Jindal <[email protected]>
1 parent 8a93398 commit 397ab30
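
For context, a brief restatement of what the new default does (the notation below is assumed here, not taken from the PR text): DAPO's token-level objective sums the clipped per-token policy losses over every active completion token and divides by the total number of active tokens in the batch, rather than averaging per sequence first as GRPO does:

$$
\mathcal{L}_{\text{DAPO}} = -\frac{1}{\sum_i |o_i|} \sum_i \sum_{t=1}^{|o_i|} \min\Big(r_{i,t}\,\hat{A}_{i,t},\; \operatorname{clip}\big(r_{i,t},\, 1-\varepsilon_{\text{low}},\, 1+\varepsilon_{\text{high}}\big)\,\hat{A}_{i,t}\Big)
$$

In the distributed case the diff below all-reduces the active-token count and divides it by the world size, so each rank normalizes by its per-process share of the global count; with gradient averaging across ranks this recovers the global token mean.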

5 files changed: +312 -59 lines changed

src/liger_kernel/chunked_loss/fused_linear_ppo.py

21 additions & 5 deletions

@@ -32,7 +32,7 @@ def forward(
         epsilon_low=0.2,
         epsilon_high=0.2,
         beta=0.04,
-        loss_type="bnpo",
+        loss_type="dapo",
         max_completion_length=None,
         importance_sampling_level="token",
         temperature=1.0,
@@ -60,7 +60,7 @@ def forward(
             epsilon_low: Lower bound for clipping the importance sampling ratio
             epsilon_high: Upper bound for clipping the importance sampling ratio
             beta: Weight for the KL penalty
-            loss_type: Type of loss calculation ("grpo", "bnpo", "dr_grpo")
+            loss_type: Type of loss calculation ("grpo", "bnpo", "dr_grpo", "dapo")
             max_completion_length: Maximum completion length required for "dr_grpo"
             temperature: Temperature for the logits
             compiled: Whether to use torch compile
@@ -244,6 +244,21 @@ def accumulate_chunk(

             return loss_acc, tuple(final_metrics)

+    @staticmethod
+    def _compute_dapo_normalizer(attention_mask):
+        """Global active tokens averaged per process."""
+        normalizer = attention_mask.to(torch.float32).sum()
+        world_size = 1
+        if torch.distributed.is_available() and torch.distributed.is_initialized():
+            import torch.distributed as dist
+
+            normalizer = normalizer.clone()
+            dist.all_reduce(normalizer, op=dist.ReduceOp.SUM)
+            world_size = dist.get_world_size()
+
+        normalizer = normalizer / world_size
+        return torch.clamp(normalizer, min=1.0)
+
     @staticmethod
     def _compute_chunk_loss(
         input_chunk,
@@ -261,7 +276,7 @@ def _compute_chunk_loss(
         epsilon_low=0.2,
         epsilon_high=0.2,
         beta=0.04,
-        loss_type="bnpo",
+        loss_type="dapo",
         max_completion_length=None,
         importance_sampling_level="token",
         temperature=1.0,
@@ -341,10 +356,11 @@ def backward(ctx, grad_output, *grad_metrics):
             None,  # grad_epsilon_low
             None,  # grad_epsilon_high
             None,  # grad_beta
+            None,  # grad_loss_type
+            None,  # grad_max_completion_length
+            None,  # grad_importance_sampling_level
             None,  # grad_temperature
             None,  # grad_compiled
             None,  # grad_use_ref_model
             None,  # grad_chunk_size
-            None,  # grad_loss_type
-            None,  # grad_max_completion_length
         )
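
A minimal sketch of how the new normalizer behaves outside a distributed run (calling the static helper directly, purely for illustration): with no process group initialized it reduces to the local count of active tokens, clamped to at least 1.

```python
import torch

from liger_kernel.chunked_loss.fused_linear_ppo import LigerFusedLinearPPOBase

# Two sequences with 3 and 2 active completion tokens -> 5 active tokens total.
attention_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])

# No torch.distributed process group is initialized here, so the all-reduce
# branch is skipped and the normalizer is simply the local token count.
print(LigerFusedLinearPPOBase._compute_dapo_normalizer(attention_mask))  # tensor(5.)
```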

src/liger_kernel/chunked_loss/grpo_loss.py

8 additions & 5 deletions

@@ -29,7 +29,7 @@ def ppo_loss_fn(
     epsilon_low=0.2,
     epsilon_high=0.2,
     beta=0.04,
-    loss_type="bnpo",  # ["grpo", "bnpo", "dr_grpo"]
+    loss_type="dapo",  # ["grpo", "bnpo", "dr_grpo", "dapo"]
     max_completion_length=None,  # Required for dr_grpo
     importance_sampling_level="token",  # ["token", "sequence"] - new parameter for GSPO
     **kwargs,
@@ -94,6 +94,9 @@ def ppo_loss_fn(
         if max_completion_length is None:
             raise ValueError("max_completion_length must be provided for loss_type 'dr_grpo'")
         loss = (per_token_loss * attention_mask).sum() / (full_attention_mask.shape[0] * max_completion_length)
+    elif loss_type == "dapo":
+        loss_normalizer = LigerFusedLinearPPOBase._compute_dapo_normalizer(full_attention_mask)
+        loss = (per_token_loss * attention_mask).sum() / loss_normalizer
     else:
         raise ValueError(f"Unknown loss type: {loss_type}")

@@ -135,7 +138,7 @@ def forward(
         beta=0.04,
         epsilon_low=0.2,
         epsilon_high=0.2,
-        loss_type="bnpo",
+        loss_type="dapo",
         max_completion_length=None,
         importance_sampling_level="token",
         temperature=1.0,
@@ -157,7 +160,7 @@ def forward(
             ref_weight (torch.Tensor, optional): Reference model weight tensor. Shape: (vocab_size, hidden_size)
             ref_bias (torch.Tensor, optional): Reference model bias tensor. Shape: (vocab_size,)
             beta (float): Weight for the KL penalty
-            loss_type (str): Type of loss calculation ("grpo", "bnpo", "dr_grpo"). Defaults to "bnpo".
+            loss_type (str): Type of loss calculation ("grpo", "bnpo", "dr_grpo", "dapo"). Defaults to "dapo".
             max_completion_length (int, optional): Maximum completion length, required for "dr_grpo". Defaults to None.
             importance_sampling_level (str): Level of importance sampling ("token" or "sequence"). Defaults to "token".
             temperature (float): Temperature for the logits
@@ -235,7 +238,7 @@ def __init__(
         chunk_size: int = 1,
         epsilon_low: float = 0.2,
         epsilon_high: float = 0.2,
-        loss_type: str = "bnpo",
+        loss_type: str = "dapo",
         max_completion_length: Optional[int] = None,
         importance_sampling_level: str = "token",
         temperature: float = 1.0,
@@ -248,7 +251,7 @@ def __init__(
             chunk_size (int): Size of chunks for processing.
             epsilon_low (float): Lower bound for the importance sampling ratio.
             epsilon_high (float): Upper bound for the importance sampling ratio.
-            loss_type (str): Type of loss calculation ("grpo", "bnpo", "dr_grpo"). Defaults to "bnpo".
+            loss_type (str): Type of loss calculation ("grpo", "bnpo", "dr_grpo", "dapo"). Defaults to "dapo".
             max_completion_length (int, optional): Maximum completion length, required for "dr_grpo". Defaults to None.
             importance_sampling_level (str): Level of importance sampling ("token" or "sequence"). Defaults to "token".
             temperature (float): Temperature for the logits.
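
To see why the new branch differs from the per-sequence GRPO average, here is a small numeric sketch using plain tensor ops (separate from the fused implementation; note that in the fused path the DAPO normalizer is computed from `full_attention_mask`, i.e. the whole batch and, when distributed, all processes, not just the current chunk):

```python
import torch

per_token_loss = torch.tensor([[0.5, 0.2, 0.1, 0.0], [0.3, 0.4, 0.0, 0.0]])
mask = torch.tensor([[1.0, 1.0, 1.0, 0.0], [1.0, 1.0, 0.0, 0.0]])

# "grpo": average within each sequence, then average the sequence means.
grpo = ((per_token_loss * mask).sum(-1) / mask.sum(-1).clamp(min=1.0)).mean()

# "dapo" (single process): one global mean over all active tokens.
dapo = (per_token_loss * mask).sum() / mask.sum().clamp(min=1.0)

print(grpo)  # tensor(0.3083) -- short sequences get more weight per token
print(dapo)  # tensor(0.3000) -- every active token weighted equally
```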

src/liger_kernel/ops/grpo_loss.py

3 additions & 1 deletion

@@ -128,7 +128,9 @@ def _grpo_loss_fwd_kernel(
     per_token_loss1 = coef_1 * advantage
     per_token_loss2 = coef_2 * advantage
     per_token_loss = -tl.minimum(per_token_loss1, per_token_loss2)
-    is_clipped = per_token_loss1 < per_token_loss2
+    is_low_clipped = (coef_1 < 1 - EPS_LOW) & (advantage < 0)
+    is_high_clipped = (coef_1 > 1 + EPS_HIGH) & (advantage > 0)
+    is_clipped = is_low_clipped | is_high_clipped

     if BETA != 0.0:
         REF_LOGP += off_b * L + off_l
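
A plain-PyTorch restatement of the corrected indicator (a sketch of the same logic, not the Triton kernel): a token now counts as clipped only when the ratio has crossed the relevant bound in the direction where the clip actually binds, rather than whenever `per_token_loss1 < per_token_loss2`.

```python
import torch

def grpo_clip_indicator(coef_1, advantage, eps_low=0.2, eps_high=0.2):
    # Clipping binds below 1 - eps_low only for negative advantages and
    # above 1 + eps_high only for positive advantages, mirroring the kernel.
    is_low_clipped = (coef_1 < 1 - eps_low) & (advantage < 0)
    is_high_clipped = (coef_1 > 1 + eps_high) & (advantage > 0)
    return is_low_clipped | is_high_clipped

ratio = torch.tensor([0.5, 0.9, 1.5])
advantage = torch.tensor([-1.0, 1.0, 1.0])
print(grpo_clip_indicator(ratio, advantage))  # tensor([ True, False,  True])
```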

src/liger_kernel/transformers/grpo_loss.py

56 additions & 1 deletion

@@ -1,3 +1,6 @@
+import torch
+
+from liger_kernel.chunked_loss.fused_linear_ppo import LigerFusedLinearPPOBase
 from liger_kernel.ops.grpo_loss import GrpoLossFunction


@@ -13,12 +16,20 @@ def triton_grpo_loss(
     eps_low=0.2,
     eps_high=0.4,
     inplace=True,
+    loss_type="dapo",
+    max_completion_length=None,
+    importance_sampling_level="token",
+    reduce=False,
 ):
     assert logits is not None and completion_ids is not None and advantages is not None, (
         "must provide logits、completion_ids and advantages"
     )
+    if importance_sampling_level != "token":
+        raise ValueError(
+            f"Triton GRPO loss only supports token-level importance sampling. Got {importance_sampling_level}."
+        )

-    return GrpoLossFunction.apply(
+    per_token_loss, per_token_kl, is_clipped = GrpoLossFunction.apply(
         logits,
         old_logp,
         ref_logp,
@@ -31,6 +42,50 @@ def triton_grpo_loss(
         eps_high,
         inplace,
     )
+    if not reduce:
+        return per_token_loss, per_token_kl, is_clipped
+
+    loss = _reduce_grpo_loss(
+        per_token_loss,
+        completion_mask,
+        loss_type=loss_type,
+        max_completion_length=max_completion_length,
+    )
+
+    metrics = []
+    if beta != 0.0 and per_token_kl is not None:
+        metrics.append(_masked_mean(per_token_kl, completion_mask))
+    metrics.append(_masked_mean(is_clipped.float(), completion_mask))
+    return loss, metrics
+
+
+def _reduce_grpo_loss(per_token_loss, completion_mask, loss_type, max_completion_length):
+    mask = completion_mask
+    if mask is None:
+        mask = torch.ones_like(per_token_loss, dtype=per_token_loss.dtype, device=per_token_loss.device)
+    mask = mask.to(per_token_loss.dtype)
+
+    if loss_type == "grpo":
+        per_seq = (per_token_loss * mask).sum(-1) / mask.sum(-1).clamp(min=1.0)
+        return per_seq.mean()
+    if loss_type == "bnpo":
+        return (per_token_loss * mask).sum() / mask.sum().clamp(min=1.0)
+    if loss_type == "dr_grpo":
+        if max_completion_length is None:
+            raise ValueError("max_completion_length must be provided when using loss_type='dr_grpo'")
+        batch = per_token_loss.shape[0]
+        return (per_token_loss * mask).sum() / (batch * max_completion_length)
+    if loss_type == "dapo":
+        normalizer = LigerFusedLinearPPOBase._compute_dapo_normalizer(mask)
+        return (per_token_loss * mask).sum() / normalizer
+    raise ValueError(f"Unsupported loss_type '{loss_type}' for Triton GRPO loss.")
+
+
+def _masked_mean(values, mask):
+    if mask is None:
+        mask = torch.ones_like(values, dtype=values.dtype, device=values.device)
+    mask = mask.to(values.dtype)
+    return (values * mask).sum() / mask.sum().clamp(min=1.0)


 # This is a demo how to use grpo_loss in GRPOTrainer. The Trl version must be 0.16
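
Since the new reduction helper is an ordinary module-level function, its behavior can be checked on toy tensors without running the Triton kernel; a minimal sketch (the helper name is exactly the one added above, the tensor values are purely illustrative):

```python
import torch

from liger_kernel.transformers.grpo_loss import _reduce_grpo_loss

per_token_loss = torch.tensor([[1.0, 2.0, 3.0], [4.0, 0.0, 0.0]])
completion_mask = torch.tensor([[1, 1, 1], [1, 0, 0]])

# DAPO: sum of masked losses over the (single-process) global token count.
print(_reduce_grpo_loss(per_token_loss, completion_mask, loss_type="dapo", max_completion_length=None))  # tensor(2.5000)

# Dr. GRPO: normalize by batch_size * max_completion_length instead.
print(_reduce_grpo_loss(per_token_loss, completion_mask, loss_type="dr_grpo", max_completion_length=3))  # tensor(1.6667)
```

With `reduce=False` (the default) `triton_grpo_loss` keeps its previous contract and returns `(per_token_loss, per_token_kl, is_clipped)`; with `reduce=True` it returns the scalar loss plus the masked KL and clip-ratio metrics, as shown in the hunk above.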
