
Commit cc14537

yukiu00 and Tcc0403 authored
Add CISPO and SAPO loss type support for Triton GRPO loss kernel (#1074)
## Summary

Add **CISPO** and **SAPO** loss type support to the Triton `ops/grpo_loss.py` kernel. This is a follow-up to:

- #1054 - Add CISPO loss type support for LigerFusedLinearGRPOLoss (chunked loss path)
- #1073 - Add SAPO loss type support for LigerFusedLinearGRPOLoss (chunked loss path)

> **Note**: This PR depends on #1073 (the SAPO PR) being merged first, as it builds on top of that branch.

### Background

PRs #1054 and #1073 added CISPO and SAPO support to the `chunked_loss` path; the `ops` (Triton kernel) path was marked as a follow-up. This PR implements that follow-up.

### Changes

**`src/liger_kernel/ops/grpo_loss.py`**

- Add loss type constants (`_LOSS_TYPE_GRPO`, `_LOSS_TYPE_CISPO`, `_LOSS_TYPE_SAPO`) with `tl.constexpr` for compile-time branching
- Implement CISPO in the forward/backward kernels:
  - Upper-bound-only clipping (no lower bound)
  - Detached coefficient (gradient flows only through `logp`)
  - Loss formula: `-coef_2 * advantage * logp`
- Implement SAPO in the forward/backward kernels:
  - Sigmoid-based soft gating instead of hard clipping
  - Separate temperatures for positive and negative advantages
  - Loss formula: `-sigmoid(τ*(ρ-1)) * 4/τ * advantage`
- Update `GrpoLossFunction` with `loss_type`, `sapo_temperature_pos`, and `sapo_temperature_neg` parameters

**`src/liger_kernel/transformers/grpo_loss.py`**

- Remove the errors that previously blocked the CISPO and SAPO loss types
- Add `sapo_temperature_pos` and `sapo_temperature_neg` parameters
- Update `_reduce_grpo_loss` to handle CISPO (DAPO normalization) and SAPO (GRPO normalization)

**`test/transformers/test_grpo_loss.py`**

- Add reference PyTorch implementations (`torch_cispo_loss`, `torch_sapo_loss`)
- Add `test_cispo_loss` and `test_sapo_loss` test functions

## Testing Done

- Hardware Type: NVIDIA GPU
- [x] run `make test` to ensure correctness
- [x] run `make checkstyle` to ensure code style
- [ ] run `make test-convergence` to ensure convergence

---------

Signed-off-by: Tcc0403 <76503978+Tcc0403@users.noreply.github.com>
Co-authored-by: Tcc0403 <76503978+Tcc0403@users.noreply.github.com>
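For reference, a minimal PyTorch sketch of the two new per-token losses as summarized above (illustrative only; the function names are hypothetical, and this mirrors the formulas rather than the fused Triton kernel):

```python
import torch

def cispo_per_token_loss(logp, old_logp, advantage, eps_high):
    # CISPO: upper-bound-only clipping; the clipped ratio is detached,
    # so the gradient flows only through logp.
    coef_1 = torch.exp(logp - old_logp)
    coef_2 = torch.clamp(coef_1, max=eps_high).detach()
    return -coef_2 * advantage * logp

def sapo_per_token_loss(logp, old_logp, advantage, temp_pos=1.0, temp_neg=1.05):
    # SAPO: sigmoid soft gate instead of hard clipping, with separate
    # temperatures for positive and negative advantages.
    coef_1 = torch.exp(logp - old_logp)
    tau = torch.where(
        advantage > 0,
        torch.full_like(advantage, temp_pos),
        torch.full_like(advantage, temp_neg),
    )
    gate = torch.sigmoid(tau * (coef_1 - 1.0)) * 4.0 / tau
    return -gate * advantage
```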
1 parent ad6f0a7 commit cc14537

File tree: 4 files changed, +401 −57 lines


src/liger_kernel/chunked_loss/grpo_loss.py

Lines changed: 5 additions & 0 deletions
```diff
@@ -339,6 +339,11 @@ def __init__(
             temperature (float): Temperature for the logits.
         """
         super().__init__()
+        # Validate SAPO temperatures to prevent division by zero or numerical instability
+        if sapo_temperature_pos <= 0:
+            raise ValueError(f"sapo_temperature_pos must be positive, got {sapo_temperature_pos}")
+        if sapo_temperature_neg <= 0:
+            raise ValueError(f"sapo_temperature_neg must be positive, got {sapo_temperature_neg}")
         self.beta = beta
         self.compiled = compiled
         self.use_ref_model = use_ref_model
```
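These checks fail fast at construction time. A hypothetical example (the import path is inferred from the file location, and the constructor keyword arguments are assumed to match those added in #1054/#1073):

```python
from liger_kernel.chunked_loss.grpo_loss import LigerFusedLinearGRPOLoss

# Raises ValueError: sapo_temperature_neg must be positive, got 0.0
loss_fn = LigerFusedLinearGRPOLoss(loss_type="sapo", sapo_temperature_neg=0.0)
```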

src/liger_kernel/ops/grpo_loss.py

Lines changed: 113 additions & 17 deletions
```diff
@@ -2,6 +2,21 @@
 import triton
 import triton.language as tl
 
+# Loss type constants for Triton constexpr branching
+# GRPO/DAPO/BNPO/DR_GRPO all use the same per-token loss computation (standard PPO clipping)
+_LOSS_TYPE_GRPO: tl.constexpr = tl.constexpr(0)
+_LOSS_TYPE_CISPO: tl.constexpr = tl.constexpr(1)
+_LOSS_TYPE_SAPO: tl.constexpr = tl.constexpr(2)
+
+_str_to_loss_type = {
+    "grpo": _LOSS_TYPE_GRPO.value,
+    "dapo": _LOSS_TYPE_GRPO.value,
+    "bnpo": _LOSS_TYPE_GRPO.value,
+    "dr_grpo": _LOSS_TYPE_GRPO.value,
+    "cispo": _LOSS_TYPE_CISPO.value,
+    "sapo": _LOSS_TYPE_SAPO.value,
+}
+
 
 @triton.jit
 def _selective_log_softmax_kernel(
@@ -83,6 +98,9 @@ def _grpo_loss_fwd_kernel(
     BETA: tl.constexpr,
     EPS_LOW,
     EPS_HIGH,
+    LOSS_TYPE: tl.constexpr,
+    SAPO_TEMP_POS,
+    SAPO_TEMP_NEG,
     L: tl.constexpr,
     N: tl.constexpr,
     BLOCK_N: tl.constexpr = 4096,
@@ -123,14 +141,33 @@ def _grpo_loss_fwd_kernel(
         OLD_LOGP += off_b * L + off_l
         old_logp = tl.load(OLD_LOGP).to(tl.float32)
     coef_1 = tl.exp(logp - old_logp)
-    coef_2 = tl.clamp(coef_1, 1 - EPS_LOW, 1 + EPS_HIGH)
     advantage = tl.load(ADVANTAGES).to(tl.float32)
-    per_token_loss1 = coef_1 * advantage
-    per_token_loss2 = coef_2 * advantage
-    per_token_loss = -tl.minimum(per_token_loss1, per_token_loss2)
-    is_low_clipped = (coef_1 < 1 - EPS_LOW) & (advantage < 0)
-    is_high_clipped = (coef_1 > 1 + EPS_HIGH) & (advantage > 0)
-    is_clipped = is_low_clipped | is_high_clipped
+
+    # Branch based on loss type
+    if LOSS_TYPE == 0:  # GRPO/DAPO/BNPO/DR_GRPO: standard PPO clipping
+        coef_2 = tl.clamp(coef_1, 1 - EPS_LOW, 1 + EPS_HIGH)
+        per_token_loss1 = coef_1 * advantage
+        per_token_loss2 = coef_2 * advantage
+        per_token_loss = -tl.minimum(per_token_loss1, per_token_loss2)
+        is_low_clipped = (coef_1 < 1 - EPS_LOW) & (advantage < 0)
+        is_high_clipped = (coef_1 > 1 + EPS_HIGH) & (advantage > 0)
+        is_clipped = is_low_clipped | is_high_clipped
+
+    elif LOSS_TYPE == 1:  # CISPO: upper-bound-only clipping, detached coefficient, multiply by logp
+        # Reference: MiniMax-M1 technical report
+        # https://github.com/huggingface/trl/blob/035c3ff151b953ca72cdfe0ee966bc1469a26fde/trl/trainer/grpo_trainer.py#L2030
+        coef_2 = tl.minimum(coef_1, EPS_HIGH)  # upper bound only (EPS_HIGH is the raw bound for CISPO)
+        per_token_loss = -coef_2 * advantage * logp  # includes logp term
+        is_clipped = (coef_1 > EPS_HIGH) & (advantage > 0)
+
+    elif LOSS_TYPE == 2:  # SAPO: soft adaptive policy optimization with sigmoid gating
+        # Reference: https://huggingface.co/papers/2511.20347
+        # Formula: sigmoid(τ * (ρ - 1)) * 4 / τ
+        temperature = tl.where(advantage > 0, SAPO_TEMP_POS, SAPO_TEMP_NEG)
+        sigmoid_input = temperature * (coef_1 - 1.0)
+        sapo_coef = tl.sigmoid(sigmoid_input) * 4.0 / temperature
+        per_token_loss = -sapo_coef * advantage
+        is_clipped = 0.0  # SAPO has no clipping concept
 
     if BETA != 0.0:
         REF_LOGP += off_b * L + off_l
@@ -165,6 +202,9 @@ def _grpo_loss_bwd_kernel(
     BETA: tl.constexpr,
     EPS_LOW,
     EPS_HIGH,
+    LOSS_TYPE: tl.constexpr,
+    SAPO_TEMP_POS,
+    SAPO_TEMP_NEG,
     loss_stride0,
     loss_stride1,
     L: tl.constexpr,
@@ -202,13 +242,35 @@ def _grpo_loss_bwd_kernel(
         OLD_LOGP += off_b * L + off_l
         old_logp = tl.load(OLD_LOGP).to(tl.float32)
     coef_1 = tl.exp(logp - old_logp)
-    coef_2 = tl.clamp(coef_1, 1 - EPS_LOW, 1 + EPS_HIGH)
     advantage = tl.load(ADVANTAGES).to(tl.float32)
-    per_token_loss1 = coef_1 * advantage
-    per_token_loss2 = coef_2 * advantage
-    mask = per_token_loss2 >= per_token_loss1
 
-    dlogp = -per_token_loss1 * mask
+    # Branch based on loss type for gradient computation
+    if LOSS_TYPE == 0:  # GRPO/DAPO/BNPO/DR_GRPO: standard PPO clipping
+        coef_2 = tl.clamp(coef_1, 1 - EPS_LOW, 1 + EPS_HIGH)
+        per_token_loss1 = coef_1 * advantage
+        per_token_loss2 = coef_2 * advantage
+        mask = per_token_loss2 >= per_token_loss1
+        dlogp = -per_token_loss1 * mask
+
+    elif LOSS_TYPE == 1:  # CISPO: coef_2 is DETACHED, so gradient only flows through logp
+        # loss = -coef_2 * advantage * logp, where coef_2 = clamp(coef_1, max=eps_high).detach()
+        # d(loss)/d(logp) = -coef_2 * advantage (coef_2 treated as constant due to detach)
+        coef_2 = tl.minimum(coef_1, EPS_HIGH)
+        dlogp = -coef_2 * advantage
+
+    elif LOSS_TYPE == 2:  # SAPO: gradient through sigmoid gating
+        # loss = -sapo_coef * advantage, where sapo_coef = sigmoid(τ*(ρ-1)) * 4/τ
+        # d(loss)/d(logp) = -advantage * d(sapo_coef)/d(coef_1) * d(coef_1)/d(logp)
+        # d(coef_1)/d(logp) = coef_1 (since coef_1 = exp(logp - old_logp))
+        # d(sapo_coef)/d(coef_1) = d/d(coef_1)[sigmoid(τ*(coef_1-1)) * 4/τ]
+        #                        = τ * sigmoid' * 4/τ = 4 * sigmoid * (1 - sigmoid)
+        # (the τ factors cancel in the derivative)
+        temperature = tl.where(advantage > 0, SAPO_TEMP_POS, SAPO_TEMP_NEG)
+        sigmoid_input = temperature * (coef_1 - 1.0)
+        sigmoid_val = tl.sigmoid(sigmoid_input)
+        d_sapo_d_coef1 = 4.0 * sigmoid_val * (1.0 - sigmoid_val)
+        dlogp = -advantage * d_sapo_d_coef1 * coef_1
+
     if BETA != 0.0:
         REF_LOGP += off_b * L + off_l
         ref_logp = tl.load(REF_LOGP).to(tl.float32)
@@ -239,11 +301,28 @@ def forward(
         eps_low,
         eps_high,
         inplace,
+        loss_type="grpo",
+        sapo_temperature_pos=1.0,
+        sapo_temperature_neg=1.05,
     ):
         assert logits.is_contiguous() and completion_ids.is_contiguous()
         assert old_logp is None or old_logp.is_contiguous()
         assert (ref_logp is not None and ref_logp.is_contiguous()) if beta != 0.0 else True
 
+        # Validate loss_type
+        if loss_type not in _str_to_loss_type:
+            raise ValueError(f"Unknown loss_type '{loss_type}'. Supported types: {list(_str_to_loss_type.keys())}")
+
+        # Validate SAPO temperatures to prevent division by zero or numerical instability
+        if loss_type == "sapo":
+            if sapo_temperature_pos <= 0:
+                raise ValueError(f"sapo_temperature_pos must be positive, got {sapo_temperature_pos}")
+            if sapo_temperature_neg <= 0:
+                raise ValueError(f"sapo_temperature_neg must be positive, got {sapo_temperature_neg}")
+
+        # Convert loss_type string to integer for Triton constexpr
+        loss_type_int = _str_to_loss_type[loss_type]
+
         B, L_ADD_1, N = logits.shape
         L = L_ADD_1 - 1
 
@@ -270,21 +349,33 @@ def forward(
             beta,
             eps_low,
             eps_high,
+            loss_type_int,
+            sapo_temperature_pos,
+            sapo_temperature_neg,
             L,
             N,
             **kwargs,
         )
         ctx.save_for_backward(logits, old_logp, ref_logp, completion_ids, advantages, completion_mask, lse)
-        ctx.infos = (temperature, beta, eps_low, eps_high, inplace)
-        # return loss
+        ctx.infos = (
+            temperature,
+            beta,
+            eps_low,
+            eps_high,
+            inplace,
+            loss_type_int,
+            sapo_temperature_pos,
+            sapo_temperature_neg,
+        )
         return loss, kl, is_clipped
 
     @staticmethod
     def backward(ctx, *args):
         dloss = args[0]
-        # print(dloss.shape)
         logits, old_logp, ref_logp, completion_ids, advantages, completion_mask, lse = ctx.saved_tensors
-        temperature, beta, eps_low, eps_high, inplace = ctx.infos
+        temperature, beta, eps_low, eps_high, inplace, loss_type_int, sapo_temperature_pos, sapo_temperature_neg = (
+            ctx.infos
+        )
         B, L_ADD_1, N = logits.shape
         L = L_ADD_1 - 1
         dlogits = logits.data if inplace else torch.empty_like(logits)
@@ -303,10 +394,15 @@ def backward(ctx, *args):
             beta,
             eps_low,
             eps_high,
+            loss_type_int,
+            sapo_temperature_pos,
+            sapo_temperature_neg,
             *dloss.stride(),
             L,
             N,
             **kwargs,
         )
         dlogits[:, -1, :] = 0
-        return dlogits, None, None, None, None, None, None, None, None, None, None
+        # Return None for: old_logp, ref_logp, completion_ids, advantages, completion_mask,
+        # temperature, beta, eps_low, eps_high, inplace, loss_type, sapo_temperature_pos, sapo_temperature_neg
+        return dlogits, None, None, None, None, None, None, None, None, None, None, None, None, None
```
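The SAPO backward comment derives `d/dρ[sigmoid(τ*(ρ-1)) * 4/τ] = 4 * sigmoid * (1 - sigmoid)`, with the τ factors cancelling. A quick standalone autograd check of that identity (independent of the kernel):

```python
import torch

tau = 1.05
rho = torch.tensor(1.3, requires_grad=True)
gate = torch.sigmoid(tau * (rho - 1.0)) * 4.0 / tau
gate.backward()

s = torch.sigmoid(torch.tensor(tau * (1.3 - 1.0)))
manual = 4.0 * s * (1.0 - s)  # the tau factors cancel out
assert torch.allclose(rho.grad, manual)
```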

src/liger_kernel/transformers/grpo_loss.py

Lines changed: 9 additions & 6 deletions
```diff
@@ -20,6 +20,8 @@ def triton_grpo_loss(
     max_completion_length=None,
     importance_sampling_level="token",
     reduce=False,
+    sapo_temperature_pos=1.0,
+    sapo_temperature_neg=1.05,
 ):
     assert logits is not None and completion_ids is not None and advantages is not None, (
         "must provide logits, completion_ids and advantages"
@@ -28,10 +30,6 @@ def triton_grpo_loss(
         raise ValueError(
             f"Triton GRPO loss only supports token-level importance sampling. Got {importance_sampling_level}."
         )
-    if loss_type == "cispo":
-        raise ValueError("Triton GRPO loss does not support loss_type='cispo'. Use the chunked GRPO loss path.")
-    if loss_type == "sapo":
-        raise ValueError("Triton GRPO loss does not support loss_type='sapo'. Use the chunked GRPO loss path.")
 
     per_token_loss, per_token_kl, is_clipped = GrpoLossFunction.apply(
         logits,
@@ -45,6 +43,9 @@ def triton_grpo_loss(
         eps_low,
         eps_high,
         inplace,
+        loss_type,
+        sapo_temperature_pos,
+        sapo_temperature_neg,
     )
     if not reduce:
         return per_token_loss, per_token_kl, is_clipped
@@ -69,7 +70,8 @@ def _reduce_grpo_loss(per_token_loss, completion_mask, loss_type, max_completion
         mask = torch.ones_like(per_token_loss, dtype=per_token_loss.dtype, device=per_token_loss.device)
         mask = mask.to(per_token_loss.dtype)
 
-    if loss_type == "grpo":
+    if loss_type == "grpo" or loss_type == "sapo":
+        # SAPO uses the same normalization as GRPO (per-sequence average)
         per_seq = (per_token_loss * mask).sum(-1) / mask.sum(-1).clamp(min=1.0)
         return per_seq.mean()
     if loss_type == "bnpo":
@@ -79,7 +81,8 @@ def _reduce_grpo_loss(per_token_loss, completion_mask, loss_type, max_completion
             raise ValueError("max_completion_length must be provided when using loss_type='dr_grpo'")
         batch = per_token_loss.shape[0]
         return (per_token_loss * mask).sum() / (batch * max_completion_length)
-    if loss_type == "dapo":
+    if loss_type == "dapo" or loss_type == "cispo":
+        # CISPO uses the same normalization as DAPO
        normalizer = LigerFusedLinearPPOBase._compute_dapo_normalizer(mask)
         return (per_token_loss * mask).sum() / normalizer
     raise ValueError(f"Unsupported loss_type '{loss_type}' for Triton GRPO loss.")
```
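With the blocking errors removed, the Triton path accepts the new loss types directly. A hypothetical call (shapes follow the kernel's `(B, L+1, N)` logits layout; all arguments not shown are assumed to keep their defaults):

```python
import torch
from liger_kernel.transformers.grpo_loss import triton_grpo_loss

B, L, V = 2, 16, 1024
logits = torch.randn(B, L + 1, V, device="cuda", requires_grad=True)
completion_ids = torch.randint(0, V, (B, L), device="cuda")
advantages = torch.randn(B, device="cuda")

per_token_loss, per_token_kl, is_clipped = triton_grpo_loss(
    logits=logits,
    completion_ids=completion_ids,
    advantages=advantages,
    loss_type="sapo",
    sapo_temperature_pos=1.0,
    sapo_temperature_neg=1.05,
)
```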
