
Commit 83cdcf8

yukiu00 and Tcc0403 authored
Add CISPO loss type support for LigerFusedLinearGRPOLoss (#1054)
## Summary

Resolve: #1057

* Add **CISPO** (`loss_type="cispo"`) support to **`LigerFusedLinearGRPOLoss`** (chunked loss path)
* Enable TRL's **`GRPOTrainer`** to work with `use_liger_kernel=True` and `loss_type="cispo"`

### Background / Motivation

CISPO (Clipped Importance Sampling Policy Optimization) is a loss variant proposed in the **MiniMax-M1** technical report. It clips the importance sampling ratio with **only an upper bound** and **detaches it from gradient computation**. TRL added `loss_type="cispo"` to `GRPOTrainer`, but Liger Kernel did not support it, causing errors when using `use_liger_kernel=True` with `loss_type="cispo"`.

### Changes

**`src/liger_kernel/chunked_loss/grpo_loss.py`**

* Add CISPO loss matching TRL's implementation
* Clip importance sampling ratio with **upper bound only** and **detach**:

  ```python
  clamped_ratios = torch.clamp(coef_1, max=epsilon_high).detach()
  ```

* Use **DAPO-style normalization** for CISPO reduction (consistent with TRL)
* Add CISPO-specific clip metric for logging compatibility:
  * Count tokens where `(coef_1 > epsilon_high) & (advantages > 0)`

**`src/liger_kernel/transformers/grpo_loss.py`**

* Add CISPO reduction logic (uses same normalizer as DAPO)
* Raise explicit error for Triton GRPO loss path (CISPO not supported there)

**`ops/grpo_loss` (Triton fused path)**

* CISPO is **not implemented** in `ops/grpo_loss` in this PR
* `loss_type="cispo"` is **only supported via the chunked loss path** (Triton fused support is a follow-up)

**`test/chunked_loss/test_grpo_loss.py`**

* Add CISPO to torch reference implementation (`TorchLMHeadGRPO`)
* Add `"cispo"` to parameterized test cases to verify parity with reference

### References

* MiniMax-M1 (CISPO introduction): https://arxiv.org/abs/2506.13585
* DAPO (normalization / reduction reference): https://arxiv.org/abs/2503.14476
* TRL CISPO implementation: https://github.com/huggingface/trl/blob/035c3ff151b953ca72cdfe0ee966bc1469a26fde/trl/trainer/grpo_trainer.py#L2030

## Testing Done

- Hardware Type: RTX3090 24GB (NVIDIA Ampere)
- [x] run `make test` to ensure correctness
- [x] run `make checkstyle` to ensure code style
- [x] run `make test-convergence` to ensure convergence

---------

Signed-off-by: Tcc0403 <76503978+Tcc0403@users.noreply.github.com>
Co-authored-by: Tcc0403 <76503978+Tcc0403@users.noreply.github.com>
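For orientation, a minimal standalone PyTorch sketch of the CISPO objective described above (upper-bound-only clipping, detached ratio, DAPO-style token normalization); the helper names are illustrative and this is not the Liger code path itself:

```python
import torch

def cispo_per_token_loss(per_token_logps, old_per_token_logps, advantages, epsilon_high):
    # Token-level importance sampling ratio between current and old policy.
    ratio = torch.exp(per_token_logps - old_per_token_logps)            # (B, T)
    # CISPO: clip with an upper bound only, then detach from the graph.
    clipped_ratio = torch.clamp(ratio, max=epsilon_high).detach()       # (B, T)
    # Gradient flows only through per_token_logps (REINFORCE-style weighting).
    return -clipped_ratio * advantages.unsqueeze(1) * per_token_logps   # (B, T)

def reduce_dapo(per_token_loss, completion_mask):
    # DAPO-style reduction, reused for CISPO: masked sum over tokens,
    # divided by the (clamped) total number of completion tokens.
    mask = completion_mask.float()
    return (per_token_loss * mask).sum() / mask.sum().clamp(min=1.0)
```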
1 parent 81f932a commit 83cdcf8

File tree

4 files changed: +73 -31 lines changed


src/liger_kernel/chunked_loss/fused_linear_ppo.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -60,7 +60,7 @@ def forward(
         epsilon_low: Lower bound for clipping the importance sampling ratio
         epsilon_high: Upper bound for clipping the importance sampling ratio
         beta: Weight for the KL penalty
-        loss_type: Type of loss calculation ("grpo", "bnpo", "dr_grpo", "dapo")
+        loss_type: Type of loss calculation ("grpo", "bnpo", "dr_grpo", "dapo", "cispo")
         max_completion_length: Maximum completion length required for "dr_grpo"
         temperature: Temperature for the logits
         compiled: Whether to use torch compile
```
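The docstring above now advertises "cispo" alongside the existing loss types. From the user side, the switch happens in TRL's config, as the summary notes; a hedged sketch, assuming TRL's `GRPOConfig` exposes `loss_type`, `epsilon_high`, and `use_liger_kernel`, with 5.0 as the illustrative upper bound mentioned in the grpo_loss.py docstring further down:

```python
from trl import GRPOConfig

# Assumed TRL API: GRPOConfig carries loss_type and epsilon_high, plus the
# use_liger_kernel flag inherited from TrainingArguments.
config = GRPOConfig(
    output_dir="grpo-cispo-out",
    loss_type="cispo",       # routed to the Liger chunked GRPO loss path
    epsilon_high=5.0,        # CISPO typically uses a larger upper bound (illustrative)
    use_liger_kernel=True,   # enables LigerFusedLinearGRPOLoss
)

# Training then proceeds as usual, e.g.
# GRPOTrainer(model=..., reward_funcs=..., args=config, train_dataset=...).train()
```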

src/liger_kernel/chunked_loss/grpo_loss.py

Lines changed: 35 additions & 15 deletions
```diff
@@ -11,8 +11,21 @@ def k3_loss_fn(log_p, log_q):
     return torch.exp(log_p - log_q) - (log_p - log_q) - 1.0


-def clip_coef_fn(coef, epsilon_low, epsilon_high):
-    return torch.clamp(coef, 1 - epsilon_low, 1 + epsilon_high)
+def clip_coef_fn(coef, epsilon_low, epsilon_high, loss_type):
+    if loss_type == "cispo":
+        # CISPO: clip and detach the importance weights
+        upper_bound = epsilon_high
+        lower_bound = None
+        clipped_coef = torch.clamp(coef, lower_bound, upper_bound).detach()
+        is_lower_clipped = False
+        is_upper_clipped = coef > upper_bound
+    else:
+        upper_bound = 1 + epsilon_high
+        lower_bound = 1 - epsilon_low
+        clipped_coef = torch.clamp(coef, lower_bound, upper_bound)
+        is_lower_clipped = coef < lower_bound
+        is_upper_clipped = coef > upper_bound
+    return clipped_coef, is_lower_clipped, is_upper_clipped


 class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
@@ -29,7 +42,7 @@ def ppo_loss_fn(
         epsilon_low=0.2,
         epsilon_high=0.2,
         beta=0.04,
-        loss_type="dapo",  # ["grpo", "bnpo", "dr_grpo", "dapo"]
+        loss_type="dapo",  # ["grpo", "bnpo", "dr_grpo", "dapo", "cispo"]
         max_completion_length=None,  # Required for dr_grpo
         importance_sampling_level="token",  # ["token", "sequence"] - new parameter for GSPO
         **kwargs,
@@ -67,10 +80,15 @@ def ppo_loss_fn(
         # From here, log_importance_weights (and all subsequent tensors, coef_1, coef_2, etc.) shape depends on
         # importance_sampling_level: "token" level: (B, T); "sequence" level: (B, 1)
         coef_1 = torch.exp(log_importance_weights)
-        coef_2 = clip_coef_fn(coef_1, epsilon_low, epsilon_high)
-        per_token_loss1 = coef_1 * advantages.unsqueeze(1)
-        per_token_loss2 = coef_2 * advantages.unsqueeze(1)
-        per_token_loss = -torch.min(per_token_loss1, per_token_loss2)
+        coef_2, is_lower_clipped, is_upper_clipped = clip_coef_fn(coef_1, epsilon_low, epsilon_high, loss_type)
+        if loss_type == "cispo":
+            # CISPO: clip and detach the importance weights, multiply by log probs
+            # Reference: https://github.com/huggingface/trl/blob/035c3ff151b953ca72cdfe0ee966bc1469a26fde/trl/trainer/grpo_trainer.py#L2030
+            per_token_loss = -coef_2 * advantages.unsqueeze(1) * per_token_logps
+        else:
+            per_token_loss1 = coef_1 * advantages.unsqueeze(1)
+            per_token_loss2 = coef_2 * advantages.unsqueeze(1)
+            per_token_loss = -torch.min(per_token_loss1, per_token_loss2)
         if beta != 0.0:
             # Compute KL penalty (approximates KL[per_token_logps, ref_per_token_logps])
             kl_div = k3_loss_fn(ref_per_token_logps, per_token_logps)
@@ -94,7 +112,7 @@ def ppo_loss_fn(
             if max_completion_length is None:
                 raise ValueError("max_completion_length must be provided for loss_type 'dr_grpo'")
             loss = (per_token_loss * attention_mask).sum() / (full_attention_mask.shape[0] * max_completion_length)
-        elif loss_type == "dapo":
+        elif loss_type == "dapo" or loss_type == "cispo":
             loss_normalizer = LigerFusedLinearPPOBase._compute_dapo_normalizer(full_attention_mask)
             loss = (per_token_loss * attention_mask).sum() / loss_normalizer
         else:
@@ -107,15 +125,15 @@ def ppo_loss_fn(

         # Adjust clipping metric calculation based on importance sampling level
         if importance_sampling_level == "token":
-            is_clipped = ((coef_1 < 1 - epsilon_low) & (advantages.unsqueeze(1) < 0)) | (
-                (coef_1 > 1 + epsilon_high) & (advantages.unsqueeze(1) > 0)
+            is_clipped = (is_lower_clipped & (advantages.unsqueeze(1) < 0)) | (
+                is_upper_clipped & (advantages.unsqueeze(1) > 0)
             )
         else:  # sequence level
             # For sequence level, coef_1 is shape (B, 1), advantages is shape (B,)
-            is_clipped = ((coef_1.squeeze(-1) < 1 - epsilon_low) & (advantages < 0)) | (
-                (coef_1.squeeze(-1) > 1 + epsilon_high) & (advantages > 0)
+            is_clipped = (is_lower_clipped & (advantages.unsqueeze(1) < 0)) | (
+                is_upper_clipped & (advantages.unsqueeze(1) > 0)
             )
-            is_clipped = is_clipped.unsqueeze(1).expand_as(attention_mask)
+            is_clipped = is_clipped.expand_as(attention_mask)

         metrics.append((is_clipped * attention_mask).sum() / torch.clamp(full_attention_mask.sum(), min=1.0))
         return loss, metrics
@@ -160,7 +178,7 @@ def forward(
             ref_weight (torch.Tensor, optional): Reference model weight tensor. Shape: (vocab_size, hidden_size)
             ref_bias (torch.Tensor, optional): Reference model bias tensor. Shape: (vocab_size,)
             beta (float): Weight for the KL penalty
-            loss_type (str): Type of loss calculation ("grpo", "bnpo", "dr_grpo", "dapo"). Defaults to "dapo".
+            loss_type (str): Type of loss calculation ("grpo", "bnpo", "dr_grpo", "dapo", "cispo"). Defaults to "dapo".
             max_completion_length (int, optional): Maximum completion length, required for "dr_grpo". Defaults to None.
             importance_sampling_level (str): Level of importance sampling ("token" or "sequence"). Defaults to "token".
             temperature (float): Temperature for the logits
@@ -251,7 +269,9 @@ def __init__(
             chunk_size (int): Size of chunks for processing.
             epsilon_low (float): Lower bound for the importance sampling ratio.
             epsilon_high (float): Upper bound for the importance sampling ratio.
-            loss_type (str): Type of loss calculation ("grpo", "bnpo", "dr_grpo", "dapo"). Defaults to "dapo".
+            loss_type (str): Type of loss calculation ("grpo", "bnpo", "dr_grpo", "dapo", "cispo").
+                Defaults to "dapo". For "cispo", epsilon_high is typically larger (e.g. 5.0) and
+                epsilon_low is unused.
             max_completion_length (int, optional): Maximum completion length, required for "dr_grpo". Defaults to None.
             importance_sampling_level (str): Level of importance sampling ("token" or "sequence"). Defaults to "token".
             temperature (float): Temperature for the logits.
```
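To make the new `clip_coef_fn` contract concrete, a small standalone check written against the helper as it appears in this diff; the input values and the `epsilon_high=5.0` bound are illustrative:

```python
import torch

def clip_coef_fn(coef, epsilon_low, epsilon_high, loss_type):
    # Re-statement of the helper introduced in this diff, for illustration only.
    if loss_type == "cispo":
        clipped_coef = torch.clamp(coef, None, epsilon_high).detach()
        is_lower_clipped = False
        is_upper_clipped = coef > epsilon_high
    else:
        clipped_coef = torch.clamp(coef, 1 - epsilon_low, 1 + epsilon_high)
        is_lower_clipped = coef < 1 - epsilon_low
        is_upper_clipped = coef > 1 + epsilon_high
    return clipped_coef, is_lower_clipped, is_upper_clipped

coef = torch.tensor([0.5, 1.0, 3.0, 7.0], requires_grad=True)

# Default (PPO-style) clipping: two-sided band [1 - eps_low, 1 + eps_high].
c, lo, hi = clip_coef_fn(coef, 0.2, 0.2, "grpo")
# c == [0.8, 1.0, 1.2, 1.2] and still carries a grad_fn;
# lo == [True, False, False, False], hi == [False, False, True, True]

# CISPO: upper bound only (here 5.0), detached from the autograd graph.
c, lo, hi = clip_coef_fn(coef, 0.2, 5.0, "cispo")
# c == [0.5, 1.0, 3.0, 5.0] with no grad_fn (detached);
# lo is the constant False, hi == [False, False, False, True]
```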

src/liger_kernel/transformers/grpo_loss.py

Lines changed: 3 additions & 1 deletion
```diff
@@ -22,12 +22,14 @@ def triton_grpo_loss(
     reduce=False,
 ):
     assert logits is not None and completion_ids is not None and advantages is not None, (
-        "must provide logitscompletion_ids and advantages"
+        "must provide logits, completion_ids and advantages"
     )
     if importance_sampling_level != "token":
         raise ValueError(
             f"Triton GRPO loss only supports token-level importance sampling. Got {importance_sampling_level}."
         )
+    if loss_type == "cispo":
+        raise ValueError("Triton GRPO loss does not support loss_type='cispo'. Use the chunked GRPO loss path.")

     per_token_loss, per_token_kl, is_clipped = GrpoLossFunction.apply(
         logits,
```
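The new guard fails fast instead of silently computing an unsupported loss. A hedged, test-style sketch of the expected behavior; the tensor shapes are illustrative and the keyword arguments are assumed from the signature shown above:

```python
import pytest
import torch

from liger_kernel.transformers.grpo_loss import triton_grpo_loss


def test_triton_grpo_loss_rejects_cispo():
    # Dummy CPU tensors are enough here: the ValueError is raised before any
    # Triton kernel is launched, so no GPU is required for this check.
    logits = torch.randn(2, 8, 32)
    completion_ids = torch.randint(0, 32, (2, 8))
    advantages = torch.randn(2)

    with pytest.raises(ValueError, match="does not support loss_type='cispo'"):
        triton_grpo_loss(
            logits=logits,
            completion_ids=completion_ids,
            advantages=advantages,
            loss_type="cispo",
        )
```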

test/chunked_loss/test_grpo_loss.py

Lines changed: 34 additions & 14 deletions
```diff
@@ -58,6 +58,7 @@ def compute_per_token_components(
         epsilon_high,
         beta,
         importance_sampling_level,
+        loss_type: str = "grpo",
     ):
         attention_mask = attention_mask.to(per_token_logps.dtype)
         old_per_token_logps = (
@@ -77,28 +78,43 @@ def compute_per_token_components(
         )

         coef_1 = torch.exp(log_importance_weights)
-        coef_2 = torch.clamp(coef_1, 1 - epsilon_low, 1 + epsilon_high)
         expanded_advantages = advantages.unsqueeze(1)
-        per_token_loss1 = coef_1 * expanded_advantages
-        per_token_loss2 = coef_2 * expanded_advantages
-        per_token_loss = -torch.min(per_token_loss1, per_token_loss2)
+        # Compute clipped coefficients and clipping flags
+        if loss_type == "cispo":
+            # CISPO: clip and detach the importance weights
+            upper_bound = epsilon_high
+            lower_bound = None
+            coef_2 = torch.clamp(coef_1, lower_bound, upper_bound).detach()
+            is_lower_clipped = False
+            is_upper_clipped = coef_1 > upper_bound
+        else:
+            upper_bound = 1 + epsilon_high
+            lower_bound = 1 - epsilon_low
+            coef_2 = torch.clamp(coef_1, lower_bound, upper_bound)
+            is_lower_clipped = coef_1 < lower_bound
+            is_upper_clipped = coef_1 > upper_bound
+
+        if loss_type == "cispo":
+            # CISPO: clip and detach the importance weights, multiply by log probs
+            # Reference: https://github.com/huggingface/trl/blob/035c3ff151b953ca72cdfe0ee966bc1469a26fde/trl/trainer/grpo_trainer.py#L2030
+            per_token_loss = -coef_2 * expanded_advantages * per_token_logps
+        else:
+            per_token_loss1 = coef_1 * expanded_advantages
+            per_token_loss2 = coef_2 * expanded_advantages
+            per_token_loss = -torch.min(per_token_loss1, per_token_loss2)
         kl_div = None
         if beta != 0.0:
             ref_per_token_logps = ref_per_token_logps.float()
             kl_div = torch.exp(ref_per_token_logps - per_token_logps) - (ref_per_token_logps - per_token_logps) - 1.0
             per_token_loss = per_token_loss + beta * kl_div

+        # Adjust clipping metric calculation based on importance sampling level
         if importance_sampling_level == "token":
-            is_clipped = ((coef_1 < 1 - epsilon_low) & (expanded_advantages < 0)) | (
-                (coef_1 > 1 + epsilon_high) & (expanded_advantages > 0)
-            )
+            is_clipped = (is_lower_clipped & (expanded_advantages < 0)) | (is_upper_clipped & (expanded_advantages > 0))
         else:  # sequence level
             # For sequence level, coef_1 is shape (B, 1), advantages is shape (B,)
-            seq_advantages = advantages
-            is_clipped = ((coef_1.squeeze(-1) < 1 - epsilon_low) & (seq_advantages < 0)) | (
-                (coef_1.squeeze(-1) > 1 + epsilon_high) & (seq_advantages > 0)
-            )
-            is_clipped = is_clipped.unsqueeze(1).expand_as(attention_mask)
+            is_clipped = (is_lower_clipped & (expanded_advantages < 0)) | (is_upper_clipped & (expanded_advantages > 0))
+            is_clipped = is_clipped.expand_as(attention_mask)
         return per_token_loss, kl_div, is_clipped

     def forward(
@@ -148,6 +164,7 @@ def forward(
             self.epsilon_high,
             self.beta,
             self.importance_sampling_level,
+            self.loss_type,
         )

         # Apply masking and calculate loss based on loss_type
@@ -160,6 +177,9 @@ def forward(
         elif self.loss_type == "dapo":
             normalizer = attention_mask.sum().clamp(min=1.0)
             loss = (per_token_loss * attention_mask).sum() / normalizer
+        elif self.loss_type == "cispo":
+            normalizer = attention_mask.sum().clamp(min=1.0)
+            loss = (per_token_loss * attention_mask).sum() / normalizer
         else:
             raise ValueError(f"Unknown loss type: {self.loss_type}")

@@ -259,7 +279,7 @@ def forward(
         (False, False, True),
     ],
 )
-@pytest.mark.parametrize("loss_type", ["bnpo", "grpo", "dr_grpo", "dapo"])
+@pytest.mark.parametrize("loss_type", ["bnpo", "grpo", "dr_grpo", "dapo", "cispo"])
 @pytest.mark.parametrize("importance_sampling_level", ["token", "sequence"])
 def test_correctness(
     B,
@@ -565,7 +585,7 @@ def test_reduce_grpo_loss_matches_reference(loss_type):
         expected = (per_token_loss * mask_f).sum() / mask_f.sum().clamp(min=1.0)
     elif loss_type == "dr_grpo":
         expected = (per_token_loss * mask_f).sum() / (per_token_loss.size(0) * max_completion_length)
-    else:  # dapo
+    else:  # dapo/cispo
         expected = (per_token_loss * mask_f).sum() / mask_f.sum().clamp(min=1.0)

     assert_verbose_allclose(reduced, expected)
```
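As the reference in `test_reduce_grpo_loss_matches_reference` spells out, "cispo" reuses the DAPO reduction while "dr_grpo" normalizes differently. A small sketch of the two normalizers (shapes and values are illustrative, not the test fixture itself):

```python
import torch

torch.manual_seed(0)
B, T, max_completion_length = 4, 16, 32
per_token_loss = torch.randn(B, T)                 # (B, T) per-token losses
mask_f = (torch.rand(B, T) > 0.25).float()         # (B, T) completion mask

# dapo / cispo: normalize by the (clamped) number of unmasked tokens.
dapo_cispo = (per_token_loss * mask_f).sum() / mask_f.sum().clamp(min=1.0)

# dr_grpo: normalize by batch size * max_completion_length instead.
dr_grpo = (per_token_loss * mask_f).sum() / (per_token_loss.size(0) * max_completion_length)

print(dapo_cispo, dr_grpo)  # same numerator, different normalizers
```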
