
Commit cb8e408: Add vLLM importance sampling ratio support for GRPO loss (#1088)
## Summary

Fixes the **primary cause** (item 1) of #1082 — `LigerFusedLinearGRPOLoss` produces ~100x larger `grad_norm` than TRL's non-Liger path when using vLLM.

**Root cause:** TRL's `GRPOTrainer` applies `per_token_loss *= importance_sampling_ratio` ([source](https://github.com/huggingface/trl/blob/v0.27.2/trl/trainer/grpo_trainer.py#L2351-L2352)) to correct for the distribution mismatch introduced by vLLM's rejection/stratified sampling. Liger-Kernel had no mechanism to accept or apply this correction, so the IS ratio was silently ignored, resulting in uncorrected (and much larger) gradients.

**This is a high-priority fix** — any user running `GRPOTrainer` with `use_vllm=True` and `use_liger_kernel=True` is affected, and the resulting ~100x gradient mismatch can cause training instability or divergence.

### Changes

- Add an optional `vllm_is_ratio` parameter (`[B, T]` tensor or `None`) to both code paths:
  - **Chunked loss path**: `LigerFusedLinearGRPOLoss`, `LigerFusedLinearGRPOFunction`, `ppo_loss_fn`, and the base class `LigerFusedLinearPPOBase` chunking pipeline
  - **Triton kernel path**: `triton_grpo_loss`, `GrpoLossFunction`, and the Triton fwd/bwd kernels (`_grpo_loss_fwd_kernel`, `_grpo_loss_bwd_kernel`)
- The IS correction is applied **after** the PPO clipped loss computation and **before** the KL penalty, matching TRL's behavior exactly
- `vllm_is_ratio=None` (default) preserves existing behavior — no breaking changes
- Works with all loss types: `grpo`, `dapo`, `bnpo`, `dr_grpo`, `cispo`, `sapo`

### Verification

With `IS_RATIO=0.01`, the `grad_norm` ratio matches exactly:

```
Chunked loss path:
  grad_norm WITHOUT vllm_is_ratio: 1.052219e-01
  grad_norm WITH    vllm_is_ratio: 1.052219e-03
  ratio: 0.010000 ✓

Triton path:
  grad_norm WITHOUT vllm_is_ratio: 1.461673e-02
  grad_norm WITH    vllm_is_ratio: 1.461673e-04
  ratio: 0.010000 ✓
```

## Test plan

- [x] Extended the existing `test_correctness` in `test/chunked_loss/test_grpo_loss.py` with a `use_vllm_is_ratio` parametrize — covers all 6 loss types × 2 IS levels × 2 beta values × with/without `vllm_is_ratio`
- [x] Added `test_grpo_loss_with_vllm_is_ratio` in `test/transformers/test_grpo_loss.py` — compares Triton output against a PyTorch reference with IS correction, plus a `vllm_is_ratio=None` == `vllm_is_ratio=ones` identity check
- [x] All existing tests continue to pass (no regressions)
- [x] `make checkstyle` passes

## Related

- Reference implementation: #993
- Issue: #1082
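For orientation, here is a minimal PyTorch sketch of where the correction lands relative to the clipped PPO objective and the KL penalty. It is illustrative only, simplified from the `ppo_loss_fn` diff below (masking, loss-type variants, and metrics are omitted; the `grpo_per_token_loss` name is hypothetical):

```python
import torch

def grpo_per_token_loss(per_token_logps, old_per_token_logps, ref_per_token_logps,
                        advantages, eps_low=0.2, eps_high=0.2, beta=0.04,
                        vllm_is_ratio=None):
    # PPO-style clipped objective over per-token log-prob ratios
    coef_1 = torch.exp(per_token_logps - old_per_token_logps)
    coef_2 = torch.clamp(coef_1, 1 - eps_low, 1 + eps_high)
    per_token_loss1 = coef_1 * advantages.unsqueeze(1)
    per_token_loss2 = coef_2 * advantages.unsqueeze(1)
    per_token_loss = -torch.min(per_token_loss1, per_token_loss2)

    # vLLM IS correction: after clipping, before the KL penalty,
    # mirroring TRL's `per_token_loss *= importance_sampling_ratio`
    if vllm_is_ratio is not None:
        per_token_loss = per_token_loss * vllm_is_ratio

    if beta != 0.0:
        # k3 estimator of KL[per_token_logps || ref_per_token_logps]
        delta = ref_per_token_logps - per_token_logps
        per_token_loss = per_token_loss + beta * (torch.exp(delta) - delta - 1)
    return per_token_loss
```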
1 parent cc14537 · commit cb8e408

File tree: 6 files changed (+436, -3 lines)

src/liger_kernel/chunked_loss/fused_linear_ppo.py

Lines changed: 32 additions & 0 deletions
@@ -41,6 +41,7 @@ def forward(
         chunk_size=1,
         sapo_temperature_pos=1.0,
         sapo_temperature_neg=1.05,
+        vllm_is_ratio=None,
     ):
         # TODO: check torch compile matmul
         """Chunked forward pass for PPO loss computation.
@@ -71,6 +72,8 @@ def forward(
             chunk_size: Size of chunks for processing in other loss modules
             sapo_temperature_pos: Temperature for positive advantages in SAPO
             sapo_temperature_neg: Temperature for negative advantages in SAPO
+            vllm_is_ratio: vLLM importance sampling ratio tensor (batch_size, seq_len) or (batch_size, 1) or None.
+                Used to correct for distribution mismatch when using vLLM for generation.
         """
         if use_ref_model:
             assert ref_per_token_logps is not None or ref_input is not None, (
@@ -80,6 +83,20 @@ def forward(
             raise Warning("Both ref_per_token_logps and ref_input are provided. Using ref_per_token_logps.")
         if loss_type == "dr_grpo":
             assert max_completion_length is not None, "max_completion_length must be provided for loss_type 'dr_grpo'"
+        if vllm_is_ratio is not None:
+            B, T = attention_mask.shape
+            assert vllm_is_ratio.dim() in (1, 2), (
+                f"vllm_is_ratio must be 1D (B,) or 2D (B, T) / (B, 1), got {vllm_is_ratio.dim()}D"
+            )
+            if vllm_is_ratio.dim() == 2:
+                assert vllm_is_ratio.shape[0] == B and vllm_is_ratio.shape[1] in (1, T), (
+                    f"vllm_is_ratio shape must be ({B}, 1) or ({B}, {T}), got {tuple(vllm_is_ratio.shape)}"
+                )
+            else:
+                assert vllm_is_ratio.shape[0] == B, (
+                    f"vllm_is_ratio shape must be ({B},), got {tuple(vllm_is_ratio.shape)}"
+                )
+                vllm_is_ratio = vllm_is_ratio.unsqueeze(-1)  # (B,) -> (B, 1) for broadcasting
         # Initialize accumulators
         loss_acc = torch.zeros((), device=_input.device, dtype=torch.float32)
         grad_weight = torch.zeros_like(weight)  # [V, H]
@@ -114,6 +131,7 @@ def fused_fwd_bwd(
             ref_per_token_logps_chunk,
             old_per_token_logps_chunk,
             ref_input_chunk,
+            vllm_is_ratio_chunk,
         ):
             """Fused forward and backward for a chunk."""
             argnums = (0, 1, 5) if bias is not None else (0, 1)
@@ -127,6 +145,7 @@ def fused_fwd_bwd(
                 ref_per_token_logps_chunk=ref_per_token_logps_chunk,  # arg 6
                 old_per_token_logps_chunk=old_per_token_logps_chunk,  # arg 7
                 ref_input_chunk=ref_input_chunk,  # arg 8
+                vllm_is_ratio_chunk=vllm_is_ratio_chunk,  # arg 9
             )

         def accumulate_chunk(
@@ -137,6 +156,7 @@ def accumulate_chunk(
             ref_per_token_logps_chunk=None,
             old_per_token_logps_chunk=None,
             ref_input_chunk=None,
+            vllm_is_ratio_chunk=None,
         ):
             (chunk_grad_input, chunk_grad_weight, *chunk_grad_bias), (chunk_loss, chunk_metrics) = fused_fwd_bwd(
                 input_chunk,
@@ -146,6 +166,7 @@ def accumulate_chunk(
                 ref_per_token_logps_chunk,
                 old_per_token_logps_chunk,
                 ref_input_chunk,
+                vllm_is_ratio_chunk,
             )
             if bias is not None:
                 grad_bias.add_(chunk_grad_bias[0])
@@ -196,6 +217,9 @@ def accumulate_chunk(
             if use_ref_model and ref_per_token_logps is None
             else [None] * chunks
         )
+        _vllm_is_ratio_chunks = (
+            torch.chunk(vllm_is_ratio, chunks=chunks, dim=0) if vllm_is_ratio is not None else [None] * chunks
+        )

         for (
             input_chunk,
@@ -205,6 +229,7 @@ def accumulate_chunk(
             ref_per_token_logps_chunk,
             old_per_token_logps_chunk,
             ref_input_chunk,
+            vllm_is_ratio_chunk,
         ) in zip(
             _input_chunks,
             _selected_token_ids_chunks,
@@ -213,6 +238,7 @@ def accumulate_chunk(
             _ref_per_token_logps_chunks,
             _old_per_token_logps_chunks,
             _ref_input_chunks,
+            _vllm_is_ratio_chunks,
         ):
             # Mark dynamic dimensions
             torch._dynamo.mark_dynamic(input_chunk, 1)
@@ -224,6 +250,8 @@ def accumulate_chunk(
                 torch._dynamo.mark_dynamic(ref_input_chunk, 1)
             if old_per_token_logps_chunk is not None:
                 torch._dynamo.mark_dynamic(old_per_token_logps_chunk, 1)
+            if vllm_is_ratio_chunk is not None:
+                torch._dynamo.mark_dynamic(vllm_is_ratio_chunk, 1)

             accumulate_chunk(
                 input_chunk,
@@ -233,6 +261,7 @@ def accumulate_chunk(
                 ref_per_token_logps_chunk,
                 old_per_token_logps_chunk,
                 ref_input_chunk,
+                vllm_is_ratio_chunk,
             )

         # Combine gradients
@@ -277,6 +306,7 @@ def _compute_chunk_loss(
         ref_per_token_logps_chunk=None,
         old_per_token_logps_chunk=None,
         ref_input_chunk=None,
+        vllm_is_ratio_chunk=None,
         ref_weight=None,
         ref_bias=None,
         full_attention_mask=None,
@@ -322,6 +352,7 @@ def _compute_chunk_loss(
             importance_sampling_level=importance_sampling_level,
             sapo_temperature_pos=sapo_temperature_pos,
             sapo_temperature_neg=sapo_temperature_neg,
+            vllm_is_ratio=vllm_is_ratio_chunk,
         )

         return chunk_loss, chunk_metrics
@@ -376,4 +407,5 @@ def backward(ctx, grad_output, *grad_metrics):
             None,  # grad_chunk_size
             None,  # grad_sapo_temperature_pos
             None,  # grad_sapo_temperature_neg
+            None,  # grad_vllm_is_ratio
         )
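The shape handling above reduces to ordinary broadcasting: a 1D `(B,)` ratio is unsqueezed to `(B, 1)` so it broadcasts across the sequence dimension, and chunking along `dim=0` keeps each ratio row aligned with its input chunk. A standalone sketch (shapes are made up for illustration):

```python
import torch

B, T, chunks = 4, 6, 2
per_token_loss = torch.randn(B, T)

# (B,) per-sequence ratio -> (B, 1): broadcasts over all T tokens of each row,
# which is what the unsqueeze(-1) in the hunk above achieves.
vllm_is_ratio = torch.rand(B).unsqueeze(-1)
corrected = per_token_loss * vllm_is_ratio  # still (B, T)

# torch.chunk along dim=0 keeps ratio rows paired with their input rows.
for loss_chunk, ratio_chunk in zip(
    torch.chunk(corrected, chunks, dim=0),
    torch.chunk(vllm_is_ratio, chunks, dim=0),
):
    assert loss_chunk.shape[0] == ratio_chunk.shape[0]  # rows stay aligned
```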

src/liger_kernel/chunked_loss/grpo_loss.py

Lines changed: 12 additions & 0 deletions
@@ -75,6 +75,7 @@ def ppo_loss_fn(
     importance_sampling_level="token",  # ["token", "sequence"] - new parameter for GSPO
     sapo_temperature_pos=1.0,  # Temperature for positive advantages in SAPO
     sapo_temperature_neg=1.05,  # Temperature for negative advantages in SAPO
+    vllm_is_ratio=None,  # vLLM importance sampling ratio (chunk_size, seq_len) or (chunk_size, 1) or None
     **kwargs,
 ):
     """GRPO Loss Function matching GRPOTrainer implementation."""
@@ -138,6 +139,10 @@ def ppo_loss_fn(
     per_token_loss2 = coef_2 * advantages.unsqueeze(1)
     per_token_loss = -torch.min(per_token_loss1, per_token_loss2)

+    # Apply vLLM importance sampling correction BEFORE adding KL penalty
+    if vllm_is_ratio is not None:
+        per_token_loss = per_token_loss * vllm_is_ratio
+
     if beta != 0.0:
         # Compute KL penalty (approximates KL[per_token_logps, ref_per_token_logps])
         kl_div = k3_loss_fn(ref_per_token_logps, per_token_logps)
@@ -214,6 +219,7 @@ def forward(
         compiled=True,
         use_ref_model=True,
         chunk_size=1,
+        vllm_is_ratio=None,
     ):
         """
         Fused linear layer with GRPO loss.
@@ -239,6 +245,8 @@ def forward(
             compiled (bool): Whether to use torch compile
             use_ref_model (bool): Whether to use a reference model
             chunk_size (int): Size of chunks for processing.
+            vllm_is_ratio (torch.Tensor, optional): vLLM importance sampling ratio (batch_size, seq_len) or (batch_size, 1) or None.
+                Used to correct for distribution mismatch when using vLLM for generation.
         Returns:
             torch.Tensor: Computed loss
         """
@@ -268,6 +276,7 @@ def forward(
             importance_sampling_level=importance_sampling_level,
             sapo_temperature_pos=sapo_temperature_pos,
             sapo_temperature_neg=sapo_temperature_neg,
+            vllm_is_ratio=vllm_is_ratio,
         )

     @staticmethod
@@ -300,6 +309,7 @@ def backward(ctx, grad_output, *grad_metrics):
             None,  # grad_compiled
             None,  # grad_use_ref_model
             None,  # grad_chunk_size
+            None,  # grad_vllm_is_ratio
         )


@@ -370,6 +380,7 @@ def forward(
         ref_input=None,
         ref_weight=None,
         ref_bias=None,
+        vllm_is_ratio=None,
     ):
         return LigerFusedLinearGRPOFunction.apply(
             _input,
@@ -395,4 +406,5 @@ def forward(
             self.compiled,
             self.use_ref_model,
             self.chunk_size,
+            vllm_is_ratio,
         )
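Putting the chunked path together, a hypothetical end-to-end call might look like the sketch below. The constructor arguments and the positional order (`_input`, `lin_weight`, `selected_token_ids`, `attention_mask`, `advantages`) are assumptions inferred from the surrounding code, not a verified API; shapes are made up for illustration:

```python
import torch
from liger_kernel.chunked_loss.grpo_loss import LigerFusedLinearGRPOLoss

B, T, H, V = 2, 8, 32, 64
# beta=0.0 and use_ref_model=False (assumed ctor args) avoid needing a reference model.
loss_fn = LigerFusedLinearGRPOLoss(beta=0.0, use_ref_model=False)

_input = torch.randn(B, T, H, requires_grad=True)   # last hidden states
lin_weight = torch.randn(V, H, requires_grad=True)  # lm_head weight
selected_token_ids = torch.randint(0, V, (B, T))
attention_mask = torch.ones(B, T, dtype=torch.long)
advantages = torch.randn(B)
vllm_is_ratio = torch.rand(B, T)                    # or (B, 1), or None

out = loss_fn(
    _input, lin_weight, selected_token_ids, attention_mask, advantages,
    vllm_is_ratio=vllm_is_ratio,
)
loss = out[0] if isinstance(out, tuple) else out    # (loss, metrics) return assumed
loss.backward()
```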

src/liger_kernel/ops/grpo_loss.py

Lines changed: 51 additions & 2 deletions
@@ -90,6 +90,8 @@ def _grpo_loss_fwd_kernel(
     INPUT_IDS,
     COMPLETION_MASK,
     ADVANTAGES,
+    VLLM_IS_RATIO,
+    VLLM_IS_RATIO_STRIDE,
     LOSS,
     LSE,
     KL,
@@ -169,6 +171,14 @@ def _grpo_loss_fwd_kernel(
         per_token_loss = -sapo_coef * advantage
         is_clipped = 0.0  # SAPO has no clipping concept

+    # Apply vLLM importance sampling correction BEFORE adding KL penalty
+    if VLLM_IS_RATIO is not None:
+        # Use modulo to support both (B, L) per-token and (B, 1) per-sequence shapes
+        vllm_is_ratio = tl.load(VLLM_IS_RATIO + off_b * VLLM_IS_RATIO_STRIDE + off_l % VLLM_IS_RATIO_STRIDE).to(
+            tl.float32
+        )
+        per_token_loss = per_token_loss * vllm_is_ratio
+
     if BETA != 0.0:
         REF_LOGP += off_b * L + off_l
         KL += off_b * L + off_l
@@ -198,6 +208,8 @@ def _grpo_loss_bwd_kernel(
     ADVANTAGES,
     COMPLETION_MASK,
     LSE,
+    VLLM_IS_RATIO,
+    VLLM_IS_RATIO_STRIDE,
     TEMPERATURE,
     BETA: tl.constexpr,
     EPS_LOW,
@@ -271,6 +283,14 @@ def _grpo_loss_bwd_kernel(
         d_sapo_d_coef1 = 4.0 * sigmoid_val * (1.0 - sigmoid_val)
         dlogp = -advantage * d_sapo_d_coef1 * coef_1

+    # Apply vLLM IS ratio to PPO gradient (before KL gradient)
+    if VLLM_IS_RATIO is not None:
+        # Use modulo to support both (B, L) per-token and (B, 1) per-sequence shapes
+        vllm_is_ratio = tl.load(VLLM_IS_RATIO + off_b * VLLM_IS_RATIO_STRIDE + off_l % VLLM_IS_RATIO_STRIDE).to(
+            tl.float32
+        )
+        dlogp = dlogp * vllm_is_ratio
+
     if BETA != 0.0:
         REF_LOGP += off_b * L + off_l
         ref_logp = tl.load(REF_LOGP).to(tl.float32)
@@ -304,6 +324,7 @@ def forward(
         loss_type="grpo",
         sapo_temperature_pos=1.0,
         sapo_temperature_neg=1.05,
+        vllm_is_ratio=None,
     ):
         assert logits.is_contiguous() and completion_ids.is_contiguous()
         assert old_logp is None or old_logp.is_contiguous()
@@ -329,6 +350,25 @@ def forward(
         if completion_mask is not None:
             assert completion_mask.is_contiguous()

+        # Handle vLLM IS ratio
+        vllm_is_ratio_ptr = None
+        vllm_is_ratio_stride = L  # default to per-token (unused when ptr is None)
+        if vllm_is_ratio is not None:
+            assert vllm_is_ratio.dim() in (1, 2), (
+                f"vllm_is_ratio must be 1D (B,) or 2D (B, L) / (B, 1), got {vllm_is_ratio.dim()}D"
+            )
+            if vllm_is_ratio.dim() == 2:
+                assert vllm_is_ratio.shape[0] == B and vllm_is_ratio.shape[1] in (1, L), (
+                    f"vllm_is_ratio shape must be ({B}, 1) or ({B}, {L}), got {tuple(vllm_is_ratio.shape)}"
+                )
+            else:
+                assert vllm_is_ratio.shape[0] == B, (
+                    f"vllm_is_ratio shape must be ({B},), got {tuple(vllm_is_ratio.shape)}"
+                )
+            vllm_is_ratio = vllm_is_ratio.contiguous()
+            vllm_is_ratio_ptr = vllm_is_ratio
+            vllm_is_ratio_stride = vllm_is_ratio.shape[1] if vllm_is_ratio.dim() > 1 else 1
+
         loss = torch.zeros(B, L, device=logits.device, dtype=torch.float32)
         lse = torch.zeros_like(loss)
         is_clipped = torch.zeros_like(loss)
@@ -341,6 +381,8 @@ def forward(
             completion_ids,
             completion_mask,
             advantages,
+            vllm_is_ratio_ptr,
+            vllm_is_ratio_stride,
             loss,
             lse,
             kl,
@@ -357,6 +399,8 @@ def forward(
             **kwargs,
         )
         ctx.save_for_backward(logits, old_logp, ref_logp, completion_ids, advantages, completion_mask, lse)
+        ctx.vllm_is_ratio = vllm_is_ratio_ptr
+        ctx.vllm_is_ratio_stride = vllm_is_ratio_stride
         ctx.infos = (
             temperature,
             beta,
@@ -376,6 +420,8 @@ def backward(ctx, *args):
         temperature, beta, eps_low, eps_high, inplace, loss_type_int, sapo_temperature_pos, sapo_temperature_neg = (
             ctx.infos
         )
+        vllm_is_ratio = ctx.vllm_is_ratio
+        vllm_is_ratio_stride = ctx.vllm_is_ratio_stride
         B, L_ADD_1, N = logits.shape
         L = L_ADD_1 - 1
         dlogits = logits.data if inplace else torch.empty_like(logits)
@@ -390,6 +436,8 @@ def backward(ctx, *args):
             advantages,
             completion_mask,
             lse,
+            vllm_is_ratio,
+            vllm_is_ratio_stride,
             temperature,
             beta,
             eps_low,
@@ -404,5 +452,6 @@ def backward(ctx, *args):
         )
         dlogits[:, -1, :] = 0
         # Return None for: old_logp, ref_logp, completion_ids, advantages, completion_mask,
-        # temperature, beta, eps_low, eps_high, inplace, loss_type, sapo_temperature_pos, sapo_temperature_neg
-        return dlogits, None, None, None, None, None, None, None, None, None, None, None, None, None
+        # temperature, beta, eps_low, eps_high, inplace, loss_type, sapo_temperature_pos, sapo_temperature_neg,
+        # vllm_is_ratio
+        return dlogits, None, None, None, None, None, None, None, None, None, None, None, None, None, None
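The modulo addressing used by both kernels collapses the two supported layouts into one load: for a `(B, L)` ratio the stride is `L`, so `off_l % L == off_l` and every token reads its own entry; for `(B, 1)` the stride is `1`, so `off_l % 1 == 0` and all tokens in a row read the row's single value. A plain-Python check of that address math (`is_ratio_offset` is a hypothetical helper mirroring `VLLM_IS_RATIO + off_b * STRIDE + off_l % STRIDE`):

```python
def is_ratio_offset(off_b: int, off_l: int, stride: int) -> int:
    """Flat offset the kernels compute for the vllm_is_ratio load."""
    return off_b * stride + off_l % stride

B, L = 2, 4
# (B, L) per-token layout, stride = L: row 1 reads offsets 4..7.
assert [is_ratio_offset(1, l, stride=L) for l in range(L)] == [4, 5, 6, 7]
# (B, 1) per-sequence layout, stride = 1: row 1 always reads offset 1.
assert [is_ratio_offset(1, l, stride=1) for l in range(L)] == [1, 1, 1, 1]
```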

src/liger_kernel/transformers/grpo_loss.py

Lines changed: 2 additions & 0 deletions
@@ -22,6 +22,7 @@ def triton_grpo_loss(
     reduce=False,
     sapo_temperature_pos=1.0,
     sapo_temperature_neg=1.05,
+    vllm_is_ratio=None,
 ):
     assert logits is not None and completion_ids is not None and advantages is not None, (
         "must provide logits, completion_ids and advantages"
@@ -46,6 +47,7 @@ def triton_grpo_loss(
         loss_type,
         sapo_temperature_pos,
         sapo_temperature_neg,
+        vllm_is_ratio,
     )
     if not reduce:
         return per_token_loss, per_token_kl, is_clipped
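A hypothetical call on the Triton path is sketched below. The logits carry one extra position, matching the backward pass's `dlogits[:, -1, :] = 0`; `beta=0.0` is passed so no reference log-probs are needed. Keyword names other than `logits`, `completion_ids`, `advantages`, and `vllm_is_ratio` are assumptions about this wrapper's signature:

```python
import torch
from liger_kernel.transformers.grpo_loss import triton_grpo_loss

B, L, V = 2, 8, 128
logits = torch.randn(B, L + 1, V, device="cuda")  # Triton kernels require a GPU
completion_ids = torch.randint(0, V, (B, L), device="cuda")
advantages = torch.randn(B, device="cuda")
vllm_is_ratio = torch.rand(B, 1, device="cuda")   # per-sequence; (B, L) also accepted

per_token_loss, per_token_kl, is_clipped = triton_grpo_loss(
    logits=logits,
    completion_ids=completion_ids,
    advantages=advantages,
    beta=0.0,                    # skip the KL term, so ref_logp can stay None
    vllm_is_ratio=vllm_is_ratio,
)
# Per the test plan, vllm_is_ratio=None produces the same result as an all-ones ratio.
```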
