Commit 10078c2

add selected token ids and functional tests

2 parents: c061ed7 + 8731c54

3 files changed: +148 −39 lines

src/liger_kernel/chunked_loss/functional.py

Lines changed: 2 additions & 0 deletions
@@ -4,10 +4,12 @@
 from liger_kernel.chunked_loss.kto_loss import LigerFusedLinearKTOFunction
 from liger_kernel.chunked_loss.orpo_loss import LigerFusedLinearORPOFunction
 from liger_kernel.chunked_loss.simpo_loss import LigerFusedLinearSimPOFunction
+from liger_kernel.chunked_loss.grpo_loss import LigerFusedLinearGRPOFunction

 liger_fused_linear_orpo = LigerFusedLinearORPOFunction.apply
 liger_fused_linear_dpo = LigerFusedLinearDPOFunction.apply
 liger_fused_linear_jsd = LigerFusedLinearJSDFunction.apply
 liger_fused_linear_cpo = LigerFusedLinearCPOFunction.apply
 liger_fused_linear_simpo = LigerFusedLinearSimPOFunction.apply
 liger_fused_linear_kto = LigerFusedLinearKTOFunction.apply
+liger_fused_linear_grpo = LigerFusedLinearGRPOFunction.apply
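
With this alias registered, GRPO can be called through the same functional entry point as the other chunked losses. A minimal sketch (toy shapes; the positional argument order follows the functional test added below, and the trailing 1 is assumed to be the chunk size):

    import torch
    from liger_kernel.chunked_loss.functional import liger_fused_linear_grpo

    B, T, H, V = 2, 8, 16, 32  # hypothetical toy sizes
    x = torch.randn(B, T, H, requires_grad=True)
    weight = torch.randn(V, H, requires_grad=True)
    selected_token_ids = torch.randint(0, V, (B, T))
    attention_mask = torch.ones(B, T)
    advantages = torch.rand(B)

    # bias, ref_input, ref_weight, ref_bias, old_per_token_logps left as None;
    # beta=0.0 so no reference model or KL term is needed.
    loss, metrics = liger_fused_linear_grpo(
        x, weight, selected_token_ids, attention_mask, advantages,
        None, None, None, None, None,
        0.0, 0.2, 0.2, 1.0, True, False, 1,
    )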

src/liger_kernel/chunked_loss/fused_linear_rlhf.py

Lines changed: 22 additions & 15 deletions
@@ -20,6 +20,7 @@ def forward(
         ctx,
         _input,
         weight,
+        selected_token_ids,
         attention_mask,
         advantages,
         bias=None,
@@ -29,7 +30,7 @@ def forward(
         old_per_token_logps=None,
         epsilon_low=0.2,
         epsilon_high=0.2,
-        beta=0.1,
+        beta=0.04,
         temperature=1.0,
         compiled=True,
         use_ref_model=False,
@@ -42,6 +43,7 @@ def forward(
            ctx: Context for backward
            _input: Input tensor
            weight: Weight tensor
+           selected_token_ids: Selected token ids tensor
            attention_mask: Attention mask tensor
            advantages: Advantages tensor
            bias: Bias tensor
@@ -78,22 +80,23 @@ def forward(
             rlhf_loss_fn=cls.rlhf_loss_fn,
         )

-        def fused_fwd_bwd(input_chunk, attention_mask_chunk, advantages_chunk, ref_input_chunk, old_per_token_logps_chunk):
+        def fused_fwd_bwd(input_chunk, selected_token_ids_chunk, attention_mask_chunk, advantages_chunk, ref_input_chunk, old_per_token_logps_chunk):
             """Fused forward and backward for a chunk."""
-            argnums = (0, 1, 4) if bias is not None else (0, 1)
+            argnums = (0, 1, 5) if bias is not None else (0, 1)
             return torch.func.grad_and_value(compute_loss, argnums=argnums, has_aux=True)(
                 input_chunk,  # arg 0
                 weight,  # arg 1
-                attention_mask_chunk,  # arg 2
-                advantages_chunk,  # arg 3
-                bias,  # arg 4
-                ref_input_chunk=ref_input_chunk,  # arg 5
-                old_per_token_logps_chunk=old_per_token_logps_chunk,  # arg 6
+                selected_token_ids_chunk,  # arg 2
+                attention_mask_chunk,  # arg 3
+                advantages_chunk,  # arg 4
+                bias,  # arg 5
+                ref_input_chunk=ref_input_chunk,  # arg 6
+                old_per_token_logps_chunk=old_per_token_logps_chunk,  # arg 7
             )

-        def accumulate_chunk(input_chunk, attention_mask_chunk, advantages_chunk, ref_input_chunk=None, old_per_token_logps_chunk=None):
+        def accumulate_chunk(input_chunk, selected_token_ids_chunk, attention_mask_chunk, advantages_chunk, ref_input_chunk=None, old_per_token_logps_chunk=None):
             (chunk_grad_input, chunk_grad_weight, *chunk_grad_bias), (chunk_loss, chunk_metrics) = fused_fwd_bwd(
-                input_chunk, attention_mask_chunk, advantages_chunk, ref_input_chunk, old_per_token_logps_chunk
+                input_chunk, selected_token_ids_chunk, attention_mask_chunk, advantages_chunk, ref_input_chunk, old_per_token_logps_chunk
             )
             if bias is not None:
                 grad_bias.add_(chunk_grad_bias[0])
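
Why the argnums shift: torch.func.grad_and_value differentiates with respect to positional indices, so inserting selected_token_ids_chunk at position 2 pushes bias from index 4 to index 5, while (0, 1) for input and weight are unchanged. A toy illustration of the mechanics (hypothetical function, not the kernel):

    import torch

    def f(x, w, ids, b):
        # ids is integer metadata (like selected_token_ids): used via gather, never differentiated
        loss = (x @ w + b).gather(1, ids.unsqueeze(1)).sum()
        return loss, loss.detach()  # (output, aux), as required by has_aux=True

    x, w, b = torch.randn(3, 4), torch.randn(4, 5), torch.randn(5)
    ids = torch.randint(0, 5, (3,))
    # Differentiate w.r.t. x (0), w (1), and b (3); ids (2) is skipped.
    (gx, gw, gb), (loss, aux) = torch.func.grad_and_value(f, argnums=(0, 1, 3), has_aux=True)(x, w, ids, b)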
@@ -102,7 +105,6 @@ def accumulate_chunk(input_chunk, attention_mask_chunk, advantages_chunk, ref_in
             grad_weight.add_(chunk_grad_weight)
             grad_inputs.append(chunk_grad_input)
             loss_acc.add_(chunk_loss)
-
             # Initialize storage for metrics on first chunk
             if len(aggregated_metrics) == 0:
                 for metric in chunk_metrics:
@@ -126,16 +128,18 @@ def accumulate_chunk(input_chunk, attention_mask_chunk, advantages_chunk, ref_in
         # Process input in chunks based on chunk_size
         chunks = max(1, _input.shape[0] // chunk_size)
         _input_chunks = torch.chunk(_input, chunks=chunks, dim=0)
+        _selected_token_ids_chunks = torch.chunk(selected_token_ids, chunks=chunks, dim=0)
         _attention_mask_chunks = torch.chunk(attention_mask, chunks=chunks, dim=0)
         _advantages_chunks = torch.chunk(advantages, chunks=chunks, dim=0)
         _ref_input_chunks = torch.chunk(ref_input, chunks=chunks, dim=0) if use_ref_model else [None] * chunks
         _old_per_token_logps_chunks = torch.chunk(old_per_token_logps, chunks=chunks, dim=0) if old_per_token_logps is not None else [None] * chunks

-        for input_chunk, attention_mask_chunk, advantages_chunk, ref_input_chunk, old_per_token_logps_chunk in zip(
-            _input_chunks, _attention_mask_chunks, _advantages_chunks, _ref_input_chunks, _old_per_token_logps_chunks
+        for input_chunk, selected_token_ids_chunk, attention_mask_chunk, advantages_chunk, ref_input_chunk, old_per_token_logps_chunk in zip(
+            _input_chunks, _selected_token_ids_chunks, _attention_mask_chunks, _advantages_chunks, _ref_input_chunks, _old_per_token_logps_chunks
         ):
             # Mark dynamic dimensions
             torch._dynamo.mark_dynamic(input_chunk, 1)
+            torch._dynamo.mark_dynamic(selected_token_ids_chunk, 1)
             torch._dynamo.mark_dynamic(attention_mask_chunk, 1)
             if use_ref_model:
                 torch._dynamo.mark_dynamic(ref_input_chunk, 1)
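
Chunking note: because every per-sample tensor is split with the same chunks count along dim=0, chunk i of selected_token_ids stays row-aligned with chunk i of the hidden states, even when torch.chunk produces unequal piece sizes. For example:

    import torch

    hidden = torch.randn(5, 7)
    ids = torch.arange(5)
    for h_chunk, id_chunk in zip(torch.chunk(hidden, chunks=2, dim=0), torch.chunk(ids, chunks=2, dim=0)):
        assert h_chunk.shape[0] == id_chunk.shape[0]  # sizes 3 and 2, aligned pairwise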
@@ -146,7 +150,7 @@ def accumulate_chunk(input_chunk, attention_mask_chunk, advantages_chunk, ref_in
             else:
                 old_per_token_logps_chunk = None

-            accumulate_chunk(input_chunk, attention_mask_chunk, advantages_chunk, ref_input_chunk, old_per_token_logps_chunk)
+            accumulate_chunk(input_chunk, selected_token_ids_chunk, attention_mask_chunk, advantages_chunk, ref_input_chunk, old_per_token_logps_chunk)

         # Combine gradients
         grad_input = torch.cat(grad_inputs, dim=0)
@@ -168,6 +172,7 @@ def accumulate_chunk(input_chunk, attention_mask_chunk, advantages_chunk, ref_in
     def _compute_chunk_loss(
         input_chunk,
         weight,
+        selected_token_ids_chunk,
         attention_mask_chunk,
         advantages_chunk,
         bias=None,
@@ -178,7 +183,7 @@ def _compute_chunk_loss(
         full_attention_mask=None,
         epsilon_low=0.2,
         epsilon_high=0.2,
-        beta=0.1,
+        beta=0.04,
         temperature=1.0,
         use_ref_model=False,
         rlhf_loss_fn=None,
@@ -196,6 +201,7 @@ def _compute_chunk_loss(
         # Compute chunk loss and metrics using the provided loss function
         chunk_loss, chunk_metrics = rlhf_loss_fn(
             log_probs=log_probs,
+            selected_token_ids=selected_token_ids_chunk,
             attention_mask=attention_mask_chunk,
             advantages=advantages_chunk,
             full_attention_mask=full_attention_mask,
@@ -236,6 +242,7 @@ def backward(ctx, grad_output, *grad_metrics):
         return (
             grad_input,
             grad_weight,
+            None,  # grad_selected_token_ids
             None,  # grad_attention_mask
             None,  # grad_advantages
             grad_bias,
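
Backward contract: autograd requires backward to return exactly one value per forward input, in order, with None for non-differentiable arguments, which is why the integer selected_token_ids gets its own None slot here. A minimal sketch of the pattern (hypothetical Function, not the kernel):

    import torch

    class ScaleByConst(torch.autograd.Function):
        @staticmethod
        def forward(ctx, x, token_ids, scale):
            ctx.scale = scale
            return x * scale  # token_ids is metadata; no gradient flows through it

        @staticmethod
        def backward(ctx, grad_out):
            # One slot per forward input: x, token_ids, scale
            return grad_out * ctx.scale, None, None

    x = torch.randn(4, requires_grad=True)
    y = ScaleByConst.apply(x, torch.randint(0, 10, (4,)), 2.0)
    y.sum().backward()  # x.grad == 2.0 everywhere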

test/chunked_loss/test_grpo_loss.py

Lines changed: 124 additions & 24 deletions
@@ -2,7 +2,9 @@
 import torch
 import torch.nn.functional as F

+from liger_kernel.chunked_loss import LigerFusedLinearGRPOLoss
 from liger_kernel.chunked_loss.grpo_loss import LigerFusedLinearGRPOFunction
+from liger_kernel.chunked_loss.functional import liger_fused_linear_grpo
 from liger_kernel.utils import infer_device
 from test.utils import assert_verbose_allclose
 from test.utils import set_seed
@@ -40,6 +42,7 @@ def __init__(
     def forward(
         self,
         x,  # Shape: [batch_size, seq_len, hidden_size]
+        selected_token_ids,  # Shape: [batch_size, seq_len]
         attention_mask,  # Shape: [batch_size, seq_len]
         advantages,  # Shape: [batch_size,]
         ref_input=None,  # Shape: [batch_size, seq_len, hidden_size]
@@ -54,8 +57,7 @@ def forward(
         log_probs = F.log_softmax(logits, dim=-1)

         # Get chosen token probabilities
-        chosen_tokens = log_probs.argmax(dim=-1)
-        chosen_token_logprobs = log_probs.gather(dim=-1, index=chosen_tokens.unsqueeze(-1)).squeeze(-1)
+        per_token_logps = log_probs.gather(dim=-1, index=selected_token_ids.unsqueeze(-1)).squeeze(-1)

         # Get reference model probabilities
         if self.use_ref_model:
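
The substantive fix in this hunk: the old reference implementation scored the greedy (argmax) token, while GRPO must score the token that was actually sampled during rollout, hence gathering log-probs at selected_token_ids. Shape-wise:

    import torch
    import torch.nn.functional as F

    log_probs = F.log_softmax(torch.randn(2, 3, 5), dim=-1)   # [B=2, T=3, V=5]
    selected_token_ids = torch.randint(0, 5, (2, 3))          # [B, T]
    # index must be [B, T, 1] for gather along the vocab dim; squeeze restores [B, T]
    per_token_logps = log_probs.gather(dim=-1, index=selected_token_ids.unsqueeze(-1)).squeeze(-1)
    assert per_token_logps.shape == (2, 3)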
@@ -66,22 +68,21 @@ def forward(
             if self.temperature != 1.0:
                 ref_logits = ref_logits / self.temperature
             ref_log_probs = F.log_softmax(ref_logits, dim=-1)
-            ref_token_logprobs = ref_log_probs.gather(dim=-1, index=chosen_tokens.unsqueeze(-1)).squeeze(-1)
+            ref_per_token_logps = ref_log_probs.gather(dim=-1, index=selected_token_ids.unsqueeze(-1)).squeeze(-1)
         else:
-            ref_token_logprobs = chosen_token_logprobs.detach()
-
+            ref_per_token_logps = per_token_logps.detach()

         # Compute policy gradient loss with importance sampling ratio
-        old_per_token_logps = old_per_token_logps if old_per_token_logps is not None else chosen_token_logprobs.detach()
-        coef_1 = torch.exp(chosen_token_logprobs - old_per_token_logps)
+        old_per_token_logps = old_per_token_logps if old_per_token_logps is not None else per_token_logps.detach()
+        coef_1 = torch.exp(per_token_logps - old_per_token_logps)
         coef_2 = torch.clamp(coef_1, 1 - self.epsilon_low, 1 + self.epsilon_high)
         per_token_loss1 = coef_1 * advantages.unsqueeze(1)
         per_token_loss2 = coef_2 * advantages.unsqueeze(1)
         per_token_loss = -torch.min(per_token_loss1, per_token_loss2)
         if self.beta != 0.0:
             # Compute KL divergence between model and reference model
             kl_div = (
-                torch.exp(ref_token_logprobs - chosen_token_logprobs) - (ref_token_logprobs - chosen_token_logprobs) - 1.0
+                torch.exp(ref_per_token_logps - per_token_logps) - (ref_per_token_logps - per_token_logps) - 1.0
             )
             per_token_loss = per_token_loss + self.beta * kl_div

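For reference, with importance ratio r_t = exp(per_token_logps - old_per_token_logps) and advantage A, the per-token objective computed above is the PPO-style clipped surrogate plus a KL penalty:

    loss_t = -min(r_t * A, clip(r_t, 1 - epsilon_low, 1 + epsilon_high) * A)
             + beta * (exp(ref_t - logp_t) - (ref_t - logp_t) - 1)

where logp_t and ref_t are the policy and reference per-token log-probs. The beta term is the so-called k3 KL estimator: it is non-negative for any pair of log-probs (since e^x - x - 1 >= 0) and is an unbiased estimate of KL(policy || reference).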
@@ -90,7 +91,7 @@ def forward(

         # Compute metrics
         metrics = [
-            chosen_token_logprobs.mean(),
+            per_token_logps.mean(),
             log_probs.mean(),
         ]
         if self.beta != 0.0:
@@ -118,16 +119,18 @@ def __init__(
         super().__init__()
         self.lin = torch.nn.Linear(in_features=H, out_features=V, bias=bias, dtype=dtype)
         self.ref_lin = torch.nn.Linear(in_features=H, out_features=V, bias=ref_bias, dtype=dtype)
-        self.grpo_loss = LigerFusedLinearGRPOFunction.apply
-        self.beta = beta
-        self.epsilon_low = epsilon_low
-        self.epsilon_high = epsilon_high
-        self.temperature = temperature
-        self.use_ref_model = use_ref_model
+        self.grpo_loss = LigerFusedLinearGRPOLoss(
+            beta=beta,
+            epsilon_low=epsilon_low,
+            epsilon_high=epsilon_high,
+            temperature=temperature,
+            use_ref_model=use_ref_model,
+        )

     def forward(
         self,
         x,
+        selected_token_ids,
         attention_mask,
         advantages,
         ref_input=None,
@@ -137,19 +140,14 @@ def forward(
         return self.grpo_loss(
             x,  # _input
             self.lin.weight,  # weight
+            selected_token_ids,  # selected_token_ids
             attention_mask,  # attention_mask
             advantages,  # advantages
             self.lin.bias,  # bias
             ref_input,  # ref_input
             self.ref_lin.weight,  # ref_weight
             self.ref_lin.bias,  # ref_bias
             old_per_token_logps,  # old_per_token_logps
-            self.beta,  # beta
-            self.epsilon_low,  # epsilon_low
-            self.epsilon_high,  # epsilon_high
-            self.temperature,  # temperature
-            True,  # compiled
-            self.use_ref_model,  # use_ref_model
         )
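
With hyperparameters bound at construction, the module-style wrapper only takes tensors at call time. A sketch of the pattern (toy shapes; optional reference arguments omitted on the assumption they default to None when use_ref_model=False):

    import torch
    from liger_kernel.chunked_loss import LigerFusedLinearGRPOLoss

    B, T, H, V = 2, 8, 16, 32  # hypothetical toy sizes
    loss_fn = LigerFusedLinearGRPOLoss(beta=0.0, epsilon_low=0.2, epsilon_high=0.2, temperature=1.0, use_ref_model=False)
    lin = torch.nn.Linear(H, V)
    x = torch.randn(B, T, H, requires_grad=True)
    loss, metrics = loss_fn(
        x,                            # _input
        lin.weight,                   # weight
        torch.randint(0, V, (B, T)),  # selected_token_ids
        torch.ones(B, T),             # attention_mask
        torch.rand(B),                # advantages
        lin.bias,                     # bias
    )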

@@ -173,7 +171,7 @@ def forward(
     "beta, epsilon_low, epsilon_high, temperature",
     [
         # Standard settings
-        (0.1, 0.2, 0.2, 1.0),
+        (0.1, 0.2, 0.2, 20.0),  # set temperature to 20.0 for better numerical stability
         (0.0, 0.1, 0.1, 2.0),
     ]
 )
@@ -240,6 +238,9 @@ def test_correctness(
     input1 = _input.detach().clone().requires_grad_(True)
     input2 = _input.detach().clone().requires_grad_(True)

+    # Create selected token ids with shape [B, T]
+    selected_token_ids = torch.randint(0, V, (B, T), device=device)
+
     # Create attention mask with random padding [B, T]
     attention_mask = torch.ones(B, T, device=device)
     num_elements_to_mask = torch.randint(1, B * T // 2, (1,)).item()
@@ -259,13 +260,15 @@ def test_correctness(

     # Forward pass with reference model
     loss1, aux1 = torch_lm_head_grpo(
-        input1, attention_mask, advantages, ref_input=ref_input, old_per_token_logps=old_per_token_logps
+        input1, selected_token_ids, attention_mask, advantages, ref_input=ref_input, old_per_token_logps=old_per_token_logps
     )
     loss2, aux2 = liger_lm_head_grpo(
-        input2, attention_mask, advantages, ref_input=ref_input, old_per_token_logps=old_per_token_logps
+        input2, selected_token_ids, attention_mask, advantages, ref_input=ref_input, old_per_token_logps=old_per_token_logps
     )

     # Check losses match
+    assert not torch.isnan(loss1).any()  # isnan is the correct check; `x != float('nan')` is always True
+    assert not torch.isnan(loss2).any()
     assert_verbose_allclose(loss1, loss2, atol=atol, rtol=rtol)

     # Check metrics match
@@ -292,3 +295,100 @@ def test_correctness(
         atol=atol,
         rtol=rtol,
     )
+
+@pytest.mark.parametrize(
+    "B, T, H, V",
+    [
+        (8, 128, 1024, 4096),
+        (3, 47, 31, 123),  # random shape
+    ],
+)
+@pytest.mark.parametrize(
+    "scalar, dtype, atol, rtol",
+    [
+        (1.0, torch.bfloat16, 5e-2, 5e-2),
+        (1.0, torch.float32, 1e-4, 5e-3),
+    ],
+)
+@pytest.mark.parametrize("bias", [True, False])
+@pytest.mark.parametrize("ref_bias", [True, False])
+@pytest.mark.parametrize(
+    "beta, epsilon_low, epsilon_high, temperature",
+    [
+        # Standard settings
+        (0.1, 0.2, 0.2, 20.0),  # set temperature to 20.0 for better numerical stability
+        (0.0, 0.1, 0.1, 2.0),
+    ]
+)
+@pytest.mark.parametrize("use_ref_model", [True, False])
+@pytest.mark.parametrize("old_per_token_logps", [True, False])
+def test_functional_correctness(
+    B, T, H, V, scalar, dtype, atol, rtol, bias, ref_bias, beta, epsilon_low, epsilon_high, temperature, use_ref_model, old_per_token_logps
+):
+    _input = torch.randn(B, T, H, device=device, dtype=dtype) * scalar
+    input1 = _input.detach().clone().requires_grad_(True)
+    input2 = _input.detach().clone().requires_grad_(True)
+
+    _weight = torch.randn(V, H, device=device, dtype=dtype) * scalar
+    weight1 = _weight.detach().clone().requires_grad_(True)
+    weight2 = _weight.detach().clone().requires_grad_(True)
+
+    selected_token_ids = torch.randint(0, V, (B, T), device=device)
+
+    attention_mask = torch.ones(B, T, device=device)
+
+    advantages = torch.rand(B, device=device, dtype=dtype)
+
+    if bias:
+        _bias = torch.randn(V, device=device, dtype=dtype) * scalar
+        bias1 = _bias.detach().clone().requires_grad_(True)
+        bias2 = _bias.detach().clone().requires_grad_(True)
+    else:
+        bias1 = None
+        bias2 = None
+
+    ref_input = torch.randn(B, T, H, device=device, dtype=dtype) * scalar
+
+    _ref_weight = torch.randn(V, H, device=device, dtype=dtype) * scalar
+    ref_weight1 = _ref_weight.detach().clone().requires_grad_(True)
+    ref_weight2 = _ref_weight.detach().clone().requires_grad_(True)
+
+    if ref_bias:
+        _ref_bias = torch.randn(V, device=device, dtype=dtype) * scalar
+        ref_bias1 = _ref_bias.detach().clone().requires_grad_(True)
+        ref_bias2 = _ref_bias.detach().clone().requires_grad_(True)
+    else:
+        ref_bias1 = None
+        ref_bias2 = None
+
+    if old_per_token_logps:
+        old_per_token_logps = torch.randn(B, T, device=device, dtype=dtype) * scalar
+    else:
+        old_per_token_logps = None
+
+    loss1, aux1 = liger_fused_linear_grpo(
+        input1, weight1, selected_token_ids, attention_mask, advantages, bias1, ref_input, ref_weight1, ref_bias1, old_per_token_logps, beta, epsilon_low, epsilon_high, temperature, True, use_ref_model, 1
+    )
+
+    loss2, aux2 = LigerFusedLinearGRPOFunction.apply(
+        input2, weight2, selected_token_ids, attention_mask, advantages, bias2, ref_input, ref_weight2, ref_bias2, old_per_token_logps, beta, epsilon_low, epsilon_high, temperature, True, use_ref_model, 1
+    )
+
+    assert not torch.isnan(loss1).any()  # isnan is the correct check; `x != float('nan')` is always True
+    assert not torch.isnan(loss2).any()
+    assert_verbose_allclose(loss1, loss2, atol=atol, rtol=rtol)
+
+    # Check metrics match
+    assert len(aux1) == len(aux2)
+    for metric1, metric2 in zip(aux1, aux2):
+        assert_verbose_allclose(metric1, metric2, atol=atol, rtol=rtol)
+
+    # Backward pass
+    loss1.backward()
+    loss2.backward()
+
+    # Check gradients match
+    assert_verbose_allclose(input1.grad, input2.grad, atol=atol, rtol=rtol)
+    assert_verbose_allclose(weight1.grad, weight2.grad, atol=atol, rtol=rtol)
+    if bias:
+        assert_verbose_allclose(bias1.grad, bias2.grad, atol=atol, rtol=rtol)
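
To run just the new functional test in isolation:

    python -m pytest test/chunked_loss/test_grpo_loss.py -k test_functional_correctness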
