22from functools import partial
33
44import torch
5+ import torch ._dynamo .config
56import torch .nn .functional as F
67
78
@@ -20,15 +21,18 @@ def forward(
2021 _input ,
2122 weight ,
2223 attention_mask ,
23- rewards ,
24+ advantages ,
2425 bias = None ,
25- num_generations = 4 ,
26- beta = 0.1 ,
27- compiled = True ,
28- use_ref_model = False ,
2926 ref_input = None ,
3027 ref_weight = None ,
3128 ref_bias = None ,
29+ old_per_token_logps = None ,
30+ epsilon_low = 0.2 ,
31+ epsilon_high = 0.2 ,
32+ beta = 0.1 ,
33+ temperature = 1.0 ,
34+ compiled = True ,
35+ use_ref_model = False ,
3236 chunk_size = 1 ,
3337 ):
3438 """Chunked forward pass for RLHF loss computation.
@@ -39,21 +43,20 @@ def forward(
3943 _input: Input tensor
4044 weight: Weight tensor
4145 attention_mask: Attention mask tensor
42- rewards: Rewards tensor
46+ advantages: Advantages tensor
4347 bias: Bias tensor
44- num_generations: Number of generations per prompt
45- beta: Weight for the KL penalty
46- compiled: Whether to use torch compile
47- use_ref_model: Whether to use a reference model
4848 ref_input: Reference model input tensor
4949 ref_weight: Reference model weight tensor
5050 ref_bias: Reference model bias tensor
51+ old_per_token_logps: Old per token log probabilities tensor
52+ epsilon_low: Lower bound for clipping the importance sampling ratio
53+ epsilon_high: Upper bound for clipping the importance sampling ratio
54+ beta: Weight for the KL penalty
55+ temperature: Temperature for the logits
56+ compiled: Whether to use torch compile
57+ use_ref_model: Whether to use a reference model
5158 chunk_size: Size of chunks for processing in other loss modules
5259 """
53- # Save for backward
54- ctx .beta = beta
55- ctx .rewards = rewards
56-
5760 # Initialize accumulators
5861 loss_acc = torch .zeros ((), device = _input .device )
5962 grad_weight = torch .zeros_like (weight ) # [V, H]
@@ -64,43 +67,36 @@ def forward(
6467 # Create a partial function with fixed arguments
6568 compute_loss = partial (
6669 LigerFusedLinearRLHFBase ._compute_chunk_loss ,
67- beta = beta ,
68- use_ref_model = use_ref_model ,
6970 ref_weight = ref_weight ,
7071 ref_bias = ref_bias ,
72+ full_attention_mask = attention_mask ,
73+ epsilon_low = epsilon_low ,
74+ epsilon_high = epsilon_high ,
75+ beta = beta ,
76+ temperature = temperature ,
77+ use_ref_model = use_ref_model ,
7178 rlhf_loss_fn = cls .rlhf_loss_fn ,
7279 )
7380
74- def fused_fwd_bwd (input_chunk , attention_mask_chunk , rewards_chunk , ref_input_chunk ):
81+ def fused_fwd_bwd (input_chunk , attention_mask_chunk , advantages_chunk , ref_input_chunk , old_per_token_logps_chunk ):
7582 """Fused forward and backward for a chunk."""
83+ argnums = (0 , 1 , 4 ) if bias is not None else (0 , 1 )
84+ return torch .func .grad_and_value (compute_loss , argnums = argnums , has_aux = True )(
85+ input_chunk , # arg 0
86+ weight , # arg 1
87+ attention_mask_chunk , # arg 2
88+ advantages_chunk , # arg 3
89+ bias , # arg 4
90+ ref_input_chunk = ref_input_chunk , # arg 5
91+ old_per_token_logps_chunk = old_per_token_logps_chunk , # arg 6
92+ )
93+
94+ def accumulate_chunk (input_chunk , attention_mask_chunk , advantages_chunk , ref_input_chunk = None , old_per_token_logps_chunk = None ):
95+ (chunk_grad_input , chunk_grad_weight , * chunk_grad_bias ), (chunk_loss , chunk_metrics ) = fused_fwd_bwd (
96+ input_chunk , attention_mask_chunk , advantages_chunk , ref_input_chunk , old_per_token_logps_chunk
97+ )
7698 if bias is not None :
77- return torch .func .grad_and_value (compute_loss , argnums = (0 , 1 , 5 ), has_aux = True )(
78- input_chunk , # arg 0
79- weight , # arg 1
80- attention_mask_chunk , # arg 2
81- rewards_chunk , # arg 3
82- ref_input_chunk , # arg 4
83- bias , # arg 5
84- )
85- else :
86- return torch .func .grad_and_value (compute_loss , argnums = (0 , 1 ), has_aux = True )(
87- input_chunk , # arg 0
88- weight , # arg 1
89- attention_mask_chunk , # arg 2
90- rewards_chunk , # arg 3
91- ref_input_chunk , # arg 4
92- )
93-
94- def accumulate_chunk (input_chunk , attention_mask_chunk , rewards_chunk , ref_input_chunk = None ):
95- if bias is not None :
96- (chunk_grad_input , chunk_grad_weight , chunk_grad_bias ), (chunk_loss , chunk_metrics ) = fused_fwd_bwd (
97- input_chunk , attention_mask_chunk , rewards_chunk , ref_input_chunk
98- )
99- grad_bias .add_ (chunk_grad_bias )
100- else :
101- (chunk_grad_input , chunk_grad_weight ), (chunk_loss , chunk_metrics ) = fused_fwd_bwd (
102- input_chunk , attention_mask_chunk , rewards_chunk , ref_input_chunk
103- )
99+ grad_bias .add_ (chunk_grad_bias [0 ])
104100
105101 # Accumulate gradients and loss
106102 grad_weight .add_ (chunk_grad_weight )
@@ -123,28 +119,34 @@ def accumulate_chunk(input_chunk, attention_mask_chunk, rewards_chunk, ref_input
123119 aggregated_metrics [i ].append (metric )
124120
125121 if compiled :
126- accumulate_chunk = torch .compile (accumulate_chunk )
122+ # TODO: Figure out what is better to compile here
123+ # accumulate_chunk = torch.compile(accumulate_chunk)
124+ fused_fwd_bwd = torch .compile (fused_fwd_bwd )
127125
128- # Process input in chunks based on num_generations
129- chunks = max (1 , _input .shape [0 ] // num_generations )
126+ # Process input in chunks based on chunk_size
127+ chunks = max (1 , _input .shape [0 ] // chunk_size )
130128 _input_chunks = torch .chunk (_input , chunks = chunks , dim = 0 )
131129 _attention_mask_chunks = torch .chunk (attention_mask , chunks = chunks , dim = 0 )
132- _rewards_chunks = torch .chunk (rewards , chunks = chunks , dim = 0 )
130+ _advantages_chunks = torch .chunk (advantages , chunks = chunks , dim = 0 )
133131 _ref_input_chunks = torch .chunk (ref_input , chunks = chunks , dim = 0 ) if use_ref_model else [None ] * chunks
132+ _old_per_token_logps_chunks = torch .chunk (old_per_token_logps , chunks = chunks , dim = 0 ) if old_per_token_logps is not None else [None ] * chunks
134133
135- for input_chunk , attention_mask_chunk , rewards_chunk , ref_input_chunk in zip (
136- _input_chunks , _attention_mask_chunks , _rewards_chunks , _ref_input_chunks
134+ for input_chunk , attention_mask_chunk , advantages_chunk , ref_input_chunk , old_per_token_logps_chunk in zip (
135+ _input_chunks , _attention_mask_chunks , _advantages_chunks , _ref_input_chunks , _old_per_token_logps_chunks
137136 ):
138137 # Mark dynamic dimensions
139138 torch ._dynamo .mark_dynamic (input_chunk , 1 )
140139 torch ._dynamo .mark_dynamic (attention_mask_chunk , 1 )
141- if ref_input_chunk is not None :
140+ if use_ref_model :
142141 torch ._dynamo .mark_dynamic (ref_input_chunk , 1 )
142+ else :
143+ ref_input_chunk = None
144+ if old_per_token_logps is not None :
145+ torch ._dynamo .mark_dynamic (old_per_token_logps_chunk , 1 )
146+ else :
147+ old_per_token_logps_chunk = None
143148
144- accumulate_chunk (input_chunk , attention_mask_chunk , rewards_chunk , ref_input_chunk )
145-
146- # Scale accumulated loss by number of chunks since we're averaging
147- loss_acc = loss_acc / chunks
149+ accumulate_chunk (input_chunk , attention_mask_chunk , advantages_chunk , ref_input_chunk , old_per_token_logps_chunk )
148150
149151 # Combine gradients
150152 grad_input = torch .cat (grad_inputs , dim = 0 )
@@ -158,7 +160,7 @@ def accumulate_chunk(input_chunk, attention_mask_chunk, rewards_chunk, ref_input
158160 if isinstance (metric , list ):
159161 final_metrics .append (torch .cat (metric , dim = 0 ))
160162 else :
161- final_metrics .append (metric / chunks )
163+ final_metrics .append (metric )
162164
163165 return loss_acc , tuple (final_metrics )
164166
@staticmethod
def _compute_chunk_loss(
    input_chunk,
    weight,
    attention_mask_chunk,
    advantages_chunk,
    bias=None,
    ref_input_chunk=None,
    ref_weight=None,
    ref_bias=None,
    old_per_token_logps_chunk=None,
    full_attention_mask=None,
    epsilon_low=0.2,
    epsilon_high=0.2,
    beta=0.1,
    temperature=1.0,
    use_ref_model=False,
    rlhf_loss_fn=None,
):
    """Compute the RLHF loss and metrics for one chunk of the batch.

    Runs the policy head (and, optionally, a frozen reference head) over the
    chunk, then delegates the actual loss computation to ``rlhf_loss_fn``.

    Args:
        input_chunk: Hidden-state chunk fed to the policy LM head.
        weight: Policy LM-head weight.
        attention_mask_chunk: Attention mask for this chunk.
        advantages_chunk: Advantages for this chunk.
        bias: Optional policy LM-head bias.
        ref_input_chunk: Optional hidden-state chunk for the reference model.
        ref_weight: Reference LM-head weight.
        ref_bias: Optional reference LM-head bias.
        old_per_token_logps_chunk: Old per-token log-probs for this chunk.
        full_attention_mask: Attention mask for the full (unchunked) batch.
        epsilon_low: Lower clipping bound for the importance-sampling ratio.
        epsilon_high: Upper clipping bound for the importance-sampling ratio.
        beta: Weight of the KL penalty.
        temperature: Temperature applied to the logits.
        use_ref_model: Whether to compute reference log-probs at all.
        rlhf_loss_fn: Callable implementing the concrete RLHF loss.

    Returns:
        Tuple of (chunk_loss, chunk_metrics) as produced by ``rlhf_loss_fn``.
    """
    # Policy log-probabilities for this chunk (logits are discarded here).
    log_probs, _ = LigerFusedLinearRLHFBase.chunk_forward(
        input_chunk, weight, bias=bias, temperature=temperature
    )

    # Reference log-probabilities, computed without autograd when requested.
    ref_log_probs = None
    if use_ref_model and ref_input_chunk is not None:
        with torch.no_grad():
            ref_log_probs, _ = LigerFusedLinearRLHFBase.chunk_forward(
                ref_input_chunk, ref_weight, bias=ref_bias, temperature=temperature
            )

    # Delegate to the concrete loss implementation supplied by the subclass.
    chunk_loss, chunk_metrics = rlhf_loss_fn(
        log_probs=log_probs,
        attention_mask=attention_mask_chunk,
        advantages=advantages_chunk,
        full_attention_mask=full_attention_mask,
        ref_log_probs=ref_log_probs,
        old_per_token_logps=old_per_token_logps_chunk,
        epsilon_low=epsilon_low,
        epsilon_high=epsilon_high,
        beta=beta,
    )
    return chunk_loss, chunk_metrics
199210
200211 @staticmethod
def chunk_forward(input_chunk, weight, bias=None, temperature=1.0):
    """Project a hidden-state chunk through the LM head without reshaping.

    Args:
        input_chunk: Hidden states of shape [B, T, H].
        weight: LM-head weight of shape [V, H].
        bias: Optional bias, broadcast over the logits to [B, T, V].
        temperature: Softmax temperature; logits are divided by it when != 1.

    Returns:
        Tuple of (log_probs, logits) where log_probs is the float32
        log-softmax of the (temperature-scaled) logits over the vocab dim.
    """
    # Batched matmul: [B, T, H] @ [H, V] -> [B, T, V]
    raw_logits = torch.matmul(input_chunk, weight.t())
    if bias is not None:
        raw_logits = raw_logits + bias  # bias broadcasts to [B, T, V]
    if temperature != 1.0:
        raw_logits = raw_logits / temperature

    # log-softmax in float32 over the vocabulary dimension
    token_log_probs = F.log_softmax(raw_logits.float(), dim=-1)
    return token_log_probs, raw_logits
215225
216226 @staticmethod
217227 def backward (ctx , grad_output , * grad_metrics ):
@@ -227,14 +237,17 @@ def backward(ctx, grad_output, *grad_metrics):
227237 grad_input ,
228238 grad_weight ,
229239 None , # grad_attention_mask
230- None , # grad_rewards
240+ None , # grad_advantages
231241 grad_bias ,
232- None , # grad_num_generations
233- None , # grad_beta
234- None , # grad_compiled
235- None , # grad_use_ref_model
236242 None , # grad_ref_input
237243 None , # grad_ref_weight
238244 None , # grad_ref_bias
245+ None , # grad_old_per_token_logps
246+ None , # grad_epsilon_low
247+ None , # grad_epsilon_high
248+ None , # grad_beta
249+ None , # grad_temperature
250+ None , # grad_compiled
251+ None , # grad_use_ref_model
239252 None , # grad_chunk_size
240253 )
0 commit comments