Skip to content

Commit bc068d8

Browse files
committed
luspo fix
1 parent 0248d72 commit bc068d8

File tree

1 file changed

+98
-36
lines changed

1 file changed

+98
-36
lines changed

src/liger_kernel/chunked_loss/fused_linear_ppo.py

Lines changed: 98 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717

1818
_SELECTIVE_LOGPROB_VOCAB_CHUNK_SIZE = 2048
19+
_SELECTIVE_LOGPROB_EXACT_VOCAB_THRESHOLD = 4096
1920

2021

2122
def _next_power_of_two(x):
@@ -115,8 +116,8 @@ def _selective_logprob_forward_torch(hidden, weight, targets, bias=None, tempera
115116
end = min(start + vocab_chunk_size, vocab_size)
116117
chunk_width = end - start
117118
weight_chunk = weight[start:end].to(hidden.dtype)
118-
torch.mm(hidden, weight_chunk.t(), out=mm_buf[:, :chunk_width])
119119
logits_chunk = logits_buf[:, :chunk_width]
120+
torch.mm(hidden, weight_chunk.t(), out=mm_buf[:, :chunk_width])
120121
logits_chunk.copy_(mm_buf[:, :chunk_width])
121122
if bias is not None:
122123
logits_chunk.add_(bias[start:end].to(torch.float32))
@@ -138,6 +139,48 @@ def _selective_logprob_forward_torch(hidden, weight, targets, bias=None, tempera
138139
return target_logit - log_z, log_z
139140

140141

142+
def _selective_logprob_forward_autograd(hidden, weight, targets, bias=None, temperature=1.0, vocab_chunk_size=2048):
143+
n_rows, _ = hidden.shape
144+
vocab_size, _ = weight.shape
145+
inv_t = 1.0 / temperature
146+
147+
if vocab_size <= _SELECTIVE_LOGPROB_EXACT_VOCAB_THRESHOLD:
148+
logits = hidden @ weight.to(hidden.dtype).t()
149+
logits = logits.float()
150+
if bias is not None:
151+
logits = logits + bias.to(torch.float32)
152+
logits = logits * inv_t
153+
return torch.log_softmax(logits, dim=-1).gather(-1, targets.unsqueeze(-1)).squeeze(-1)
154+
155+
max_old = torch.full((n_rows,), float("-inf"), device=hidden.device, dtype=torch.float32)
156+
sum_exp = torch.zeros((n_rows,), device=hidden.device, dtype=torch.float32)
157+
target_logit = torch.zeros((n_rows,), device=hidden.device, dtype=torch.float32)
158+
row_idx = torch.arange(n_rows, device=hidden.device)
159+
160+
for start in range(0, vocab_size, vocab_chunk_size):
161+
end = min(start + vocab_chunk_size, vocab_size)
162+
logits_chunk = hidden @ weight[start:end].to(hidden.dtype).t()
163+
logits_chunk = logits_chunk.float()
164+
if bias is not None:
165+
logits_chunk = logits_chunk + bias[start:end].to(torch.float32)
166+
logits_chunk = logits_chunk * inv_t
167+
168+
chunk_max = logits_chunk.amax(dim=-1)
169+
max_new = torch.maximum(max_old, chunk_max)
170+
rescale = torch.exp(max_old - max_new)
171+
chunk_exp = torch.exp(logits_chunk - max_new.unsqueeze(-1))
172+
173+
sum_exp = sum_exp * rescale + chunk_exp.sum(dim=-1)
174+
max_old = max_new
175+
176+
in_chunk = (targets >= start) & (targets < end)
177+
local_idx = torch.clamp(targets - start, 0, end - start - 1)
178+
target_logit = target_logit + logits_chunk[row_idx, local_idx] * in_chunk
179+
180+
log_z = max_old + torch.log(sum_exp)
181+
return target_logit - log_z
182+
183+
141184
def _selective_logprob_forward_triton(hidden, weight, targets, bias=None, temperature=1.0, vocab_chunk_size=2048):
142185
n_rows, hidden_size = hidden.shape
143186
block_h = min(128, _next_power_of_two(hidden_size))
@@ -171,18 +214,6 @@ def _selective_logprob_forward_triton(hidden, weight, targets, bias=None, temper
171214

172215

173216
def _selective_logprob_forward(hidden, weight, targets, bias=None, temperature=1.0, vocab_chunk_size=2048):
    """Entry point for selected-token log-probability computation.

    Kept as a thin dispatcher so call sites do not need to know which backend
    implementation computes the per-token log-probabilities; currently the
    chunked torch implementation is always used.
    """
    return _selective_logprob_forward_torch(
        hidden,
        weight,
        targets,
        bias=bias,
        temperature=temperature,
        vocab_chunk_size=vocab_chunk_size,
    )
187218

188219

@@ -226,12 +257,12 @@ def _selective_logprob_backward(hidden, weight, targets, bias, log_z, grad_logpr
226257
n_rows, _ = hidden.shape
227258
vocab_size = weight.shape[0]
228259
has_bias = bias is not None
260+
hidden_fp32 = hidden.float()
229261

230262
grad_hidden = torch.zeros(hidden.shape, device=hidden.device, dtype=torch.float32)
231263
grad_weight = torch.zeros(weight.shape, device=weight.device, dtype=torch.float32)
232264
grad_bias = torch.zeros((vocab_size,), device=weight.device, dtype=torch.float32) if has_bias else None
233265

234-
mm_buf = torch.empty((n_rows, vocab_chunk_size), device=hidden.device, dtype=hidden.dtype)
235266
logits_buf = torch.empty((n_rows, vocab_chunk_size), device=hidden.device, dtype=torch.float32)
236267

237268
grad_logprobs = grad_logprobs.to(torch.float32)
@@ -240,11 +271,16 @@ def _selective_logprob_backward(hidden, weight, targets, bias, log_z, grad_logpr
240271
for start in range(0, vocab_size, vocab_chunk_size):
241272
end = min(start + vocab_chunk_size, vocab_size)
242273
chunk_width = end - start
243-
weight_chunk = weight[start:end]
244-
245-
torch.mm(hidden, weight_chunk.t(), out=mm_buf[:, :chunk_width])
274+
weight_chunk = weight[start:end].float()
246275
logits_chunk = logits_buf[:, :chunk_width]
247-
logits_chunk.copy_(mm_buf[:, :chunk_width])
276+
allow_tf32 = torch.backends.cuda.matmul.allow_tf32
277+
if hidden.is_cuda:
278+
torch.backends.cuda.matmul.allow_tf32 = False
279+
try:
280+
torch.mm(hidden_fp32, weight_chunk.t(), out=logits_chunk)
281+
finally:
282+
if hidden.is_cuda:
283+
torch.backends.cuda.matmul.allow_tf32 = allow_tf32
248284
if has_bias:
249285
logits_chunk.add_(bias[start:end].to(torch.float32))
250286
logits_chunk.mul_(inv_t)
@@ -257,8 +293,15 @@ def _selective_logprob_backward(hidden, weight, targets, bias, log_z, grad_logpr
257293
grad_logits[row_idx, local_idx] += grad_logprobs * in_chunk
258294
grad_logits.mul_(inv_t)
259295

260-
grad_hidden.add_(grad_logits @ weight_chunk.float())
261-
grad_weight[start:end].add_(grad_logits.t() @ hidden.float())
296+
allow_tf32 = torch.backends.cuda.matmul.allow_tf32
297+
if hidden.is_cuda:
298+
torch.backends.cuda.matmul.allow_tf32 = False
299+
try:
300+
grad_hidden.add_(grad_logits @ weight_chunk)
301+
grad_weight[start:end].add_(grad_logits.t() @ hidden_fp32)
302+
finally:
303+
if hidden.is_cuda:
304+
torch.backends.cuda.matmul.allow_tf32 = allow_tf32
262305
if has_bias:
263306
grad_bias[start:end].add_(grad_logits.sum(dim=0))
264307

@@ -385,7 +428,8 @@ def forward(
385428
delta=delta,
386429
use_bias_correction_kl=use_bias_correction_kl,
387430
)
388-
compiled_compute_loss = torch.compile(compute_loss) if compiled else compute_loss
431+
use_compiled_compute_loss = compiled and weight.shape[0] > _SELECTIVE_LOGPROB_EXACT_VOCAB_THRESHOLD
432+
compiled_compute_loss = torch.compile(compute_loss) if use_compiled_compute_loss else compute_loss
389433

390434
def fused_fwd_bwd(
391435
input_chunk,
@@ -463,7 +507,10 @@ def accumulate_chunk(
463507
aggregated_metrics[i].append(metric)
464508

465509
# Process input in chunks based on chunk_size
466-
chunks = max(1, _input.shape[0] // chunk_size)
510+
if weight.shape[0] <= _SELECTIVE_LOGPROB_EXACT_VOCAB_THRESHOLD:
511+
chunks = 1
512+
else:
513+
chunks = max(1, _input.shape[0] // chunk_size)
467514
_input_chunks = torch.chunk(_input, chunks=chunks, dim=0)
468515
_selected_token_ids_chunks = torch.chunk(selected_token_ids, chunks=chunks, dim=0)
469516
_attention_mask_chunks = torch.chunk(attention_mask, chunks=chunks, dim=0)
@@ -598,13 +645,24 @@ def _compute_chunk_loss(
598645

599646
if use_ref_model and ref_per_token_logps_chunk is None:
600647
with torch.no_grad():
601-
ref_per_token_logps_chunk = LigerFusedLinearPPOBase.chunk_forward(
602-
ref_input_chunk,
603-
ref_weight,
604-
selected_token_ids_chunk,
605-
bias=ref_bias,
606-
temperature=temperature,
607-
)
648+
if ref_weight.shape[0] <= _SELECTIVE_LOGPROB_EXACT_VOCAB_THRESHOLD:
649+
ref_logits = ref_input_chunk @ ref_weight.t()
650+
if ref_bias is not None:
651+
ref_logits = ref_logits + ref_bias.float()
652+
ref_logits = ref_logits.float()
653+
if temperature != 1.0:
654+
ref_logits = ref_logits / temperature
655+
ref_per_token_logps_chunk = torch.log_softmax(ref_logits, dim=-1).gather(
656+
-1, selected_token_ids_chunk.unsqueeze(-1)
657+
).squeeze(-1)
658+
else:
659+
ref_per_token_logps_chunk = LigerFusedLinearPPOBase.chunk_forward(
660+
ref_input_chunk,
661+
ref_weight,
662+
selected_token_ids_chunk,
663+
bias=ref_bias,
664+
temperature=temperature,
665+
)
608666

609667
# Compute chunk loss and metrics using the provided loss function
610668
chunk_loss, chunk_metrics = ppo_loss_fn(
@@ -632,16 +690,20 @@ def _compute_chunk_loss(
632690
@staticmethod
def chunk_forward(input_chunk, weight, selected_token_ids, bias=None, temperature=1.0):
    """Compute selected-token log probabilities without materializing full vocab logits.

    Args:
        input_chunk: ``(batch, seq_len, hidden)`` activations.
        weight: ``(vocab, hidden)`` LM-head weight.
        selected_token_ids: ``(batch, seq_len)`` token ids to gather log-probs for.
        bias: optional ``(vocab,)`` LM-head bias.
        temperature: softmax temperature applied to the logits.

    Returns:
        ``(batch, seq_len)`` per-token log-probabilities (float32).
    """
    if weight.shape[0] <= _SELECTIVE_LOGPROB_EXACT_VOCAB_THRESHOLD:
        # Small vocab: a single full-logits log_softmax is cheap and exact.
        # Cast to fp32 *before* adding the bias so this path matches the
        # chunked helper (_selective_logprob_forward_autograd), which adds
        # the bias in fp32 — adding it in the input dtype loses precision.
        logits = (input_chunk @ weight.t()).float()
        if bias is not None:
            logits = logits + bias.to(torch.float32)
        if temperature != 1.0:
            logits = logits / temperature
        return torch.log_softmax(logits, dim=-1).gather(-1, selected_token_ids.unsqueeze(-1)).squeeze(-1)

    # Large vocab: flatten to (batch*seq, hidden) and stream over vocab chunks.
    batch_size, seq_len, hidden_size = input_chunk.shape
    hidden = input_chunk.reshape(batch_size * seq_len, hidden_size).contiguous()
    targets = selected_token_ids.reshape(batch_size * seq_len).contiguous()
    per_token_logps = _selective_logprob_forward_autograd(
        hidden, weight, targets, bias, temperature, _SELECTIVE_LOGPROB_VOCAB_CHUNK_SIZE
    )
    return per_token_logps.reshape(batch_size, seq_len)
647709

0 commit comments

Comments
 (0)