@@ -3,8 +3,8 @@
 import torch.nn.functional as F
 
 from liger_kernel.chunked_loss import LigerFusedLinearGRPOLoss
-from liger_kernel.chunked_loss.grpo_loss import LigerFusedLinearGRPOFunction
 from liger_kernel.chunked_loss.functional import liger_fused_linear_grpo
+from liger_kernel.chunked_loss.grpo_loss import LigerFusedLinearGRPOFunction
 from liger_kernel.utils import infer_device
 from test.utils import assert_verbose_allclose
 from test.utils import set_seed
@@ -16,6 +16,7 @@
 # reset torch compiler cache
 torch.compiler.reset()
 
+
 class TorchLMHeadGRPO(torch.nn.Module):
     def __init__(
         self,
@@ -38,7 +39,7 @@ def __init__(
         self.epsilon_high = epsilon_high
         self.temperature = temperature
         self.use_ref_model = use_ref_model
-
+
     def forward(
         self,
         x,  # Shape: [batch_size, seq_len, hidden_size]
@@ -48,7 +49,7 @@ def forward(
         ref_input=None,  # Shape: [batch_size, seq_len, hidden_size]
         old_per_token_logps=None,
     ):
-        logits = (x @ self.lin.weight.t())
+        logits = x @ self.lin.weight.t()
         if self.lin.bias is not None:
             logits = logits + self.lin.bias
         if self.temperature != 1.0:
@@ -81,9 +82,7 @@ def forward(
         per_token_loss = -torch.min(per_token_loss1, per_token_loss2)
         if self.beta != 0.0:
             # Compute KL divergence between model and reference model
-            kl_div = (
-                torch.exp(ref_per_token_logps - per_token_logps) - (ref_per_token_logps - per_token_logps) - 1.0
-            )
+            kl_div = torch.exp(ref_per_token_logps - per_token_logps) - (ref_per_token_logps - per_token_logps) - 1.0
             per_token_loss = per_token_loss + self.beta * kl_div
 
         # Apply masking and normalize
@@ -171,9 +170,9 @@ def forward(
171170 "beta, epsilon_low, epsilon_high, temperature" ,
172171 [
173172 # Standard settings
174- (0.1 , 0.2 , 0.2 , 20.0 ), # set temperature to 20.0 for better numerical stability
173+ (0.1 , 0.2 , 0.2 , 20.0 ), # set temperature to 20.0 for better numerical stability
175174 (0.0 , 0.1 , 0.1 , 2.0 ),
176- ]
175+ ],
177176)
178177@pytest .mark .parametrize ("use_ref_model" , [True , False ])
179178@pytest .mark .parametrize ("old_per_token_logps" , [True , False ])
@@ -231,7 +230,9 @@ def test_correctness(
             V, H, device=device, dtype=dtype
         )
         if ref_bias:
-            torch_lm_head_grpo.ref_lin.bias.data = liger_lm_head_grpo.ref_lin.bias.data = torch.randn(V, device=device, dtype=dtype)
+            torch_lm_head_grpo.ref_lin.bias.data = liger_lm_head_grpo.ref_lin.bias.data = torch.randn(
+                V, device=device, dtype=dtype
+            )
 
     # Create inputs with shape [B, T, H]
     _input = torch.randn(B, T, H, device=device, dtype=dtype) * scalar
@@ -260,15 +261,25 @@ def test_correctness(
 
     # Forward pass with reference model
     loss1, aux1 = torch_lm_head_grpo(
-        input1, selected_token_ids, attention_mask, advantages, ref_input=ref_input, old_per_token_logps=old_per_token_logps
+        input1,
+        selected_token_ids,
+        attention_mask,
+        advantages,
+        ref_input=ref_input,
+        old_per_token_logps=old_per_token_logps,
     )
     loss2, aux2 = liger_lm_head_grpo(
-        input2, selected_token_ids, attention_mask, advantages, ref_input=ref_input, old_per_token_logps=old_per_token_logps
+        input2,
+        selected_token_ids,
+        attention_mask,
+        advantages,
+        ref_input=ref_input,
+        old_per_token_logps=old_per_token_logps,
     )
 
     # Check losses match
-    assert loss1 != float('nan')
-    assert loss2 != float('nan')
+    assert loss1 != float("nan")
+    assert loss2 != float("nan")
     assert_verbose_allclose(loss1, loss2, atol=atol, rtol=rtol)
 
     # Check metrics match
@@ -296,6 +307,7 @@ def test_correctness(
         rtol=rtol,
     )
 
+
 @pytest.mark.parametrize(
     "B, T, H, V",
     [
@@ -316,14 +328,29 @@ def test_correctness(
316328 "beta, epsilon_low, epsilon_high, temperature" ,
317329 [
318330 # Standard settings
319- (0.1 , 0.2 , 0.2 , 20.0 ), # set temperature to 20.0 for better numerical stability
331+ (0.1 , 0.2 , 0.2 , 20.0 ), # set temperature to 20.0 for better numerical stability
320332 (0.0 , 0.1 , 0.1 , 2.0 ),
321- ]
333+ ],
322334)
323335@pytest .mark .parametrize ("use_ref_model" , [True , False ])
324336@pytest .mark .parametrize ("old_per_token_logps" , [True , False ])
325337def test_functional_correctness (
326- B , T , H , V , scalar , dtype , atol , rtol , bias , ref_bias , beta , epsilon_low , epsilon_high , temperature , use_ref_model , old_per_token_logps
338+ B ,
339+ T ,
340+ H ,
341+ V ,
342+ scalar ,
343+ dtype ,
344+ atol ,
345+ rtol ,
346+ bias ,
347+ ref_bias ,
348+ beta ,
349+ epsilon_low ,
350+ epsilon_high ,
351+ temperature ,
352+ use_ref_model ,
353+ old_per_token_logps ,
327354):
328355 _input = torch .randn (B , T , H , device = device , dtype = dtype ) * scalar
329356 input1 = _input .detach ().clone ().requires_grad_ (True )
@@ -334,7 +361,7 @@ def test_functional_correctness(
     weight2 = _weight.detach().clone().requires_grad_(True)
 
     selected_token_ids = torch.randint(0, V, (B, T), device=device)
-
+
     attention_mask = torch.ones(B, T, device=device)
 
     advantages = torch.rand(B, device=device, dtype=dtype)
@@ -348,7 +375,7 @@ def test_functional_correctness(
         bias2 = None
 
     ref_input = torch.randn(B, T, H, device=device, dtype=dtype) * scalar
-
+
     _ref_weight = torch.randn(V, H, device=device, dtype=dtype) * scalar
     ref_weight1 = _ref_weight.detach().clone().requires_grad_(True)
     ref_weight2 = _ref_weight.detach().clone().requires_grad_(True)
@@ -367,15 +394,47 @@ def test_functional_correctness(
         old_per_token_logps = None
 
     loss1, aux1 = liger_fused_linear_grpo(
-        input1, weight1, selected_token_ids, attention_mask, advantages, bias1, ref_input, ref_weight1, ref_bias1, old_per_token_logps, beta, epsilon_low, epsilon_high, temperature, True, use_ref_model, 1
+        input1,
+        weight1,
+        selected_token_ids,
+        attention_mask,
+        advantages,
+        bias1,
+        ref_input,
+        ref_weight1,
+        ref_bias1,
+        old_per_token_logps,
+        beta,
+        epsilon_low,
+        epsilon_high,
+        temperature,
+        True,
+        use_ref_model,
+        1,
     )
 
     loss2, aux2 = LigerFusedLinearGRPOFunction.apply(
-        input2, weight2, selected_token_ids, attention_mask, advantages, bias2, ref_input, ref_weight2, ref_bias2, old_per_token_logps, beta, epsilon_low, epsilon_high, temperature, True, use_ref_model, 1
+        input2,
+        weight2,
+        selected_token_ids,
+        attention_mask,
+        advantages,
+        bias2,
+        ref_input,
+        ref_weight2,
+        ref_bias2,
+        old_per_token_logps,
+        beta,
+        epsilon_low,
+        epsilon_high,
+        temperature,
+        True,
+        use_ref_model,
+        1,
     )
 
-    assert loss1 != float('nan')
-    assert loss2 != float('nan')
+    assert loss1 != float("nan")
+    assert loss2 != float("nan")
     assert_verbose_allclose(loss1, loss2, atol=atol, rtol=rtol)
 
     # Check metrics match
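
For orientation, a minimal sketch of the functional call exercised by test_functional_correctness above. The positional argument order is copied from the test; the two unnamed positional values (True and 1) are passed exactly as the test passes them, infer_device() is the same helper the test imports, and the tensor sizes are illustrative assumptions rather than the test's parametrization. This is a usage sketch, not documented API.

import torch

from liger_kernel.chunked_loss.functional import liger_fused_linear_grpo
from liger_kernel.utils import infer_device

device = infer_device()
B, T, H, V = 2, 8, 16, 32  # illustrative sizes (assumption), not the test's parametrization
dtype = torch.float32

x = torch.randn(B, T, H, device=device, dtype=dtype, requires_grad=True)
weight = torch.randn(V, H, device=device, dtype=dtype, requires_grad=True)
selected_token_ids = torch.randint(0, V, (B, T), device=device)
attention_mask = torch.ones(B, T, device=device)
advantages = torch.rand(B, device=device, dtype=dtype)
ref_input = torch.randn(B, T, H, device=device, dtype=dtype)
ref_weight = torch.randn(V, H, device=device, dtype=dtype)

# Same positional order as the call in test_functional_correctness.
loss, aux = liger_fused_linear_grpo(
    x,  # hidden states [B, T, H]
    weight,  # lm_head weight [V, H]
    selected_token_ids,  # [B, T]
    attention_mask,  # [B, T]
    advantages,  # [B]
    None,  # bias
    ref_input,  # reference-model hidden states
    ref_weight,  # reference-model lm_head weight
    None,  # ref_bias
    None,  # old_per_token_logps
    0.1,  # beta
    0.2,  # epsilon_low
    0.2,  # epsilon_high
    20.0,  # temperature
    True,  # passed positionally here, as in the test
    True,  # use_ref_model
    1,  # passed positionally here, as in the test
)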