
Commit 1ea9175

Authored by Tcc0403
fix(benchmark): move chunked loss module init out of measurements (#643)
## Summary

Fix #789

## Testing Done

- Hardware Type: <BLANK>
- [ ] run `make test` to ensure correctness
- [ ] run `make checkstyle` to ensure code style
- [ ] run `make test-convergence` to ensure convergence

Signed-off-by: Tcc0403 <76503978+Tcc0403@users.noreply.github.com>
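For context, the sketch below illustrates the pattern this commit applies in all four benchmark scripts: construct the loss module once, outside the measured closure, so the timed region contains only the forward pass. `LMHeadLoss` is a hypothetical stand-in for the scripts' `TorchLMHead*`/`LigerLMHead*` modules, not a class from this repo.

```python
import torch
import torch.nn as nn


class LMHeadLoss(nn.Module):
    """Hypothetical stand-in for the benchmarks' TorchLMHead*/LigerLMHead* modules."""

    def __init__(self, H, V, dtype):
        super().__init__()
        self.lin = nn.Linear(H, V, bias=False, dtype=dtype)
        self.ce = nn.CrossEntropyLoss()

    def forward(self, x, target):
        logits = self.lin(x)
        # First tuple element is the loss, mirroring the benchmarked modules.
        return (self.ce(logits.view(-1, logits.size(-1)), target.view(-1)),)


H, V, dtype, device = 64, 128, torch.float32, "cpu"

# Before the fix: the module is rebuilt inside the lambda, so every call pays
# for weight initialization and the .to(device) move, and those costs land
# inside the measured region, skewing both speed and memory numbers.
fwd_before = lambda x, t: LMHeadLoss(H, V, dtype).to(device)(x, t)[0]

# After the fix: build once, then the closure runs only the forward pass.
lm_head = LMHeadLoss(H, V, dtype).to(device)
fwd_after = lambda x, t: lm_head(x, t)[0]

x = torch.randn(2, 8, H, dtype=dtype)
t = torch.randint(V, (2, 8), dtype=torch.long)
assert torch.is_tensor(fwd_after(x, t))
```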
1 parent b27b4c5 commit 1ea9175

File tree

4 files changed
+56 −40 lines changed

benchmark/scripts/benchmark_cpo_loss.py

Lines changed: 14 additions & 8 deletions
@@ -36,17 +36,20 @@ def bench_memory_fused_linear_cpo_loss(
     dtype = input.extra_benchmark_config["dtype"]
     provider = input.kernel_provider
 
-    torch_lm_head_cpo = lambda x, target: TorchLMHeadCPO(H=H, V=V, dtype=dtype).to(device)(x, target)[0]
-    liger_lm_head_cpo = lambda x, target: LigerLMHeadCPO(H=H, V=V, dtype=dtype).to(device)(x, target)[0]
+    # Instantiate once and retrieve the first output only
+    torch_lm_head_cpo = TorchLMHeadCPO(H=H, V=V, dtype=dtype).to(device)
+    liger_lm_head_cpo = LigerLMHeadCPO(H=H, V=V, dtype=dtype).to(device)
+    torch_fwd = lambda x, target: torch_lm_head_cpo(x, target)[0]
+    liger_fwd = lambda x, target: liger_lm_head_cpo(x, target)[0]
 
     _input = torch.randn(B, T, H, requires_grad=True, dtype=dtype, device=device)
     target = torch.randint(V, (B, T), dtype=torch.long, device=device)
 
     def fwd():
         if provider == "liger":
-            return liger_lm_head_cpo(_input, target)
+            return liger_fwd(_input, target)
         elif provider == "huggingface":
-            return torch_lm_head_cpo(_input, target)
+            return torch_fwd(_input, target)
 
     def full():
         y = fwd()
@@ -79,17 +82,20 @@ def bench_speed_fused_linear_cpo_loss(
     provider = input.kernel_provider
     mode = input.kernel_operation_mode
 
-    torch_lm_head_cpo = lambda x, target: TorchLMHeadCPO(H=H, V=V, dtype=dtype).to(device)(x, target)[0]
-    liger_lm_head_cpo = lambda x, target: LigerLMHeadCPO(H=H, V=V, dtype=dtype).to(device)(x, target)[0]
+    # Instantiate once and retrieve the first output only
+    torch_lm_head_cpo = TorchLMHeadCPO(H=H, V=V, dtype=dtype).to(device)
+    liger_lm_head_cpo = LigerLMHeadCPO(H=H, V=V, dtype=dtype).to(device)
+    torch_fwd = lambda x, target: torch_lm_head_cpo(x, target)[0]
+    liger_fwd = lambda x, target: liger_lm_head_cpo(x, target)[0]
 
     _input = torch.randn(B, T, H, requires_grad=True, dtype=dtype, device=device)
     target = torch.randint(V, (B, T), dtype=torch.long, device=device)
 
     def fwd():
         if provider == "liger":
-            return liger_lm_head_cpo(_input, target)
+            return liger_fwd(_input, target)
         elif provider == "huggingface":
-            return torch_lm_head_cpo(_input, target)
+            return torch_fwd(_input, target)
 
     if mode == "forward":
         ms_50, ms_20, ms_80 = triton.testing.do_bench(

benchmark/scripts/benchmark_dpo_loss.py

Lines changed: 14 additions & 16 deletions
@@ -32,12 +32,11 @@ def bench_memory_dpo_loss(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput:
     ignore_index = input.extra_benchmark_config["ignore_index"]
     provider = input.kernel_provider
 
-    torch_dpo_loss = lambda x, ref_x, target: TorchLMHeadDPO(
-        H=H, V=V, dtype=dtype, beta=beta, ignore_index=ignore_index, bias=bias
-    ).to(device)(x, ref_x, target)[0]
-    liger_dpo_loss = lambda x, ref_x, target: LigerLMHeadDPO(
-        H=H, V=V, dtype=dtype, beta=beta, ignore_index=ignore_index, bias=bias
-    ).to(device)(x, ref_x, target)[0]
+    # Instantiate once and retrieve the first output only
+    torch_dpo_loss = TorchLMHeadDPO(H=H, V=V, dtype=dtype, beta=beta, ignore_index=ignore_index, bias=bias).to(device)
+    liger_dpo_loss = LigerLMHeadDPO(H=H, V=V, dtype=dtype, beta=beta, ignore_index=ignore_index, bias=bias).to(device)
+    torch_fwd = lambda x, ref_x, target: torch_dpo_loss(x, ref_x, target)[0]
+    liger_fwd = lambda x, ref_x, target: liger_dpo_loss(x, ref_x, target)[0]
 
     # Input shape: [B, T, H]
     _input = torch.randn(B, T, H, device=device, dtype=dtype)
@@ -52,9 +51,9 @@ def bench_memory_dpo_loss(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput:
 
     def fwd():
         if provider == "liger":
-            return liger_dpo_loss(_input, ref_input, target)
+            return liger_fwd(_input, ref_input, target)
         elif provider == "huggingface":
-            return torch_dpo_loss(_input, ref_input, target)
+            return torch_fwd(_input, ref_input, target)
 
     def full():
         y = fwd()
@@ -83,12 +82,11 @@ def bench_speed_dpo_loss(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput:
     provider = input.kernel_provider
     mode = input.kernel_operation_mode
 
-    torch_dpo_loss = lambda x, ref_x, target: TorchLMHeadDPO(
-        H=H, V=V, dtype=dtype, beta=beta, ignore_index=ignore_index, bias=bias
-    ).to(device)(x, ref_x, target)[0]
-    liger_dpo_loss = lambda x, ref_x, target: LigerLMHeadDPO(
-        H=H, V=V, dtype=dtype, beta=beta, ignore_index=ignore_index, bias=bias
-    ).to(device)(x, ref_x, target)[0]
+    # Instantiate once and retrieve the first output only
+    torch_dpo_loss = TorchLMHeadDPO(H=H, V=V, dtype=dtype, beta=beta, ignore_index=ignore_index, bias=bias).to(device)
+    liger_dpo_loss = LigerLMHeadDPO(H=H, V=V, dtype=dtype, beta=beta, ignore_index=ignore_index, bias=bias).to(device)
+    torch_fwd = lambda x, ref_x, target: torch_dpo_loss(x, ref_x, target)[0]
+    liger_fwd = lambda x, ref_x, target: liger_dpo_loss(x, ref_x, target)[0]
 
     # Input shape: [B, T, H]
     _input = torch.randn(B, T, H, device=device, dtype=dtype)
@@ -103,9 +101,9 @@ def bench_speed_dpo_loss(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput:
 
     def fwd():
         if provider == "liger":
-            return liger_dpo_loss(_input, ref_input, target)
+            return liger_fwd(_input, ref_input, target)
         elif provider == "huggingface":
-            return torch_dpo_loss(_input, ref_input, target)
+            return torch_fwd(_input, ref_input, target)
 
     if mode == "forward":
         ms_50, ms_20, ms_80 = triton.testing.do_bench(

benchmark/scripts/benchmark_orpo_loss.py

Lines changed: 14 additions & 8 deletions
@@ -36,18 +36,21 @@ def bench_memory_fused_linear_orpo_loss(
     dtype = input.extra_benchmark_config["dtype"]
     provider = input.kernel_provider
 
-    torch_lm_head_orpo = lambda x, target: TorchLMHeadORPO(H=H, V=V, dtype=dtype).to(device)(x, target)[0]
-    liger_lm_head_orpo = lambda x, target: LigerLMHeadORPO(H=H, V=V, dtype=dtype).to(device)(x, target)[0]
+    # Instantiate once and retrieve the first output only
+    torch_lm_head_orpo = TorchLMHeadORPO(H=H, V=V, dtype=dtype).to(device)
+    liger_lm_head_orpo = LigerLMHeadORPO(H=H, V=V, dtype=dtype).to(device)
+    torch_fwd = lambda x, target, nll_target: torch_lm_head_orpo(x, target, nll_target)[0]
+    liger_fwd = lambda x, target, nll_target: liger_lm_head_orpo(x, target, nll_target)[0]
 
     _input = torch.randn(B, T, H, requires_grad=True, dtype=dtype, device=device)
     target = torch.randint(V, (B, T), dtype=torch.long, device=device)
     nll_target = torch.randint(V, (B, T), dtype=torch.long, device=device)
 
     def fwd():
         if provider == "liger":
-            return liger_lm_head_orpo(_input, target, nll_target)
+            return liger_fwd(_input, target, nll_target)
         elif provider == "huggingface":
-            return torch_lm_head_orpo(_input, target, nll_target)
+            return torch_fwd(_input, target, nll_target)
 
     def full():
         y = fwd()
@@ -80,18 +83,21 @@ def bench_speed_fused_linear_orpo_loss(
     provider = input.kernel_provider
     mode = input.kernel_operation_mode
 
-    torch_lm_head_orpo = lambda x, target: TorchLMHeadORPO(H=H, V=V, dtype=dtype).to(device)(x, target)[0]
-    liger_lm_head_orpo = lambda x, target: LigerLMHeadORPO(H=H, V=V, dtype=dtype).to(device)(x, target)[0]
+    # Instantiate once and retrieve the first output only
+    torch_lm_head_orpo = TorchLMHeadORPO(H=H, V=V, dtype=dtype).to(device)
+    liger_lm_head_orpo = LigerLMHeadORPO(H=H, V=V, dtype=dtype).to(device)
+    torch_fwd = lambda x, target, nll_target: torch_lm_head_orpo(x, target, nll_target)[0]
+    liger_fwd = lambda x, target, nll_target: liger_lm_head_orpo(x, target, nll_target)[0]
 
     _input = torch.randn(B, T, H, requires_grad=True, dtype=dtype, device=device)
     target = torch.randint(V, (B, T), dtype=torch.long, device=device)
     nll_target = torch.randint(V, (B, T), dtype=torch.long, device=device)
 
     def fwd():
         if provider == "liger":
-            return liger_lm_head_orpo(_input, target, nll_target)
+            return liger_fwd(_input, target, nll_target)
         elif provider == "huggingface":
-            return torch_lm_head_orpo(_input, target, nll_target)
+            return torch_fwd(_input, target, nll_target)
 
     if mode == "forward":
         ms_50, ms_20, ms_80 = triton.testing.do_bench(

benchmark/scripts/benchmark_simpo_loss.py

Lines changed: 14 additions & 8 deletions
@@ -36,17 +36,20 @@ def bench_memory_fused_linear_simpo_loss(
     dtype = input.extra_benchmark_config["dtype"]
     provider = input.kernel_provider
 
-    torch_lm_head_simpo = lambda x, target: TorchLMHeadCPO(H=H, V=V, dtype=dtype).to(device)(x, target)[0]
-    liger_lm_head_simpo = lambda x, target: LigerLMHeadSimPO(H=H, V=V, dtype=dtype).to(device)(x, target)[0]
+    # Instantiate once and retrieve the first output only
+    torch_lm_head_simpo = TorchLMHeadCPO(H=H, V=V, dtype=dtype).to(device)
+    liger_lm_head_simpo = LigerLMHeadSimPO(H=H, V=V, dtype=dtype).to(device)
+    torch_fwd = lambda x, target: torch_lm_head_simpo(x, target)[0]
+    liger_fwd = lambda x, target: liger_lm_head_simpo(x, target)[0]
 
     _input = torch.randn(B, T, H, requires_grad=True, dtype=dtype, device=device)
     target = torch.randint(V, (B, T), dtype=torch.long, device=device)
 
     def fwd():
         if provider == "liger":
-            return liger_lm_head_simpo(_input, target)
+            return liger_fwd(_input, target)
         elif provider == "huggingface":
-            return torch_lm_head_simpo(_input, target)
+            return torch_fwd(_input, target)
 
     def full():
         y = fwd()
@@ -79,17 +82,20 @@ def bench_speed_fused_linear_simpo_loss(
     provider = input.kernel_provider
     mode = input.kernel_operation_mode
 
-    torch_lm_head_simpo = lambda x, target: TorchLMHeadCPO(H=H, V=V, dtype=dtype).to(device)(x, target)[0]
-    liger_lm_head_simpo = lambda x, target: LigerLMHeadSimPO(H=H, V=V, dtype=dtype).to(device)(x, target)[0]
+    # Instantiate once and retrieve the first output only
+    torch_lm_head_simpo = TorchLMHeadCPO(H=H, V=V, dtype=dtype).to(device)
+    liger_lm_head_simpo = LigerLMHeadSimPO(H=H, V=V, dtype=dtype).to(device)
+    torch_fwd = lambda x, target: torch_lm_head_simpo(x, target)[0]
+    liger_fwd = lambda x, target: liger_lm_head_simpo(x, target)[0]
 
     _input = torch.randn(B, T, H, requires_grad=True, dtype=dtype, device=device)
     target = torch.randint(V, (B, T), dtype=torch.long, device=device)
 
     def fwd():
         if provider == "liger":
-            return liger_lm_head_simpo(_input, target)
+            return liger_fwd(_input, target)
         elif provider == "huggingface":
-            return torch_lm_head_simpo(_input, target)
+            return torch_fwd(_input, target)
 
     if mode == "forward":
         ms_50, ms_20, ms_80 = triton.testing.do_bench(
