
Commit 5c9eca9

mhc: code quality cleanup across ops, tests, and benchmarks
- Remove no-op mask=True from Sinkhorn backward kernels
- Drop unused rms_eps/pre_eps from ctx.meta in coeffs backward
- Remove redundant .contiguous() calls inside @ensure_contiguous methods
- Simplify grad_x reshape to use x_shape directly
- Simplify device detection in LigerMHC to try/except pattern
- Replace torch.allclose with assert_verbose_allclose in tests
- Standardize seed to set_seed(42) across all tests
- Merge test_mhc_coeffs_allow_fp32 into test_mhc_coeffs_forward_backward
- Add backward coverage to test_mhc_pre_and_post_res_match_reference
- Widen bf16 tolerance for layer.weight.grad and phi.grad in module test
- Move hardcoded B into extra_benchmark_configs (benchmark_mhc.py)
- Rename MiniMHCLM to BenchMiniMHCLM in benchmark_mhc_lm.py
- Split _build_models into single-provider _build_model
1 parent af0e661 commit 5c9eca9
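The test-side items above touch a file whose diff is not shown on this page; they amount to pinning the RNG with the shared `set_seed` helper and swapping bare `torch.allclose` assertions for `assert_verbose_allclose`, which reports the mismatching elements. A minimal sketch of that pattern, assuming the helpers in `test/utils.py` keep their usual `set_seed(seed)` and `assert_verbose_allclose(t1, t2, rtol=..., atol=...)` shapes; the tensors, tolerances, and test name below are illustrative, not taken from the commit:

```python
import torch

from test.utils import assert_verbose_allclose, set_seed

set_seed(42)  # standardized seed per the commit message


def test_mhc_coeffs_forward_backward_sketch():
    # Illustrative tensors only; the real comparisons live in test/transformers/test_mhc.py.
    out_liger = torch.randn(4, 16)
    out_ref = out_liger.clone()
    # Unlike `assert torch.allclose(...)`, this helper prints the offending
    # elements and their indices when the check fails.
    assert_verbose_allclose(out_liger, out_ref, rtol=1e-5, atol=1e-5)
```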

5 files changed (+90, -146 lines)

benchmark/scripts/benchmark_mhc.py

Lines changed: 3 additions & 2 deletions
@@ -20,13 +20,12 @@
 
 sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))
 
-B = 4
-
 
 def bench_speed_mhc(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput:
     from test.transformers.test_mhc import mhc_coeffs_ref
 
     T = input.x
+    B = input.extra_benchmark_config["B"]
     HC = input.extra_benchmark_config["HC"]
     C = input.extra_benchmark_config["C"]
     sub_kernel = input.extra_benchmark_config["sub_kernel"]
@@ -135,6 +134,7 @@ def bench_memory_mhc(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput:
     from test.transformers.test_mhc import mhc_coeffs_ref
 
     T = input.x
+    B = input.extra_benchmark_config["B"]
     HC = input.extra_benchmark_config["HC"]
     C = input.extra_benchmark_config["C"]
     sub_kernel = input.extra_benchmark_config["sub_kernel"]
@@ -224,6 +224,7 @@ def full():
         "kernel_providers": ["liger", "torch"],
         "extra_benchmark_configs": [
             {
+                "B": 4,
                 "HC": 4,
                 "C": 4096,
                 "tmax": 20,
benchmark/scripts/benchmark_mhc_lm.py

Lines changed: 13 additions & 31 deletions
@@ -226,7 +226,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         return x
 
 
-class MiniMHCLM(nn.Module):
+class BenchMiniMHCLM(nn.Module):
     def __init__(
         self,
         mhc_cls: type[nn.Module],
@@ -274,7 +274,8 @@ def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.lm_head(x)
 
 
-def _build_models(
+def _build_model(
+    provider: str,
     *,
     hidden_size: int,
     hc: int,
@@ -285,8 +286,9 @@ def _build_models(
     tmax: int,
     dtype: torch.dtype,
 ):
-    liger_model = MiniMHCLM(
-        LigerMHC,
+    mhc_cls = LigerMHC if provider == "liger" else TorchMHC
+    return BenchMiniMHCLM(
+        mhc_cls,
         vocab_size=vocab_size,
         hidden_size=hidden_size,
         hc=hc,
@@ -297,20 +299,6 @@ def _build_models(
         dtype=dtype,
         device=device,
     )
-    torch_model = MiniMHCLM(
-        TorchMHC,
-        vocab_size=vocab_size,
-        hidden_size=hidden_size,
-        hc=hc,
-        num_layers=num_layers,
-        num_heads=num_heads,
-        intermediate_mult=intermediate_mult,
-        tmax=tmax,
-        dtype=dtype,
-        device=device,
-    )
-    torch_model.load_state_dict(liger_model.state_dict())
-    return liger_model, torch_model
 
 
 def bench_speed_mhc_lm(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput:
@@ -331,7 +319,8 @@ def bench_speed_mhc_lm(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput:
     if hidden_size % num_heads != 0:
         raise ValueError("hidden_size must be divisible by num_heads")
 
-    liger_model, torch_model = _build_models(
+    model = _build_model(
+        provider,
         hidden_size=hidden_size,
         hc=hc,
         num_layers=num_layers,
@@ -345,16 +334,12 @@ def bench_speed_mhc_lm(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput:
     input_ids = torch.randint(0, vocab_size, (bsz, seq_len), device=device)
 
     def fwd():
-        if provider == "liger":
-            return liger_model(input_ids)
-        if provider == "torch":
-            return torch_model(input_ids)
-        raise ValueError(f"Unknown provider: {provider}")
+        return model(input_ids)
 
     def fwd_loss():
         return fwd().float().mean()
 
-    grad_to_none = list(liger_model.parameters()) if provider == "liger" else list(torch_model.parameters())
+    grad_to_none = list(model.parameters())
 
     if mode == "forward":
         ms_50, ms_20, ms_80 = triton.testing.do_bench(fwd, quantiles=QUANTILES, grad_to_none=grad_to_none, rep=100)
@@ -400,7 +385,8 @@ def bench_memory_mhc_lm(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput:
     if hidden_size % num_heads != 0:
         raise ValueError("hidden_size must be divisible by num_heads")
 
-    liger_model, torch_model = _build_models(
+    model = _build_model(
+        provider,
         hidden_size=hidden_size,
         hc=hc,
         num_layers=num_layers,
@@ -414,11 +400,7 @@ def bench_memory_mhc_lm(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput:
     input_ids = torch.randint(0, vocab_size, (bsz, seq_len), device=device)
 
     def fwd():
-        if provider == "liger":
-            return liger_model(input_ids)
-        if provider == "torch":
-            return torch_model(input_ids)
-        raise ValueError(f"Unknown provider: {provider}")
+        return model(input_ids)
 
     def full():
         loss = fwd().float().mean()

src/liger_kernel/ops/mhc.py

Lines changed: 11 additions & 17 deletions
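Context for the `mask=True` removals in the hunks below: in Triton, the `mask` argument to `tl.load` exists to guard lanes that would read out of bounds, and a constant all-True mask (paired with `other=0.0`) guards nothing, so dropping it changes no results. A standalone sketch of when a mask is and is not needed (hypothetical kernels, not from this file):

```python
import triton
import triton.language as tl


@triton.jit
def scale_tail_kernel(x_ptr, out_ptr, n, BLOCK: tl.constexpr):
    pid = tl.program_id(0)
    offs = pid * BLOCK + tl.arange(0, BLOCK)
    # The last block may run past n, so a real mask is required here.
    x = tl.load(x_ptr + offs, mask=offs < n, other=0.0)
    tl.store(out_ptr + offs, x * 2.0, mask=offs < n)


@triton.jit
def scale_full_block_kernel(x_ptr, out_ptr, BLOCK: tl.constexpr):
    pid = tl.program_id(0)
    offs = pid * BLOCK + tl.arange(0, BLOCK)
    # Every lane is known to be in bounds, so no mask is needed; writing
    # `mask=True, other=0.0` would be a no-op, as in the lines removed below.
    x = tl.load(x_ptr + offs)
    tl.store(out_ptr + offs, x * 2.0)
```

The `.contiguous()` removals in the same file rest on a similar observation: per the commit message, the surrounding autograd methods are already wrapped by `@ensure_contiguous`, so their tensor arguments arrive contiguous and `.view(...)` is safe without a second copy.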
@@ -546,8 +546,6 @@ def _mhc_sinkhorn_bwd_kernel(
     # Start backward from grad_out
     g = tl.load(
         grad_out_ptr + pid * stride_go_n + rows * stride_go_i + cols * stride_go_j,
-        mask=True,
-        other=0.0,
     ).to(tl.float32)
 
     # Reverse iterations (TMAX-1 .. 1), recomputing mat_t, rs_t, cs_t
@@ -641,8 +639,6 @@ def _mhc_sinkhorn_bwd_hist_kernel(
     # Start backward from grad_out
     g = tl.load(
         grad_out_ptr + pid * stride_go_n + rows * stride_go_i + cols * stride_go_j,
-        mask=True,
-        other=0.0,
     ).to(tl.float32)
 
     # Reverse iterations (TMAX-1 .. 1) using stored mats
@@ -1471,8 +1467,6 @@ def forward( # type: ignore[override]
             HC,
             C,
             int(tmax),
-            float(rms_eps),
-            float(pre_eps),
             float(sinkhorn_eps),
             float(post_mult),
             hist is not None,
@@ -1495,7 +1489,7 @@ def backward(
         grad_h_res: torch.Tensor | None,
     ):
         saved = ctx.saved_tensors
-        x_shape, HC, C, tmax, rms_eps, pre_eps, sinkhorn_eps, post_mult, has_hist = ctx.meta
+        x_shape, HC, C, tmax, sinkhorn_eps, post_mult, has_hist = ctx.meta
         if has_hist:
             x_mat, phi, b, mix, invr, alpha_pre, alpha_post, alpha_res, hist = saved
         else:
@@ -1511,15 +1505,15 @@ def backward(
 
         # flatten grads (None -> zeros)
         if need_pre:
-            gh_pre = grad_h_pre.contiguous().view(-1, HC).to(torch.float32)
+            gh_pre = grad_h_pre.view(-1, HC).to(torch.float32)
         else:
             gh_pre = torch.zeros((N, HC), device=mix.device, dtype=torch.float32)
         if need_post:
-            gh_post = grad_h_post.contiguous().view(-1, HC).to(torch.float32)
+            gh_post = grad_h_post.view(-1, HC).to(torch.float32)
         else:
             gh_post = torch.zeros((N, HC), device=mix.device, dtype=torch.float32)
         if need_res:
-            gh_res = grad_h_res.contiguous().view(-1, HC, HC).to(torch.float32)
+            gh_res = grad_h_res.view(-1, HC, HC).to(torch.float32)
         else:
             gh_res = torch.zeros((N, HC, HC), device=mix.device, dtype=torch.float32)
 
@@ -1599,7 +1593,7 @@ def backward(
         )
 
         # Reshape to original shape
-        grad_x = grad_x_mat.view(*x_shape[:-2], HC, C)
+        grad_x = grad_x_mat.view(x_shape)
 
         # Return grads for each forward input
         return (
@@ -1624,7 +1618,7 @@ class LigerMHCPreFunction(torch.autograd.Function):
     def forward(ctx: Any, x: torch.Tensor, h_pre: torch.Tensor) -> torch.Tensor:
         x_shape = x.shape
         x_flat, _ = _flatten_tokens(x)
-        h_pre_flat = h_pre.contiguous().view(-1, x_flat.shape[1]).to(torch.float32)
+        h_pre_flat = h_pre.view(-1, x_flat.shape[1]).to(torch.float32)
         out = mhc_pre_fwd(x_flat, h_pre_flat)  # [N,C] fp32
         ctx.save_for_backward(x_flat, h_pre_flat)
         ctx.x_shape = x_shape
@@ -1637,7 +1631,7 @@ def backward(ctx: Any, grad_out: torch.Tensor):
         x_flat, h_pre_flat = ctx.saved_tensors
         x_shape = ctx.x_shape
         N, HC, C = x_flat.shape
-        go = grad_out.contiguous().view(-1, C).to(torch.float32)
+        go = grad_out.view(-1, C).to(torch.float32)
         grad_x, grad_h = mhc_pre_bwd(x_flat, h_pre_flat, go)
         grad_x = grad_x.to(x_flat.dtype)
         return grad_x.view(*x_shape), grad_h.view(*x_shape[:-1])
@@ -1652,9 +1646,9 @@ def forward(
         x_shape = x.shape
         x_flat, _ = _flatten_tokens(x)
         N, HC, C = x_flat.shape
-        f_flat = f_out.contiguous().view(-1, C)
-        h_post_flat = h_post.contiguous().view(-1, HC).to(torch.float32)
-        h_res_flat = h_res.contiguous().view(-1, HC, HC).to(torch.float32)
+        f_flat = f_out.view(-1, C)
+        h_post_flat = h_post.view(-1, HC).to(torch.float32)
+        h_res_flat = h_res.view(-1, HC, HC).to(torch.float32)
         out = mhc_post_res_fwd(x_flat, f_flat, h_post_flat, h_res_flat)  # [N,HC,C] fp32
         ctx.save_for_backward(x_flat, f_flat, h_post_flat, h_res_flat)
         ctx.x_shape = x_shape
@@ -1667,7 +1661,7 @@ def backward(ctx: Any, grad_out: torch.Tensor):
         x_flat, f_flat, h_post_flat, h_res_flat = ctx.saved_tensors
         x_shape = ctx.x_shape
         N, HC, C = x_flat.shape
-        go = grad_out.contiguous().view(-1, HC, C).to(torch.float32)
+        go = grad_out.view(-1, HC, C).to(torch.float32)
 
         grad_x, grad_f, grad_hpost, grad_hres = mhc_post_res_bwd(x_flat, f_flat, h_post_flat, h_res_flat, go)
 
src/liger_kernel/transformers/mhc.py

Lines changed: 3 additions & 9 deletions
@@ -113,15 +113,9 @@ def __init__(
         m = hc * hc + 2 * hc
         k = hc * c
 
-        layer_device = None
-        for param in self.layer.parameters(recurse=True):
-            layer_device = param.device
-            break
-        if layer_device is None:
-            for buf in self.layer.buffers(recurse=True):
-                layer_device = buf.device
-                break
-        if layer_device is None:
+        try:
+            layer_device = next(self.layer.parameters()).device
+        except StopIteration:
             layer_device = torch.device("cpu")
 
         # Note: for best speed, keep phi in BF16/FP16 to enable tensor-core matmul in Triton.
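On the device-detection simplification just above: `next(module.parameters())` raises `StopIteration` when the wrapped layer has no parameters, which is exactly what the new `except` branch catches; note that, unlike the removed loop, the fallback no longer consults buffers. A self-contained illustration of the same pattern (the helper name is hypothetical):

```python
import torch
import torch.nn as nn


def infer_device(layer: nn.Module) -> torch.device:
    # First parameter wins; parameter-less modules fall back to CPU,
    # matching the try/except added in LigerMHC.__init__.
    try:
        return next(layer.parameters()).device
    except StopIteration:
        return torch.device("cpu")


print(infer_device(nn.Linear(4, 4)))  # device of the Linear's weight
print(infer_device(nn.ReLU()))        # cpu fallback: ReLU has no parameters
```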
