Merge branch 'main' into update-grpo-loss-type

vaibhavjindal · web-flow · commit 68c9617941a2 · 2026-03-05T16:05:50.000-08:00
diff --git a/README.md b/README.md
@@ -293,6 +293,7 @@ loss.backward()
 | Multi Token Attention           | `liger_kernel.transformers.LigerMultiTokenAttention`        |
 | Softmax                         | `liger_kernel.transformers.LigerSoftmax`                    |
 | Sparsemax                       | `liger_kernel.transformers.LigerSparsemax`                  |
+| mHC (Hyper-Connections)         | `liger_kernel.transformers.LigerMHC`                        |
 
 
 ### Alignment Kernels
diff --git a/docs/Low-Level-APIs.md b/docs/Low-Level-APIs.md
@@ -12,6 +12,7 @@
 | Multi Token Attention           | `liger_kernel.transformers.LigerMultiTokenAttention`        |
 | Softmax                         | `liger_kernel.transformers.LigerSoftmax`                    |
 | Sparsemax                       | `liger_kernel.transformers.LigerSparsemax`                  |
+| mHC (Hyper-Connections)         | `liger_kernel.transformers.LigerMHC`                        |
 
 
 ### RMS Norm
@@ -72,6 +73,41 @@ Sparsemax is a sparse alternative to softmax that produces sparse probability di
 
 The implementation achieves significant speed improvements and memory savings compared to standard PyTorch implementations, particularly for large input tensors.
 
+### mHC (Manifold-Constrained Hyper-Connections)
+
+mHC implements fused Triton kernels for Manifold-Constrained Hyper-Connections ([arXiv:2512.24880](https://arxiv.org/abs/2512.24880)). It wraps an arbitrary layer `F: [..., C] -> [..., C]` with multiple residual streams, constraining the residual routing matrix `H_res` onto the Birkhoff polytope (doubly-stochastic matrices) via Sinkhorn-Knopp iterations to stabilize training.
+
+The `LigerMHC` module takes input of shape `[..., HC, C]` where `HC` is the number of residual streams, and performs:
+
+1. **Coefficients** -- Compute data-dependent routing coefficients (`h_pre`, `h_post`, `h_res`) via fused matmul + RMS normalization + Sinkhorn-Knopp iterations.
+2. **Pre-aggregate** -- `x_in = sum_i h_pre[i] * x[i]`  (shape: `[..., C]`)
+3. **Layer** -- `f_out = layer(x_in)`  (shape: `[..., C]`)
+4. **Post + residual** -- `x_out[o] = sum_i h_res[o,i] * x[i] + h_post[o] * f_out`  (shape: `[..., HC, C]`)
+
+Usage:
+
+```python
+import torch
+import torch.nn as nn
+from liger_kernel.transformers import LigerMHC
+
+# Wrap a linear layer with 4 residual streams of dimension 256
+layer = nn.Linear(256, 256, bias=False, device="cuda", dtype=torch.bfloat16)
+mhc = LigerMHC(layer, hc=4, c=256, phi_dtype=torch.bfloat16).cuda()
+
+# Input: [batch, seq_len, num_streams, channels] in BF16/FP16
+x = torch.randn(2, 128, 4, 256, device="cuda", dtype=torch.bfloat16)
+out = mhc(x)  # shape: [2, 128, 4, 256]
+```
+
+Functional APIs are also available:
+
+- `liger_kernel.transformers.functional.liger_mhc_coeffs` -- Compute routing coefficients
+- `liger_kernel.transformers.functional.liger_mhc_pre` -- Pre-aggregation
+- `liger_kernel.transformers.functional.liger_mhc_post_res` -- Post-aggregation + residual
+- `liger_kernel.transformers.functional.liger_mhc_apply` -- Combined pre + post_res
+- `liger_kernel.transformers.functional.liger_mhc_forward` -- Full forward pass (coeffs + pre + layer + post_res)
+
 ## Alignment Kernels
 
 | **Kernel**                      | **API**                                                     |
diff --git a/src/liger_kernel/ops/swiglu.py b/src/liger_kernel/ops/swiglu.py
@@ -104,13 +104,48 @@ class LigerSiLUMulFunction(torch.autograd.Function):
     @staticmethod
     @ensure_contiguous
     def forward(ctx, a, b):
-        a, b, c = swiglu_forward(a, b)
-        ctx.save_for_backward(a, b)
-        return c
+        if isinstance(a, torch.distributed.tensor.DTensor) or isinstance(b, torch.distributed.tensor.DTensor):
+            device_mesh, placements = (
+                (a.device_mesh, a.placements)
+                if isinstance(a, torch.distributed.tensor.DTensor)
+                else (b.device_mesh, b.placements)
+            )
+
+            # Assume that full tensors are gathered before and identical across
+            # the associated process groups.
+            if not isinstance(a, torch.distributed.tensor.DTensor):
+                a = torch.distributed.tensor.distribute_tensor(a, device_mesh=device_mesh, placements=placements)
+            if not isinstance(b, torch.distributed.tensor.DTensor):
+                b = torch.distributed.tensor.distribute_tensor(b, device_mesh=device_mesh, placements=placements)
+            a_local, b_local, c_local = swiglu_forward(a.to_local(), b.to_local())
+            ctx.save_for_backward(a_local, b_local)
+            ctx.dtensor_metadata = (device_mesh, placements)
+            return torch.distributed.tensor.DTensor.from_local(c_local, device_mesh, placements)
+        else:
+            a, b, c = swiglu_forward(a, b)
+            ctx.save_for_backward(a, b)
+            ctx.dtensor_metadata = None
+            return c
 
     @staticmethod
     @ensure_contiguous
     def backward(ctx, dc):
         a, b = ctx.saved_tensors
+        if ctx.dtensor_metadata is not None:
+            device_mesh, placements = ctx.dtensor_metadata
+
+            # Assume that full tensors are gathered before and identical across
+            # the associated process groups.
+            dc_local = (
+                dc.to_local()
+                if isinstance(dc, torch.distributed.tensor.DTensor)
+                else torch.distributed.tensor.distribute_tensor(dc, device_mesh=device_mesh, placements=placements)
+            )
+            a_local, b_local = swiglu_backward(a, b, dc_local)
+            return (
+                torch.distributed.tensor.DTensor.from_local(a_local, device_mesh, placements),
+                torch.distributed.tensor.DTensor.from_local(b_local, device_mesh, placements),
+            )
+
         a, b = swiglu_backward(a, b, dc)
         return a, b
diff --git a/test/convergence/fp32/test_mini_models.py b/test/convergence/fp32/test_mini_models.py
@@ -1618,12 +1618,12 @@ def run_mini_model(
                     not LLAMA4_AVAILABLE,
                     reason="Llama4 not available in this version of trasnformers",
                 ),
-                pytest.mark.xfail(
-                    reason=(
-                        "RuntimeError: Expected query, key, and value to have the same dtype, but got query.dtype:"
-                        " float key.dtype: c10::BFloat16 and value.dtype: c10::BFloat16 instead."
-                    )
-                ),
+                # pytest.mark.xfail(
+                #    reason=(
+                #        "RuntimeError: Expected query, key, and value to have the same dtype, but got query.dtype:"
+                #        " float key.dtype: c10::BFloat16 and value.dtype: c10::BFloat16 instead."
+                #    )
+                # ),
             ],
         ),
         ("mini_llama3", 32, 1e-4, torch.float32, 1e-8, 2e-5, 5e-3, 1e-5, 5e-3, 1e-5),
diff --git a/test/transformers/test_swiglu.py b/test/transformers/test_swiglu.py
@@ -1,5 +1,8 @@
+import tempfile
+
 import pytest
 import torch
+import torch.multiprocessing as mp
 import transformers
 
 from packaging import version
@@ -16,6 +19,7 @@
 from liger_kernel.transformers.swiglu import LigerExperts
 from liger_kernel.transformers.swiglu import LigerPhi3SwiGLUMLP
 from liger_kernel.transformers.swiglu import LigerSwiGLUMLP
+from liger_kernel.utils import infer_comm_backend
 from liger_kernel.utils import infer_device
 
 IS_TRANSFORMERS_V5_OR_LATER = version.parse(transformers.__version__) >= version.parse("5.0.0")
@@ -405,3 +409,84 @@ def test_correctness_functional(bsz, seq_len, size, dtype, atol, rtol):
     # Check if gradients are close for x
     assert torch.allclose(x1.grad, x2.grad, atol=atol, rtol=rtol)
     assert torch.allclose(b1.grad, b2.grad, atol=atol, rtol=rtol)
+
+
+def _test_dtensor_liger_silumul(rank, world_size, bsz, seq_len, hidden_size, dtype, atol, rtol, file_name):
+    torch.distributed.init_process_group(
+        backend=infer_comm_backend(),
+        init_method=f"file://{file_name}",
+        rank=rank,
+        world_size=world_size,
+    )
+    device = f"{infer_device()}:{rank}" if infer_device() != "cpu" else "cpu"
+    device_mesh = torch.distributed.device_mesh.init_device_mesh(
+        infer_device(), mesh_shape=(world_size,), mesh_dim_names=("tp",)
+    )
+
+    _a = torch.randn(bsz, seq_len, hidden_size, device=device, dtype=dtype)
+    _b = torch.randn(bsz, seq_len, hidden_size, device=device, dtype=dtype)
+
+    # Broadcast from rank 0 so all ranks operate on identical tensors
+    torch.distributed.broadcast(_a, src=0)
+    torch.distributed.broadcast(_b, src=0)
+
+    assert hidden_size % world_size == 0, f"hidden_size ({hidden_size}) must be divisible by world_size ({world_size})"
+
+    # DTensor path: shard inputs along the hidden dim
+    a1 = _a.clone().detach().requires_grad_(True)
+    b1 = _b.clone().detach().requires_grad_(True)
+    da = torch.distributed.tensor.distribute_tensor(
+        a1, device_mesh=device_mesh, placements=[torch.distributed.tensor.Shard(2)]
+    )
+    db = torch.distributed.tensor.distribute_tensor(
+        b1, device_mesh=device_mesh, placements=[torch.distributed.tensor.Shard(2)]
+    )
+
+    # Regular tensor path
+    a2 = _a.clone().detach().requires_grad_(True)
+    b2 = _b.clone().detach().requires_grad_(True)
+
+    c1 = LigerSiLUMulFunction.apply(da, db)
+    c2 = LigerSiLUMulFunction.apply(a2, b2)
+
+    torch.testing.assert_close(c1.full_tensor(), c2, atol=atol, rtol=rtol)
+
+    grad = torch.randn_like(c2)
+    torch.distributed.broadcast(grad, src=0)
+    dgrad = torch.distributed.tensor.distribute_tensor(
+        grad, device_mesh=device_mesh, placements=[torch.distributed.tensor.Shard(2)]
+    )
+
+    c1.backward(dgrad)
+    c2.backward(grad)
+
+    torch.testing.assert_close(da.grad.full_tensor(), a2.grad, atol=atol, rtol=rtol)
+    torch.testing.assert_close(db.grad.full_tensor(), b2.grad, atol=atol, rtol=rtol)
+
+
+@pytest.mark.xfail(
+    torch.cuda.device_count() < 8,
+    reason="Pending multi-GPU host support. This test is expected to pass when run with multi-GPU host.",
+)
+@pytest.mark.parametrize(
+    "world_size, bsz, seq_len, hidden_size",
+    [
+        (4, 2, 2, 8),
+        (8, 9, 7, 64),
+    ],
+)
+@pytest.mark.parametrize(
+    "dtype, atol, rtol",
+    [
+        (torch.float32, 1e-4, 1e-6),
+        (torch.bfloat16, 2e-1, 2e-2),
+    ],
+)
+def test_dtensor_liger_silumul(world_size, bsz, seq_len, hidden_size, dtype, atol, rtol):
+    with tempfile.NamedTemporaryFile() as f:
+        mp.spawn(
+            _test_dtensor_liger_silumul,
+            args=(world_size, bsz, seq_len, hidden_size, dtype, atol, rtol, f.name),
+            nprocs=world_size,
+            join=True,
+        )