
Commit b828275

mta + softmax docs (#730)
## Summary

## Testing Done

- Hardware Type: RTX 3090
- [x] run `make test` to ensure correctness
- [x] run `make checkstyle` to ensure code style
- [x] run `make test-convergence` to ensure convergence

Co-authored-by: Shao Tang <[email protected]>
1 parent e99bbb5 commit b828275

5 files changed (+23, -6 lines)

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,8 @@ loss.backward()
277277
| GeGLU | `liger_kernel.transformers.LigerGEGLUMLP` |
278278
| CrossEntropy | `liger_kernel.transformers.LigerCrossEntropyLoss` |
279279
| Fused Linear CrossEntropy | `liger_kernel.transformers.LigerFusedLinearCrossEntropyLoss`|
280+
| Multi Token Attention | `liger_kernel.transformers.LigerMultiTokenAttention` |
281+
| Softmax | `liger_kernel.transformers.LigerSoftmax` |
280282
| Sparsemax | `liger_kernel.transformers.LigerSparsemax` |
281283

282284

benchmark/scripts/benchmark_softmax.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from utils import parse_benchmark_script_args
99
from utils import run_benchmarks
1010

11-
from liger_kernel.transformers.softmax import LigerKernelSoftmax
11+
from liger_kernel.transformers.softmax import LigerSoftmax
1212
from liger_kernel.utils import infer_device
1313

1414
device = infer_device()
@@ -23,7 +23,7 @@ def bench_speed_softmax(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOut
2323
dtype = extra_benchmark_config["dtype"]
2424

2525
x_shape = (M, N)
26-
liger_softmax = LigerKernelSoftmax().to(device).to(dtype)
26+
liger_softmax = LigerSoftmax().to(device).to(dtype)
2727
torch_softmax = torch.nn.Softmax(dim=-1).to(device).to(dtype)
2828

2929
x = torch.randn(x_shape, dtype=dtype, device=device)
@@ -72,7 +72,7 @@ def bench_memory_softmax(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOu
7272
dtype = extra_benchmark_config.get("dtype", torch.float32)
7373

7474
torch_softmax = torch.nn.Softmax(dim=-1)
75-
liger_softmax = LigerKernelSoftmax().to(device).to(dtype)
75+
liger_softmax = LigerSoftmax().to(device).to(dtype)
7676

7777
x = torch.randn(shape, device=device, dtype=dtype, requires_grad=True)
7878

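For a quick sanity check outside the benchmark harness above, a minimal timing sketch follows. It assumes a CUDA device and an installed `liger_kernel`; the shape, dtype, and iteration count are illustrative and not the values used by `benchmark_softmax.py`.

```python
# Minimal standalone timing sketch (not the benchmark harness above).
# Assumes a CUDA-capable device; shape, dtype, and iteration count are illustrative.
import torch

from liger_kernel.transformers.softmax import LigerSoftmax

device, dtype = "cuda", torch.float32
x = torch.randn(4096, 8192, device=device, dtype=dtype)

liger_softmax = LigerSoftmax().to(device).to(dtype)
torch_softmax = torch.nn.Softmax(dim=-1).to(device).to(dtype)


def time_fn(fn, iters=50):
    # Warm up, then time with CUDA events (elapsed_time returns milliseconds).
    for _ in range(5):
        fn(x)
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    for _ in range(iters):
        fn(x)
    end.record()
    torch.cuda.synchronize()
    return start.elapsed_time(end) / iters


print(f"torch softmax: {time_fn(torch_softmax):.3f} ms")
print(f"liger softmax: {time_fn(liger_softmax):.3f} ms")
```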
docs/Low-Level-APIs.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
| GeGLU | `liger_kernel.transformers.LigerGEGLUMLP` |
1010
| CrossEntropy | `liger_kernel.transformers.LigerCrossEntropyLoss` |
1111
| Fused Linear CrossEntropy | `liger_kernel.transformers.LigerFusedLinearCrossEntropyLoss`|
12+
| Multi Token Attention | `liger_kernel.transformers.LigerMultiTokenAttention` |
13+
| Softmax | `liger_kernel.transformers.LigerSoftmax` |
1214
| Sparsemax | `liger_kernel.transformers.LigerSparsemax` |
1315

1416

@@ -51,6 +53,19 @@ This kernel combines linear transformations with cross-entropy loss calculations
5153
!!! Example "Try it out"
5254
You can experiment as shown in this example [here](https://colab.research.google.com/drive/1Z2QtvaIiLm5MWOs7X6ZPS1MN3hcIJFbj?usp=sharing)
5355

56+
### Multi Token Attention
57+
58+
The Multi Token Attention kernel implementation provides and optimized fused implementation of multi-token attention over the implemented Pytorch model baseline. This is a new attention mechanism that can operate on multiple Q and K inputs introduced by Meta Research.
59+
60+
Paper: https://arxiv.org/abs/2504.00927
61+
62+
### Softmax
63+
64+
The Softmax kernel implementation provides an optimized implementation of the softmax operation, which is a fundamental component in neural networks for converting raw scores into probability distributions.
65+
66+
The implementation shows notable speedups compared to the Softmax PyTorch implementation
67+
68+
5469
### Sparsemax
5570

5671
Sparsemax is a sparse alternative to softmax that produces sparse probability distributions. This kernel implements an efficient version of the sparsemax operation that can be used as a drop-in replacement for softmax in attention mechanisms or classification tasks.
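To make the mechanism described in the new Multi Token Attention section concrete, here is a small PyTorch sketch of the key-query convolution idea from the paper: attention logits are mixed across neighbouring query and key positions with a depthwise 2D convolution before the softmax. This is an illustrative baseline only, not the Liger kernel; the constructor arguments of `LigerMultiTokenAttention` are not shown in this commit, so the sketch does not assume any.

```python
# Illustrative PyTorch baseline of the key-query convolution idea from
# multi-token attention (https://arxiv.org/abs/2504.00927).
# This is NOT the Liger kernel; it only sketches the mechanism.
import math

import torch
import torch.nn.functional as F


def multi_token_attention_sketch(q, k, v, conv_weight):
    """Causal attention whose logits are mixed over (query, key) offsets.

    q, k, v:     (batch, heads, seq_len, head_dim)
    conv_weight: (heads, 1, c_q, c_k) depthwise kernel, odd c_q and c_k
    """
    b, h, s, d = q.shape
    logits = q @ k.transpose(-2, -1) / math.sqrt(d)  # (b, h, s, s)

    causal = torch.ones(s, s, dtype=torch.bool, device=q.device).tril()
    # Zero out future positions before the convolution so -inf never leaks in,
    # then re-apply the causal mask afterwards.
    mixed = F.conv2d(
        logits.masked_fill(~causal, 0.0), conv_weight, padding="same", groups=h
    )
    mixed = mixed.masked_fill(~causal, float("-inf"))

    return F.softmax(mixed, dim=-1) @ v  # (b, h, s, head_dim)
```

With `conv_weight` set to an identity kernel (1 at the centre, 0 elsewhere), this reduces to ordinary causal attention, which makes it a convenient correctness reference.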

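The `LigerSoftmax` module documented in the new Softmax section is used like `torch.nn.Softmax(dim=-1)`; the sketch below mirrors the pattern in the benchmark and test diffs of this commit (shape and dtype are illustrative).

```python
import torch

from liger_kernel.transformers.softmax import LigerSoftmax
from liger_kernel.utils import infer_device

device = infer_device()
dtype = torch.float32

x = torch.randn(128, 4096, device=device, dtype=dtype)

# Drop-in replacement for torch.nn.Softmax(dim=-1).
liger_softmax = LigerSoftmax().to(device).to(dtype)
probs = liger_softmax(x)

# Each row is a probability distribution: non-negative and sums to 1.
assert torch.allclose(
    probs.sum(dim=-1), torch.ones(128, device=device, dtype=dtype), atol=1e-5
)
```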
src/liger_kernel/transformers/softmax.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from liger_kernel.ops.softmax import LigerSoftmaxFunction
55

66

7-
class LigerKernelSoftmax(nn.Module):
7+
class LigerSoftmax(nn.Module):
88
def __init__(self):
99
super().__init__()
1010

test/transformers/test_softmax.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from test.utils import supports_bfloat16
77

88
from liger_kernel.transformers.functional import liger_softmax
9-
from liger_kernel.transformers.softmax import LigerKernelSoftmax
9+
from liger_kernel.transformers.softmax import LigerSoftmax
1010
from liger_kernel.utils import infer_device
1111

1212
device = infer_device()
@@ -47,7 +47,7 @@ def test_liger_softmax(shape, dtype, atol, rtol):
4747

4848
torch_softmax = torch.nn.Softmax(dim=-1)
4949
ref_out = torch_softmax(x1)
50-
liger_softmax = LigerKernelSoftmax().to(device).to(dtype)
50+
liger_softmax = LigerSoftmax().to(device).to(dtype)
5151
liger_out = liger_softmax(x2)
5252

5353
assert_verbose_allclose(ref_out, liger_out, atol=atol, rtol=rtol)
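The test file also imports the functional entry point `liger_kernel.transformers.functional.liger_softmax`. Its exact signature is not visible in this diff, so the call below assumes it takes a tensor and applies softmax over the last dimension, mirroring the module form; treat it as a sketch rather than the documented API.

```python
import torch

from liger_kernel.transformers.functional import liger_softmax

x = torch.randn(32, 1024, device="cuda")

# Assumption: liger_softmax(x) applies softmax over the last dimension.
out = liger_softmax(x)
torch.testing.assert_close(out, torch.softmax(x, dim=-1), rtol=1e-5, atol=1e-6)
```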
