Commit df73f86

Add option to use triton tsyrk_ex kernels in muon (#50)
Also fixed some testing-related settings.

Signed-off-by: Hao Wu <[email protected]>

1 parent: bc58ee0

File tree

12 files changed: +240, -232 lines changed

docker/Dockerfile.ci

Lines changed: 0 additions & 1 deletion
@@ -34,7 +34,6 @@ RUN --mount=type=bind,source=pyproject.toml,target=/workspace/pyproject.toml \
     uv sync --link-mode symlink --locked --all-groups \
       --no-install-package absl-py \
       --no-install-package torch \
-      --no-install-package triton \
       --no-install-package nvidia-cublas-cu12 \
       --no-install-package nvidia-cuda-cupti-cu12 \
       --no-install-package nvidia-cuda-nvrtc-cu12 \
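
With the --no-install-package triton exclusion removed, uv sync now installs the locked Triton wheel into the CI image, matching the new triton>=3.4.0 pins added to pyproject.toml below (presumably the image previously relied on whatever Triton version the base image or the torch install provided).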

emerging_optimizers/orthogonalized_optimizers/muon.py

Lines changed: 20 additions & 1 deletion
@@ -16,8 +16,10 @@
 from typing import Callable

 import torch
+from absl import logging
 from torch.optim.optimizer import ParamsT

+from emerging_optimizers import triton_kernels
 from emerging_optimizers.orthogonalized_optimizers.muon_utils import newton_schulz
 from emerging_optimizers.orthogonalized_optimizers.orthogonalized_optimizer import OrthogonalizedOptimizer, _args_doc

@@ -56,6 +58,7 @@ class Muon(OrthogonalizedOptimizer):
         num_ns_steps: The number of iteration steps to use in the Newton-Schulz iteration.
         scale_mode: The type of scale factor to use for the update. Defaults to "spectral" style scaling.
         extra_scale_factor: The additional scale factor to use for the update.
+        use_syrk: Whether to use the Triton kernel for the Newton-Schulz iteration.
     """

     def __init__(
@@ -74,11 +77,27 @@ def __init__(
         num_ns_steps: int = 5,
         scale_mode: str = "spectral",
         extra_scale_factor: float = 1.0,
+        use_syrk: bool = False,
     ) -> None:
         if num_ns_steps < 1:
             raise ValueError(f"num_ns_steps must be at least 1, got {num_ns_steps}")

-        orthogonalize_fn = partial(newton_schulz, steps=num_ns_steps, coefficient_type=coefficient_type)
+        if use_syrk:
+            if torch.cuda.is_available():
+                sm_version = torch.cuda.get_device_capability()
+            else:
+                sm_version = (0, 0)
+            if not triton_kernels.HAS_TRITON_340:  # type: ignore[attr-defined]
+                logging.error("Triton 3.4.0 or higher is required for use_syrk to be True.")
+                use_syrk = False
+            elif sm_version not in ((8, 0), (9, 0), (10, 0), (10, 3)):
+                logging.error(
+                    f"Correctness of Triton kernel on SM {sm_version} cannot be guaranteed. Setting use_syrk to False."
+                )
+                use_syrk = False
+        orthogonalize_fn = partial(
+            newton_schulz, steps=num_ns_steps, coefficient_type=coefficient_type, use_syrk=use_syrk
+        )
         scale_factor_fn = partial(get_muon_scale_factor, mode=scale_mode, extra_scale_factor=extra_scale_factor)

         super().__init__(
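
For orientation, here is a minimal usage sketch of the new flag. Only num_ns_steps, scale_mode, extra_scale_factor, coefficient_type, and use_syrk are confirmed by this diff; the model and any other constructor arguments are illustrative assumptions:

import torch

# Hypothetical usage sketch; arguments not shown in the diff above are assumed.
model = torch.nn.Linear(4096, 4096)
optimizer = Muon(
    model.parameters(),
    num_ns_steps=5,        # Newton-Schulz iteration count (default per the diff)
    scale_mode="spectral",
    use_syrk=True,         # downgraded to False with an error log if Triton < 3.4.0
                           # or the GPU is not SM 8.0 / 9.0 / 10.0 / 10.3
)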

emerging_optimizers/orthogonalized_optimizers/muon_utils.py

Lines changed: 38 additions & 3 deletions
@@ -17,6 +17,8 @@
 import torch
 from absl import logging

+from emerging_optimizers import triton_kernels
+

 __all__ = ["newton_schulz", "newton_schulz_tp"]

@@ -70,6 +72,7 @@ def newton_schulz(
     eps: float = 1e-7,
     transpose: bool | None = None,
     tp_group: torch.distributed.ProcessGroup | None = None,
+    use_syrk: bool = False,
 ) -> torch.Tensor:
     """Use Newton-Schulz iteration to compute the zeroth power / orthogonalization of x.

@@ -97,6 +100,7 @@
         transpose: Whether to transpose the tensor to perform whitening on the smaller dimension.
             If None, will be determined based on the size of the tensor.
         tp_group: The process group for communication if input is distributed.
+        use_syrk: Whether to use the Triton kernel for the Newton-Schulz iteration.

     Returns:
         The orthogonalization of x.
@@ -131,6 +135,7 @@
     if steps % len(coefficient_sets) != 0:
         raise ValueError(f"steps ({steps}) must be multiple of len(coefficient_sets) ({len(coefficient_sets)}).")

+    ns_step_fn = newton_schulz_step
     # Perform the NS iterations
     if torch.get_float32_matmul_precision() == "medium":
         # PyTorch doesn't really have FP32 I/O BF16 compute kernels for precision "medium"
@@ -140,10 +145,12 @@
         # is always in FP32.
         X = X.to(torch.bfloat16)
         logging.log_first_n(logging.INFO, "Using BF16 I/O kernels for Newton-Schulz iteration.", 1)
+        if use_syrk:
+            ns_step_fn = newton_schulz_step_tsyrk

     for i in range(steps):
         a, b, c = coefficient_sets[i % len(coefficient_sets)]
-        X = newton_schulz_step(X, a, b, c, tp_group=tp_group)
+        X = ns_step_fn(X, a, b, c, tp_group=tp_group)

     # Convert back to FP32. This is a noop if X is already in FP32.
     X = X.to(torch.float32)
@@ -244,6 +251,34 @@ def newton_schulz_step(
     A = X @ X.mT
     if tp_group is not None:
         torch.distributed.all_reduce(A, op=torch.distributed.ReduceOp.SUM, group=tp_group)
-    B = torch.addmm(A, A, A, beta=b, alpha=c)
-    X = torch.addmm(X, B, X, beta=a, alpha=1.0)
+    B = torch.addmm(A, A, A, alpha=c, beta=b)
+    X = torch.addmm(X, B, X, alpha=1.0, beta=a)
+    return X
+
+
+def newton_schulz_step_tsyrk(
+    X: torch.Tensor, a: float, b: float, c: float, tp_group: torch.distributed.ProcessGroup | None = None
+) -> torch.Tensor:
+    """Perform a single Newton-Schulz iteration step.
+
+    This function performs a single Newton-Schulz iteration step using the Triton kernel for extended syrk.
+
+    Arguments:
+        X: The tensor to be orthogonalized. Must be bfloat16.
+        a: The a coefficient.
+        b: The b coefficient.
+        c: The c coefficient.
+        tp_group: The process group to use for the all-reduce.
+
+    Returns:
+        The orthogonalization of X.
+    """
+    assert triton_kernels.HAS_TRITON_340, (  # type: ignore[attr-defined]
+        "Triton version doesn't support tensor descriptor API. Minimum required version is 3.4.0."
+    )
+    A = triton_kernels.tsyrk_ex(X)  # type: ignore[attr-defined]
+    if tp_group is not None:
+        torch.distributed.all_reduce(A, op=torch.distributed.ReduceOp.SUM, group=tp_group)
+    B = triton_kernels.tsyrk_ex(A, A, alpha=c, beta=b)  # type: ignore[attr-defined]
+    X = torch.addmm(X, B, X, alpha=1.0, beta=a)
     return X
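
For reference, both step functions implement the same cubic polynomial update. Writing A = X Xᵀ, the addmm calls (where beta scales the matrix being added and alpha scales the matrix product) and their tsyrk_ex counterparts both evaluate

    B  = b·A + c·A²
    X' = a·X + B·X = (a·I + b·A + c·A²) X

so switching use_syrk on changes only how A = X Xᵀ and B are materialized (tsyrk_ex exploits the symmetry of the output), not the arithmetic of the iteration.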

emerging_optimizers/triton_kernels/syrk.py

Lines changed: 31 additions & 144 deletions
@@ -13,143 +13,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # type: ignore
+import sys
+
 import torch
 import triton
 import triton.language as tl
+from absl import logging


 try:
     from triton.tools.tensor_descriptor import TensorDescriptor
-except ImportError:
-    raise ImportError(
-        f"Triton version ({triton.__version__}) doesn't support tensor descriptor API. Minimum required version is 3.4.0."
-    )
-
-
-__all__ = ["ssyrk", "tsyrk_ex"]
-
-
-@triton.jit
-def cvt_tf32_rn(x: tl.tensor) -> tl.tensor:
-    return tl.inline_asm_elementwise("cvt.rna.tf32.f32 $0, $1;", "=r, r", [x], dtype=tl.float32, is_pure=True, pack=1)

+    HAS_TRITON_340 = True
+except ImportError:
+    HAS_TRITON_340 = False

-
-@triton.autotune(
-    configs=[
-        triton.Config({"TILE_N": tn, "TILE_K": tk}, num_warps=nw, num_stages=ns)
-        for tn in (64, 128)
-        for tk in (16, 32, 64)
-        for nw in (4, 8)
-        for ns in (3, 4)
-    ],
-    key=["N", "K", "ALLOW_TF32"],
-)
-@triton.jit
-def syrk_op_n_simple_kernel(
-    c_ptr,
-    a_ptr,
-    N: tl.constexpr,
-    K: tl.constexpr,
-    STRIDE_N: tl.constexpr,
-    STRIDE_K: tl.constexpr,
-    ALLOW_TF32: tl.constexpr,
-    TILE_N: tl.constexpr,
-    TILE_K: tl.constexpr,
-):
-    # receives tensor of shape (N, K)
-    # computes A * A^T (-> produces NxN)
-
-    pid_row = tl.program_id(0)
-    pid_col = tl.program_id(1)
-
-    IS_BELOW_DIAG = pid_row < pid_col
-    IS_ABOVE_DIAG = pid_row > pid_col
-
-    if IS_ABOVE_DIAG:
-        return
-
-    offs_row = pid_row * TILE_N + tl.arange(0, TILE_N)
-    offs_col = pid_col * TILE_N + tl.arange(0, TILE_N)
-    offs_k = tl.arange(0, TILE_K)
-
-    mask_row = offs_row < N
-    mask_col = offs_col < N
-
-    a_ptrs_x = a_ptr + offs_row[:, None] * STRIDE_N + offs_k[None, :] * STRIDE_K
-    a_ptrs_y = a_ptr + offs_col[None, :] * STRIDE_N + offs_k[:, None] * STRIDE_K
-
-    acc = tl.zeros((TILE_N, TILE_N), dtype=tl.float32)
-
-    num_tiles_k = tl.cdiv(K, TILE_K)
-    for k in range(0, num_tiles_k):
-        mask_k = offs_k < K - k * TILE_K
-        mask_x = mask_row[:, None] & mask_k[None, :]
-        mask_y = mask_col[None, :] & mask_k[:, None]
-        x = tl.load(a_ptrs_x, mask=mask_x, other=0.0)
-        y = tl.load(a_ptrs_y, mask=mask_y, other=0.0)
-
-        if ALLOW_TF32 == 0:
-            acc = tl.dot(x, y, acc=acc, input_precision="ieee")
-        elif ALLOW_TF32 == 1:
-            x = cvt_tf32_rn(x)
-            y = cvt_tf32_rn(y)
-            acc = tl.dot(x, y, acc=acc, input_precision="tf32")
-        else:
-            tl.static_assert(False, "Unsupported precision.")
-
-        a_ptrs_x += TILE_K * STRIDE_K
-        a_ptrs_y += TILE_K * STRIDE_K
-
-    # store diagonal or below diagonal values
-    c_ptrs = c_ptr + offs_row[:, None] * N + offs_col[None, :]
-    mask_c = mask_row[:, None] & mask_col[None, :]
-    tl.store(c_ptrs, acc, mask=mask_c)
-
-    # store replicated values above diagonal
-    if IS_BELOW_DIAG:
-        c_ptrs_diag = c_ptr + offs_col[None, :] * N + offs_row[:, None]
-        tl.store(c_ptrs_diag, acc, mask=mask_c)
-
-
-def ssyrk(a: torch.Tensor, trans: bool = False) -> torch.Tensor:
-    """Triton implementation of BLAS ssyrk operation.
-
-    Note:
-        This function assumes row major layout of the input tensor.
-
-    TODO(mstadler): Add support for alpha, beta and c.
-
-    Args:
-        a: Input tensor of shape (N, K) or (K, N)
-        trans: Whether to compute A * A^T (trans=False) or A^T * A (trans=True)
-
-    Returns:
-        Output tensor of shape (N, N)
-    """
-    assert a.dim() == 2, "Input tensor must be 2D"
-    N, K = a.shape
-    if trans:
-        raise NotImplementedError("Transpose is not supported yet.")
-
-    STRIDE_N = a.stride(0)
-    STRIDE_K = a.stride(1)
-
-    if (fp32_matmul_prec := torch.get_float32_matmul_precision()) == "highest":
-        ALLOW_TF32 = 0
-    elif fp32_matmul_prec == "high":
-        ALLOW_TF32 = 1
-    else:
-        raise ValueError(f"Unsupported precision {fp32_matmul_prec}, only 'highest' and 'high' are supported.")
-
-    c = torch.empty((N, N), dtype=a.dtype, device=a.device)
-
-    def grid(META):
-        return (triton.cdiv(N, META["TILE_N"]), triton.cdiv(N, META["TILE_N"]))
-
-    if not trans:
-        syrk_op_n_simple_kernel[grid](c, a, N, K, STRIDE_N, STRIDE_K, ALLOW_TF32)

-    return c
+__all__ = ["tsyrk_ex", "HAS_TRITON_340"]


 def prune_invalid_configs(configs: list[triton.Config], named_args: dict, **kwargs) -> list[triton.Config]:
@@ -199,23 +79,30 @@ def matmul_tma_set_block_size_hook(nargs: dict) -> None:
         nargs["d_t_desc"].block_shape = [TILE_N, TILE_M]


+_CONFIGS = [
+    triton.Config(
+        {"TILE_M": tm, "TILE_N": tn, "TILE_K": tk, "GROUP_SIZE_M": gm},
+        num_warps=nw,
+        num_stages=ns,
+        num_ctas=nc,
+        pre_hook=matmul_tma_set_block_size_hook,
+    )
+    for tm in (64, 128, 256)
+    for tn in (64, 128, 256)
+    for tk in (64, 128, 256)
+    for gm in (2, 4, 8)
+    for nw in (4, 8)
+    for ns in (2, 3, 4)
+    for nc in (1,)
+]
+
+if "absl.testing" in sys.modules.keys():
+    logging.warning("Running in absl.testing mode, disable autotune for triton.")
+    _CONFIGS = _CONFIGS[:1]
+
+
 @triton.autotune(
-    configs=[
-        triton.Config(
-            {"TILE_M": tm, "TILE_N": tn, "TILE_K": tk, "GROUP_SIZE_M": gm},
-            num_warps=nw,
-            num_stages=ns,
-            num_ctas=nc,
-            pre_hook=matmul_tma_set_block_size_hook,
-        )
-        for tm in (64, 128, 256)
-        for tn in (64, 128, 256)
-        for tk in (64, 128, 256)
-        for gm in (2, 4, 8)
-        for nw in (4, 8)
-        for ns in (2, 3, 4)
-        for nc in (1,)
-    ],
+    configs=_CONFIGS,
     key=["N", "K", "TRANS", "WARP_SPECIALIZE"],
     prune_configs_by={"early_config_prune": prune_invalid_configs},
 )
@@ -316,7 +203,7 @@ def tsyrk_ex(
     Returns:
         Output tensor of shape (N, N)
     """
-
+    assert a.dtype == torch.bfloat16, "Input tensor must be bfloat16"
     assert a.dim() == 2, "Input tensor must be 2D"
     assert a.is_contiguous() or a.T.is_contiguous(), "invalid input tensor layout. a or a.T must be contiguous."
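
Two behavioral notes on this rewrite: the module no longer raises at import time on older Triton, instead recording availability in HAS_TRITON_340 so callers can fall back; and hoisting the config list into _CONFIGS lets the module shrink it to a single configuration when absl.testing is already imported, so unit tests skip the slow autotuning sweep. A minimal sketch of the feature-detection pattern a caller might use (the matmul fallback is an illustration, not code from this commit):

import torch

from emerging_optimizers import triton_kernels


def gram_matrix(x: torch.Tensor) -> torch.Tensor:
    # tsyrk_ex needs Triton >= 3.4.0, a CUDA tensor, and bfloat16 input
    # (per the asserts added in this commit).
    if triton_kernels.HAS_TRITON_340 and x.is_cuda and x.dtype == torch.bfloat16:
        return triton_kernels.tsyrk_ex(x)  # X @ X^T via the symmetric rank-k kernel
    return x @ x.mT  # plain matmul fallback (illustrative)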

pyproject.toml

Lines changed: 7 additions & 1 deletion
@@ -79,11 +79,13 @@ test = [
     "coverage>=7.8.1",
     "flake8>=7.2.0",
     "pylint>=3.3.7",
+    "triton>=3.4.0",
 ]
 dev = [
     "pre-commit>=3.6.0",
     "ruff>=0.9.9",
     "mypy>=1.8.0",
+    "triton>=3.4.0",
 ]

 [tool.uv]
@@ -169,6 +171,10 @@ omit = ["/tmp/*"]
 relative_files = true
 source = ["emerging_optimizers"]

-
 [tool.coverage.paths]
 source = ["emerging_optimizers/", "/workspace/emerging_optimizers"]
+
+[tool.coverage.report]
+exclude_also = [
+    "@triton"
+]
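
For context: coverage.py's exclude_also entries are regular expressions matched against source lines, so the "@triton" pattern excludes functions decorated with @triton.jit or @triton.autotune from coverage reporting; their bodies compile to GPU kernels rather than executing as ordinary Python, so they would otherwise show up as permanently uncovered lines. (The committed section header read "[too.coverage.report]", corrected to "[tool.coverage.report]" above.)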

tests/ci/L0_Tests_CPU.sh

Lines changed: 8 additions & 5 deletions
@@ -12,8 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 export TORCH_COMPILE_DISABLE=1
-set -o pipefail
-torchrun --nproc_per_node=8 --no-python coverage run -p tests/test_distributed_muon_utils_cpu.py
-torchrun --nproc_per_node=4 --no-python coverage run -p tests/test_distributed_muon_utils_cpu.py
-coverage run -p --source=emerging_optimizers tests/test_scalar_optimizers.py --device=cpu
-coverage run -p --source=emerging_optimizers tests/test_procrustes_step.py --device=cpu
+
+error=0
+torchrun --nproc_per_node=8 --no-python coverage run -p tests/test_distributed_muon_utils_cpu.py || error=1
+torchrun --nproc_per_node=4 --no-python coverage run -p tests/test_distributed_muon_utils_cpu.py || error=1
+coverage run -p --source=emerging_optimizers tests/test_scalar_optimizers.py --device=cpu || error=1
+coverage run -p --source=emerging_optimizers tests/test_procrustes_step.py --device=cpu || error=1
+
+exit "${error}"
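
Worth noting: set -o pipefail only affects pipelines, and none of these commands are pipelines, so previously the script's exit status was simply that of the last command and an earlier torchrun failure could pass CI. The new error accumulator runs every suite regardless of failures and reports any of them through the final exit status.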
