port test over

skyw · skyw · commit fcd5b0185740 · 2025-09-15T14:28:03.000-07:00
Signed-off-by: Hao Wu &lt;skyw@nvidia.com&gt;
diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
@@ -42,7 +42,7 @@ jobs:
     uses: ./.github/workflows/_build_container.yml
     needs: cicd-wait-in-queue
     with:
-      image-name: llm_shower
+      image-name: emerging_optimizers
       dockerfile: docker/Dockerfile.ci
       runner: self-hosted-nemo
     secrets:
@@ -72,7 +72,7 @@ jobs:
           script: ${{ matrix.script }}
           timeout: ${{ matrix.timeout || 10 }}
           is_unit_test: "true"
-          image: llm_shower
+          image: emerging_optimizers
           cpu-only: ${{ matrix.cpu-only || false }}
           has-azure-credentials: "true"
           azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
@@ -100,7 +100,7 @@ jobs:
           runner: ${{ runner.name }}
           script: ${{ matrix.script }}
           timeout: ${{ matrix.timeout || 10 }}
-          image: llm_shower
+          image: emerging_optimizers
           cpu-only: ${{ matrix.cpu-only || false }}
           has-azure-credentials: "true"
           azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
diff --git a/docker/Dockerfile.ci b/docker/Dockerfile.ci
@@ -25,8 +25,8 @@ ENV PATH="$UV_PROJECT_ENVIRONMENT/bin:$PATH"
 
 WORKDIR /workspace
 RUN --mount=type=bind,source=pyproject.toml,target=/workspace/pyproject.toml \
-    --mount=type=bind,source=llm_shower/__init__.py,target=/workspace/llm_shower/__init__.py \
-    --mount=type=bind,source=llm_shower/package_info.py,target=/workspace/llm_shower/package_info.py \
+    --mount=type=bind,source=emerging_optimizers/__init__.py,target=/workspace/emerging_optimizers/__init__.py \
+    --mount=type=bind,source=emerging_optimizers/package_info.py,target=/workspace/emerging_optimizers/package_info.py \
     --mount=type=bind,source=uv.lock,target=/workspace/uv.lock bash -exu <<"EOF"
 
     # Use the container's torch installation rather than reinstall it
diff --git a/emerging_optimizers/orthogonalized_optimizers/__init__.py b/emerging_optimizers/orthogonalized_optimizers/__init__.py
@@ -12,5 +12,5 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from emerging_optimizers.orthogonalized_optimizers.orthogonalized_optimizer import *
 from emerging_optimizers.orthogonalized_optimizers.muon import *
+from emerging_optimizers.orthogonalized_optimizers.orthogonalized_optimizer import *
diff --git a/emerging_optimizers/orthogonalized_optimizers/muon.py b/emerging_optimizers/orthogonalized_optimizers/muon.py
@@ -12,14 +12,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Callable
 from functools import partial
+from typing import Callable
 
 import torch
 from torch.optim.optimizer import ParamsT
 
-from emerging_optimizers.orthogonalized_optimizers.orthogonalized_optimizer import OrthogonalizedOptimizer, _args_doc
 from emerging_optimizers.orthogonalized_optimizers.muon_utils import newton_schulz
+from emerging_optimizers.orthogonalized_optimizers.orthogonalized_optimizer import OrthogonalizedOptimizer, _args_doc
 
 
 class Muon(OrthogonalizedOptimizer):
diff --git a/emerging_optimizers/orthogonalized_optimizers/muon_utils.py b/emerging_optimizers/orthogonalized_optimizers/muon_utils.py
@@ -17,6 +17,7 @@
 import torch
 from absl import logging
 
+
 __all__ = ["newton_schulz", "newton_schulz_tp"]
 
 _COEFFICIENT_SETS = {
@@ -216,3 +217,33 @@ def newton_schulz_tp(
         raise ValueError(f"Invalid mode: {mode}")
 
     return output
+
+
+def newton_schulz_step(
+    X: torch.Tensor, a: float, b: float, c: float, tp_group: torch.distributed.ProcessGroup | None = None
+) -> torch.Tensor:
+    """Perform a single Newton-Schulz iteration step.
+
+    With ``A = X @ X.mT``, the step returns ``a * X + (b * A + c * A @ A) @ X``. It supports distributed input
+    that's sharded along the smaller (orthogonalize) dimension: when ``tp_group`` is given, ``A`` is summed
+    across the group before it is used.
+
+    Warning:
+        If distributed, this function doesn't have the information to verify that X is sharded along the smaller
+        (orthogonalize) dimension. It is the user's responsibility to ensure that X is sharded correctly.
+
+    Arguments:
+        X: The tensor to be orthogonalized, or this rank's shard of it when ``tp_group`` is given.
+        a: Coefficient applied to ``X``.
+        b: Coefficient applied to ``A``.
+        c: Coefficient applied to ``A @ A``.
+        tp_group: The process group to use for the all-reduce. ``None`` skips the all-reduce.
+
+    Returns:
+        The result of one iteration step applied to ``X``; it has the same shape as the input ``X``.
+    """
+    # NOTE(review): the all-reduce below sums the per-rank X @ X.mT products. That sum equals the Gram matrix
+    # of the full tensor when the shards partition the last dimension of X -- confirm this is what the
+    # "smaller (orthogonalize) dimension" wording in the docstring refers to.
+    A = X @ X.mT
+    if tp_group is not None:
+        # In-place sum over the group, so every rank continues with the same A.
+        torch.distributed.all_reduce(A, op=torch.distributed.ReduceOp.SUM, group=tp_group)
+    # addmm(input, m1, m2, beta, alpha) = beta * input + alpha * (m1 @ m2), hence B = b * A + c * (A @ A).
+    B = torch.addmm(A, A, A, beta=b, alpha=c)
+    # X <- a * X + B @ X
+    X = torch.addmm(X, B, X, beta=a, alpha=1.0)
+    return X
diff --git a/emerging_optimizers/orthogonalized_optimizers/orthogonalized_optimizer.py b/emerging_optimizers/orthogonalized_optimizers/orthogonalized_optimizer.py
@@ -14,10 +14,9 @@
 # limitations under the License.
 from typing import Any, Callable, override
 
-from absl import logging
-
 import torch
 import torch.optim as optim
+from absl import logging
 from torch.optim.optimizer import ParamsT
 
 from emerging_optimizers import utils
diff --git a/emerging_optimizers/utils/__init__.py b/emerging_optimizers/utils/__init__.py
@@ -12,15 +12,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from .eig import *
-
 from contextlib import contextmanager
 from typing import Generator
+
 import torch
 
-__all__ = [
-    "fp32_matmul_precision", "get_pg_size", "get_pg_rank"
-]
+
+__all__ = ["fp32_matmul_precision", "get_pg_size", "get_pg_rank"]
 
 
 @contextmanager
diff --git a/pyproject.toml b/pyproject.toml
@@ -194,6 +194,9 @@ line-ending = "auto"
 [tool.coverage.run]
 concurrency = ["thread", "multiprocessing"]
 omit = ["/tmp/*"]
+relative_files = true
+source = ["emerging_optimizers"]
+
 
 [tool.coverage.paths]
-source = ["llm_shower/", "/workspace/llm_shower"]
+source = ["emerging_optimizers/", "/workspace/emerging_optimizers"]
diff --git a/tests/test_distributed_muon_utils_cpu.py b/tests/test_distributed_muon_utils_cpu.py
@@ -0,0 +1,202 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+
+import numpy as np
+import torch
+from absl.testing import absltest, parameterized
+
+from emerging_optimizers.orthogonalized_optimizers import muon_utils
+
+
+class DistributedNewtonSchulzStepCpuTest(parameterized.TestCase):
+    """Compares `muon_utils.newton_schulz_step` on sharded input with the unsharded result.
+
+    The tests issue `torch.distributed` collectives, so the default process group must already be initialized
+    when they run.
+    """
+
+    def setUp(self):
+        # Forwarded positionally to newton_schulz_step as its (a, b, c) coefficients.
+        self.coefs = 3.4445, -4.7750, 2.0315
+
+    @parameterized.parameters(
+        {"shape": (21, 16)},
+        {"shape": (16, 32)},
+    )
+    def test_close_to_non_distributed(self, shape):
+        """One step on this rank's column shard matches the same shard of the full-tensor step."""
+        x = torch.nn.functional.normalize(torch.randint(-5, 5, shape, device="cpu", dtype=torch.float32), dim=(-2, -1))
+        # All-reduce ensures that every rank gets the same x
+        torch.distributed.all_reduce(x, op=torch.distributed.ReduceOp.SUM)
+
+        world_size = torch.distributed.get_world_size()
+        rank = torch.distributed.get_rank()
+        local_x = x.chunk(world_size, dim=1)[rank]
+
+        dist_out = muon_utils.newton_schulz_step(local_x, *self.coefs, tp_group=torch.distributed.group.WORLD)
+
+        ref_out = muon_utils.newton_schulz_step(x, *self.coefs)
+
+        torch.testing.assert_close(ref_out.chunk(world_size, dim=1)[rank], dist_out)
+
+    # tp_size goes up to 4, so at least 4 ranks are needed. The decorator is evaluated at class-definition
+    # time, before the __main__ guard initializes the process group, so the rank count is read from the
+    # WORLD_SIZE environment variable instead of torch.distributed.
+    @absltest.skipIf(int(os.environ.get("WORLD_SIZE", 1)) < 4, "test requires at least 4 ranks")
+    @parameterized.product(
+        shape=((21, 16), (16, 32)),
+        tp_size=(2, 4),
+    )
+    def test_with_partial_tp(self, shape, tp_size):
+        """Same comparison as above, with the all-reduce restricted to a subgroup of `tp_size` ranks."""
+        x = torch.nn.functional.normalize(torch.randint(-5, 5, shape, device="cpu", dtype=torch.float32), dim=(-2, -1))
+        # All-reduce ensures that every rank gets the same x
+        torch.distributed.all_reduce(x, op=torch.distributed.ReduceOp.SUM)
+
+        # Split the world into equally sized, contiguous rank ranges; tp_group is the one this rank belongs to.
+        num_tp_groups = torch.distributed.get_world_size() // tp_size
+        tp_group, _ = torch.distributed.new_subgroups_by_enumeration(
+            np.split(np.arange(torch.distributed.get_world_size()), num_tp_groups)
+        )
+        assert tp_group.size() == tp_size
+        local_x = x.chunk(tp_group.size(), dim=1)[tp_group.rank()]
+
+        dist_out = muon_utils.newton_schulz_step(local_x, *self.coefs, tp_group=tp_group)
+        ref_out = muon_utils.newton_schulz_step(x, *self.coefs)
+        torch.testing.assert_close(ref_out.chunk(tp_group.size(), dim=1)[tp_group.rank()], dist_out)
+
+
+class DistributedNewtonSchulzCpuTest(parameterized.TestCase):
+    """Compares the distributed code paths of `muon_utils` with their single-process results.
+
+    Each test builds an identical tensor on every rank, runs the distributed function on this rank's shard and
+    checks it against the matching shard of the non-distributed output. The default process group must already
+    be initialized when the tests run.
+    """
+
+    @parameterized.parameters(
+        {"shape": (21, 16)},
+        {"shape": (16, 32)},
+    )
+    def test_distributed_normalize_close_to_non_distributed(self, shape):
+        """`distributed_normalize_p2` on a column shard matches `torch.nn.functional.normalize` on the full x."""
+        x = torch.randint(-5, 5, shape, device="cpu", dtype=torch.float32)
+        # All-reduce ensures that every rank gets the same x
+        torch.distributed.all_reduce(x, op=torch.distributed.ReduceOp.SUM)
+
+        world_size = torch.distributed.get_world_size()
+        rank = torch.distributed.get_rank()
+        local_x = x.chunk(world_size, dim=1)[rank]
+
+        dist_out = muon_utils.distributed_normalize_p2(local_x, eps=1e-7, group=torch.distributed.group.WORLD)
+        ref_out = torch.nn.functional.normalize(x, dim=(-2, -1), eps=1e-7)
+
+        torch.testing.assert_close(ref_out.chunk(world_size, dim=1)[rank], dist_out)
+
+    @parameterized.parameters(
+        {"shape": (3, 32)},
+        {"shape": (5, 100)},
+    )
+    def test_1step_close_to_non_distributed(self, shape):
+        """A single `newton_schulz` step over the WORLD group matches the non-distributed step."""
+        x = torch.randint(-5, 5, shape, device="cpu", dtype=torch.float32)
+        # All-reduce ensures that every rank gets the same x
+        torch.distributed.all_reduce(x, op=torch.distributed.ReduceOp.SUM)
+
+        world_size = torch.distributed.get_world_size()
+        rank = torch.distributed.get_rank()
+        local_x = x.chunk(world_size, dim=1)[rank]
+
+        dist_out = muon_utils.newton_schulz(
+            local_x, steps=1, coefficient_type="simple", tp_group=torch.distributed.group.WORLD
+        )
+        ref_out = muon_utils.newton_schulz(x, steps=1, coefficient_type="simple")
+        torch.testing.assert_close(ref_out.chunk(world_size, dim=1)[rank], dist_out)
+
+    @parameterized.parameters(
+        {"shape": (32, 3), "transpose": True},
+        {"shape": (5, 100), "transpose": False},
+    )
+    def test_5steps_with_transpose_close_to_non_distributed(self, shape, transpose):
+        """Five `newton_schulz` steps match the reference, with and without `transpose`."""
+        x = torch.randint(-5, 5, shape, device="cpu", dtype=torch.float32)
+        # All-reduce ensures that every rank gets the same x
+        torch.distributed.all_reduce(x, op=torch.distributed.ReduceOp.SUM)
+
+        world_size = torch.distributed.get_world_size()
+        rank = torch.distributed.get_rank()
+
+        # The shard axis follows the transpose flag: rows when transposing, columns otherwise.
+        chunk_dim = 0 if transpose else 1
+        local_x = x.chunk(world_size, dim=chunk_dim)[rank]
+
+        dist_out = muon_utils.newton_schulz(
+            local_x, steps=5, tp_group=torch.distributed.group.WORLD, transpose=transpose
+        )
+        ref_out = muon_utils.newton_schulz(x, steps=5, transpose=transpose)
+        torch.testing.assert_close(ref_out.chunk(world_size, dim=chunk_dim)[rank], dist_out)
+
+    # tp_size goes up to 4; with fewer ranks num_tp_groups below can be 0 and np.split raises. The decorator is
+    # evaluated at class-definition time, before the __main__ guard initializes the process group, so the rank
+    # count is read from the WORLD_SIZE environment variable instead of torch.distributed.
+    @absltest.skipIf(int(os.environ.get("WORLD_SIZE", 1)) < 4, "test requires at least 4 ranks")
+    @parameterized.parameters(
+        {"shape": (32, 3), "transpose": True, "tp_size": 2},
+        {"shape": (5, 100), "transpose": False, "tp_size": 4},
+    )
+    def test_1step_with_partial_tp_close_to_non_distributed(self, shape, transpose, tp_size):
+        """A single `newton_schulz` step inside a subgroup of `tp_size` ranks matches the reference."""
+        x = torch.randint(-5, 5, shape, device="cpu", dtype=torch.float32)
+        # All-reduce ensures that every rank gets the same x
+        torch.distributed.all_reduce(x, op=torch.distributed.ReduceOp.SUM)
+
+        # Split the world into equally sized, contiguous rank ranges; tp_group is the one this rank belongs to.
+        num_tp_groups = torch.distributed.get_world_size() // tp_size
+        tp_group, _ = torch.distributed.new_subgroups_by_enumeration(
+            np.split(np.arange(torch.distributed.get_world_size()), num_tp_groups)
+        )
+        assert tp_group.size() == tp_size
+
+        # The shard axis follows the transpose flag: rows when transposing, columns otherwise.
+        chunk_dim = 0 if transpose else 1
+        local_x = x.chunk(tp_group.size(), dim=chunk_dim)[tp_group.rank()]
+
+        dist_out = muon_utils.newton_schulz(
+            local_x, steps=1, coefficient_type="simple", tp_group=tp_group, transpose=transpose
+        )
+        ref_out = muon_utils.newton_schulz(x, steps=1, coefficient_type="simple", transpose=transpose)
+        torch.testing.assert_close(ref_out.chunk(tp_group.size(), dim=chunk_dim)[tp_group.rank()], dist_out)
+
+
+class TestTensorParallelNewtonSchulz(parameterized.TestCase):
+    # Tests for muon_utils.newton_schulz_tp, using muon_utils.newton_schulz as the reference.
+    # The second test issues torch.distributed collectives, so the default process group must be initialized.
+
+    @parameterized.parameters(
+        {"shape": (21, 16)},
+        {"shape": (16, 32)},
+    )
+    def test_fall_back_to_non_tp(self, shape):
+        # With partition_dim=None and tp_group=None the output must be bit-identical to newton_schulz.
+        x = torch.randint(-5, 5, shape, device="cpu", dtype=torch.float32)
+
+        test_out = muon_utils.newton_schulz_tp(
+            x, steps=5, coefficient_type="quintic", partition_dim=None, tp_group=None
+        )
+        ref_out = muon_utils.newton_schulz(x, steps=5, coefficient_type="quintic")
+
+        # atol=0 and rtol=0 demand exact equality.
+        torch.testing.assert_close(test_out, ref_out, atol=0, rtol=0)
+
+    @parameterized.product(
+        shape=((20, 16), (16, 32)),
+        partition_dim=(0, 1),
+        mode=("distributed", "duplicated"),
+    )
+    def test_1step_close_to_non_distributed(self, shape, partition_dim, mode):
+        # One newton_schulz_tp step on this rank's shard must match the same shard of the full-tensor
+        # newton_schulz result, for every combination of shape, partition_dim and mode.
+        # The skip decision depends only on shape and world size, so all ranks take the same branch and
+        # none of them is left waiting in the all-reduce below.
+        if shape[partition_dim] % torch.distributed.get_world_size() != 0:
+            self.skipTest("Skipping because incompatible shape and world size")
+        x = torch.randint(-5, 5, shape, device="cpu", dtype=torch.float32)
+        # All-reduce ensures that every rank gets the same x
+        torch.distributed.all_reduce(x, op=torch.distributed.ReduceOp.SUM)
+
+        world_size = torch.distributed.get_world_size()
+        rank = torch.distributed.get_rank()
+        local_x = x.chunk(world_size, dim=partition_dim)[rank]
+
+        dist_out = muon_utils.newton_schulz_tp(
+            local_x,
+            steps=1,
+            coefficient_type="simple",
+            tp_group=torch.distributed.group.WORLD,
+            partition_dim=partition_dim,
+            mode=mode,
+        )
+
+        ref_out = muon_utils.newton_schulz(x, steps=1, coefficient_type="simple")
+
+        # Absolute tolerance only (rtol=0).
+        torch.testing.assert_close(ref_out.chunk(world_size, dim=partition_dim)[rank], dist_out, atol=1e-6, rtol=0)
+
+
+if __name__ == "__main__":
+    # The gloo backend lets the collectives in these tests run on CPU tensors.
+    torch.distributed.init_process_group(backend="gloo")
+    torch.set_float32_matmul_precision("highest")
+    try:
+        # absltest.main() finishes by calling sys.exit(), so a statement placed after it would never run.
+        absltest.main()
+    finally:
+        # Executed while SystemExit propagates, so the process group is torn down on every exit path.
+        torch.distributed.destroy_process_group()
diff --git a/tests/test_muon_utils.py b/tests/test_muon_utils.py
@@ -14,14 +14,13 @@
 # limitations under the License.
 import math
 
-from absl import logging
-from absl.testing import parameterized, absltest
-
 import torch
+from absl import logging
+from absl.testing import absltest, parameterized
 
-from llm_shower.orthogonalized_optimizers.muon_utils import newton_schulz, _COEFFICIENT_SETS
-from llm_shower.orthogonalized_optimizers.muon import Muon, get_muon_scale_factor
-from llm_shower import utils
+from emerging_optimizers import utils
+from emerging_optimizers.orthogonalized_optimizers.muon import Muon, get_muon_scale_factor
+from emerging_optimizers.orthogonalized_optimizers.muon_utils import _COEFFICIENT_SETS, newton_schulz
 
 
 def newton_schulz_ref(x: torch.Tensor, coefficient_sets: list[tuple[float, float, float]]) -> torch.Tensor:
diff --git a/tests/test_muon_utils_tp_cpu.py b/tests/test_muon_utils_tp_cpu.py
diff --git a/tests/test_orthogonalized_optimizer.py b/tests/test_orthogonalized_optimizer.py
diff --git a/tests/unit_tests/L0_Unit_Tests_CPU.sh b/tests/unit_tests/L0_Unit_Tests_CPU.sh
diff --git a/tests/unit_tests/L0_Unit_Tests_GPU.sh b/tests/unit_tests/L0_Unit_Tests_GPU.sh