Commit 0408134

Merge branch 'main' into main
2 parents e5a78dd + 06d0bca

33 files changed, +3120 -356 lines changed

.github/workflows/cicd-main.yml

Lines changed: 60 additions & 5 deletions

@@ -29,10 +29,16 @@ permissions:
   contents: read

 jobs:
+  pre-flight:
+    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@main

   cicd-wait-in-queue:
+    needs: [pre-flight]
     runs-on: ubuntu-latest
     environment: test
+    if: |
+      needs.pre-flight.outputs.is_ci_workload == 'false'
+      && needs.pre-flight.outputs.docs_only == 'false'
     steps:
       - name: Running CI tests
         run: |
@@ -62,10 +68,17 @@ jobs:
           runner: linux-amd64-cpu16
           timeout: 30
           cpu-only: true
-    needs: [cicd-container-build]
+    needs: [pre-flight, cicd-container-build]
     runs-on: ${{ matrix.runner }}
     name: ${{ matrix.script }}
     environment: nemo-ci
+    if: |
+      (
+        success()
+        || needs.pre-flight.outputs.is_ci_workload == 'true'
+        || needs.pre-flight.outputs.force_run_all == 'true'
+      )
+      && !cancelled()
     steps:
       - name: Checkout
         uses: actions/checkout@v4
@@ -85,9 +98,17 @@ jobs:

   Nemo_CICD_Test:
     needs:
+      - pre-flight
       - cicd-container-build
       - cicd-unit-tests
-    if: always()
+    if: |
+      (
+        needs.pre-flight.outputs.docs_only == 'true'
+        || needs.pre-flight.outputs.is_deployment_workflow == 'true'
+        || needs.pre-flight.outputs.is_ci_workload == 'true'
+        || always()
+      )
+      && !cancelled()
     runs-on: ubuntu-latest
     permissions: write-all
     steps:
@@ -99,13 +120,15 @@ jobs:
         env:
           GH_TOKEN: ${{ github.token }}
           RUN_ID: ${{ github.run_id }}
+          DOCS_ONLY: ${{ needs.pre-flight.outputs.docs_only }}
+          IS_DEPLOYMENT: ${{ needs.pre-flight.outputs.is_deployment_workflow }}
+          IS_CI_WORKLOAD: ${{ needs.pre-flight.outputs.is_ci_workload }}
         run: |
           # Get workflow run details and check job conclusions
-          LATEST_ATTEMPT=$(gh run view $RUN_ID --json jobs -q '[.jobs[] | select(.conclusion != null) | .conclusion] | last')
           NUM_FAILED=$(gh run view $RUN_ID --json jobs -q '[.jobs[] | select(.conclusion == "failure") | .name] | length')
           NUM_CANCELLED=$(gh run view $RUN_ID --json jobs -q '[.jobs[] | select(.conclusion == "cancelled") | .name] | length')

-          if [[ $NUM_FAILED -eq 0 && $NUM_CANCELLED -eq 0 ]]; then
+          if [[ ($NUM_FAILED -eq 0 && $NUM_CANCELLED -eq 0) || $DOCS_ONLY == 'true' || $IS_DEPLOYMENT == 'true' || $IS_CI_WORKLOAD == 'true' ]]; then
            RESULT="success"
           elif [[ $NUM_CANCELLED -gt 0 ]]; then
            RESULT="cancelled"
@@ -180,9 +203,41 @@ jobs:
            exit 1
           fi

+  Coverage_Fake:
+    runs-on: ubuntu-latest
+    needs: [Nemo_CICD_Test, pre-flight]
+    if: |
+      (
+        needs.pre-flight.outputs.docs_only == 'true'
+        || needs.pre-flight.outputs.is_deployment_workflow == 'true'
+      )
+      && needs.pre-flight.outputs.is_ci_workload == 'false'
+      && !cancelled()
+    environment: nemo-ci
+    steps:
+      - name: Generate fake coverage report
+        uses: actions/github-script@v6
+        with:
+          github-token: ${{ secrets.PAT }}
+          script: |
+            await github.rest.repos.createCommitStatus({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              sha: context.sha,
+              state: 'success',
+              description: 'No code changes - coverage check skipped',
+              context: 'codecov/patch'
+            });

   Coverage:
     runs-on: ubuntu-latest
-    needs: [Nemo_CICD_Test]
+    needs: [pre-flight, Nemo_CICD_Test]
+    if: |
+      (
+        (needs.pre-flight.outputs.is_ci_workload == 'true' && !failure())
+        || success()
+      )
+      && !cancelled()
     strategy:
       matrix:
         flag: [unit-test]
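
As a reading aid (not part of the commit): the pre-flight job exposes three outputs (docs_only, is_deployment_workflow, is_ci_workload) that feed both the job-level `if:` expressions and the status-classification script in Nemo_CICD_Test. A minimal Python sketch of that classification logic follows; the function name is invented, and the final "failure" fallback is an assumption, since the hunk only shows the "success" and "cancelled" branches:

def classify_run(num_failed: int, num_cancelled: int,
                 docs_only: bool, is_deployment: bool, is_ci_workload: bool) -> str:
    # Docs-only, deployment, and CI-workload runs report success regardless of
    # individual job conclusions, mirroring the updated bash condition above.
    if (num_failed == 0 and num_cancelled == 0) or docs_only or is_deployment or is_ci_workload:
        return "success"
    if num_cancelled > 0:
        return "cancelled"
    return "failure"  # assumed fallback; not shown in the hunk

# A docs-only run with failed jobs still reports success:
assert classify_run(2, 0, docs_only=True, is_deployment=False, is_ci_workload=False) == "success"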

README.md

Lines changed: 1 addition & 1 deletion

@@ -26,7 +26,7 @@ Emerging optimizers have demonstrated significant practical impact in large-scal

 ### Prerequisites

-- Python 3.12 or higher
+- Python 3.10 or higher, 3.12 is recommended
 - PyTorch 2.0 or higher

 ### Install from Source

docs/apidocs/soap.md

Lines changed: 6 additions & 0 deletions

@@ -21,4 +21,10 @@ emerging_optimizers.soap
 .. autofunction:: update_kronecker_factors

 .. autofunction:: update_eigenbasis_and_momentum
+
+emerging_optimizers.soap.soap_utils
+=====================================
+
+.. automodule:: emerging_optimizers.soap.soap_utils
+   :members:
 ```

docs/conf.py

Lines changed: 1 addition & 0 deletions

@@ -72,6 +72,7 @@
     "numpy": ("https://numpy.org/doc/stable", None),
     "torch": ("https://pytorch.org/docs/2.5", None),
 }
+autodoc_typehints = "description"


 def linkcode_resolve(domain, info):

emerging_optimizers/orthogonalized_optimizers/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -14,3 +14,4 @@
 # limitations under the License.
 from emerging_optimizers.orthogonalized_optimizers.muon import *
 from emerging_optimizers.orthogonalized_optimizers.orthogonalized_optimizer import *
+from emerging_optimizers.orthogonalized_optimizers.spectral_clipping_utils import *

emerging_optimizers/orthogonalized_optimizers/muon.py

Lines changed: 8 additions & 4 deletions

@@ -37,9 +37,12 @@ class Muon(OrthogonalizedOptimizer):
     optimization via Frank-Wolfe.

     References:
-        - Jordan, K. *Muon Optimizer Implementation.* [`GitHub <https://github.com/KellerJordan/Muon/blob/master/muon.py>`_]
-        - *Modular Duality in Deep Learning.* arXiv:2410.21265 (2024). [`arXiv:2410.21265 <https://arxiv.org/abs/2410.21265>`_]
-        - *Training Deep Learning Models with Norm-Constrained LMOs.* arXiv:2502.07529 (2025). [`arXiv:2502.07529 <https://arxiv.org/abs/2502.07529>`_]
+        - Jordan, K. *Muon Optimizer Implementation.*
+          [`GitHub <https://github.com/KellerJordan/Muon/blob/master/muon.py>`_]
+        - *Modular Duality in Deep Learning.* arXiv:2410.21265 (2024).
+          [`arXiv:2410.21265 <https://arxiv.org/abs/2410.21265>`_]
+        - *Training Deep Learning Models with Norm-Constrained LMOs.* arXiv:2502.07529 (2025).
+          [`arXiv:2502.07529 <https://arxiv.org/abs/2502.07529>`_]

     Warning:
         - This optimizer requires that all parameters passed in are 2D.
@@ -122,7 +125,8 @@ def get_muon_scale_factor(
         # Suggested by K. Jordan and Kimi (https://arxiv.org/abs/2502.16982)
         return extra_scale_factor * max(size_out, size_in) ** 0.5
     elif mode == "unit_rms_norm":
-        # Suggested by Scion (https://arxiv.org/abs/2502.07529) and Bernstein et al. (https://jeremybernste.in/writing/deriving-muon)
+        # Suggested by Scion (https://arxiv.org/abs/2502.07529) and Bernstein et al.
+        # (https://jeremybernste.in/writing/deriving-muon)
         return extra_scale_factor * (size_out / size_in) ** 0.5
     else:
         raise ValueError(f"Invalid mode for Muon update scale factor: {mode}")
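
For reference, the two branches around the re-wrapped comment differ only in their scale formulas. A tiny illustration with hypothetical layer sizes, mirroring the returned expressions rather than calling the library:

size_out, size_in = 4096, 1024
jordan_kimi_scale = max(size_out, size_in) ** 0.5  # first branch above: sqrt(4096) = 64.0
unit_rms_scale = (size_out / size_in) ** 0.5       # "unit_rms_norm" branch: sqrt(4) = 2.0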

emerging_optimizers/orthogonalized_optimizers/orthogonalized_optimizer.py

Lines changed: 12 additions & 3 deletions

@@ -12,7 +12,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Any, Callable, override
+from typing import Any, Callable
+
+
+# TODO(@boxiangw): remove this once bump to python 3.12
+try:
+    from typing import override
+except ImportError:
+    from typing_extensions import override

 import torch
 import torch.optim as optim
@@ -45,9 +52,11 @@ class OrthogonalizedOptimizer(optim.Optimizer):

     - Carlson, D., Cevher, V., and Carin, L. *Stochastic spectral descent for Restricted Boltzmann Machines.*
       In International Conference on Artificial Intelligence and Statistics (2015a).
-    - Carlson, D., Hsieh, Y.-P., Collins, E., Carin, L., and Cevher, V. *Stochastic Spectral Descent for Discrete Graphical Models.*
+    - Carlson, D., Hsieh, Y.-P., Collins, E., Carin, L., and Cevher, V.
+      *Stochastic Spectral Descent for Discrete Graphical Models.*
       In IEEE Journal of Selected Topics in Signal Processing, vol. 10, no. 2, pp. 296-311 (2016).
-    - Carlson, D., Collins, E., Hsieh, Y.-P., Carin, L., and Cevher, V. *Preconditioned spectral descent for deep learning.*
+    - Carlson, D., Collins, E., Hsieh, Y.-P., Carin, L., and Cevher, V.
+      *Preconditioned spectral descent for deep learning.*
       In Neural Information Processing Systems (2015b).
     - Flynn, T. *The duality structure gradient descent algorithm: analysis and applications to neural networks.*
       arXiv preprint arXiv:1708.00523 (2017). [`arXiv:1708.00523 <https://arxiv.org/abs/1708.00523>`_]
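
The try/except import added above is the standard backport pattern: typing.override exists only on Python 3.12+, so older interpreters fall back to typing_extensions. A minimal sketch of how the decorator is then used (illustrative class names, not from this repository):

try:
    from typing import override  # Python >= 3.12
except ImportError:
    from typing_extensions import override  # backport for older interpreters


class Base:
    def step(self) -> None: ...


class Child(Base):
    @override  # a type checker flags this if Base.step is renamed or removed
    def step(self) -> None: ...
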
emerging_optimizers/orthogonalized_optimizers/spectral_clipping_utils.py (new file; the filename is inferred from the import added to __init__.py above)

Lines changed: 98 additions & 0 deletions

@@ -0,0 +1,98 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+
+from emerging_optimizers.orthogonalized_optimizers.muon_utils import newton_schulz
+
+
+__all__ = ["spectral_hardcap", "spectral_clip"]
+
+
+def spectral_clip(X: torch.Tensor, sigma_min: float = -1.0, sigma_max: float = 1.0) -> torch.Tensor:
+    r"""Applies spectral clipping to the input tensor.
+
+    Builds on the idea that clipping can be written using the sign function, extended to singular values
+    of matrices via the matrix sign function, computed using Newton-Schulz iteration for efficiency.
+
+    Based on https://leloykun.github.io/ponder/spectral-clipping/.
+
+    Args:
+        X: The input tensor.
+        sigma_min: The minimum singular value.
+        sigma_max: The maximum singular value.
+
+    Returns:
+        The spectrally clipped tensor.
+    """
+    if needs_transpose := X.shape[0] > X.shape[1]:
+        X = X.T
+    OX = newton_schulz(X, steps=8, coefficient_type="polar_express")
+    result = (sigma_min + sigma_max) * OX
+    identity_matrix = torch.eye(X.shape[0], device=X.device, dtype=X.dtype)
+    for s, sign in zip([sigma_min, sigma_max], [1, -1]):
+        A = torch.addmm(s * identity_matrix, OX, X.T, beta=1.0, alpha=-1.0)
+        B = torch.add(s * OX, X, alpha=-1)
+        result = torch.addmm(result, newton_schulz(A, steps=8, coefficient_type="polar_express"), B, alpha=sign)
+    result = result * 0.5
+
+    if needs_transpose:
+        result = result.T
+    return result
+
+
+def spectral_hardcap(X: torch.Tensor, beta: float = 1.0) -> torch.Tensor:
+    r"""Spectral hardcap function clips singular values from above to be less than beta.
+
+    Simplifies the spectral clipping function to just an upper bound, resulting in a hardcap.
+    Based on https://leloykun.github.io/ponder/spectral-clipping/.
+
+    Args:
+        X: The input tensor.
+        beta: The upper bound on the singular values.
+
+    Returns:
+        The spectrally hardcapped tensor.
+
+    """
+    if needs_transpose := X.shape[0] > X.shape[1]:
+        X = X.T
+    OX = newton_schulz(X, steps=8, coefficient_type="polar_express")
+    aX = torch.add(beta * OX, X, alpha=-1)
+    result = torch.add(beta * OX, X)
+    result = torch.addmm(
+        result, aX, torch.mm(newton_schulz(aX, steps=8, coefficient_type="polar_express").T, OX), alpha=-1
+    )
+    result = result * 0.5
+    if needs_transpose:
+        result = result.T
+    return result
+
+
+def spectral_clipped_weight_decay(X: torch.Tensor, beta: float = 1.0, c: float = 0.5) -> torch.Tensor:
+    r"""Applies weight decay to the input tensor while applying spectral hardcapping.
+
+    This is the spectral version of Euclidean decoupled weight decay (Hanson & Pratt, 1988).
+
+    Based on https://leloykun.github.io/ponder/spectral-clipping/.
+
+    Args:
+        X: The input tensor.
+        beta: The upper bound on the singular values.
+        c: The coefficient parameter.
+
+    Returns:
+        The spectrally clipped, weight-decayed tensor.
+    """
+    return torch.add((1 - c) * X, spectral_hardcap(X, beta), alpha=c)
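
A quick sanity check of the new utilities (a hedged sketch, assuming the package and its newton_schulz dependency are installed; the star import added to __init__.py above makes spectral_hardcap importable from the subpackage):

import torch

from emerging_optimizers.orthogonalized_optimizers import spectral_hardcap

X = 3.0 * torch.randn(256, 512)
Y = spectral_hardcap(X, beta=1.0)
print(torch.linalg.svdvals(X).max())  # typically far above 1
print(torch.linalg.svdvals(Y).max())  # approximately 1 or below; Newton-Schulz is approximate
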
New file (its path, somewhere under the emerging_optimizers.psgd module, is not shown in this view)

Lines changed: 62 additions & 0 deletions

@@ -0,0 +1,62 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+
+import emerging_optimizers.utils as utils
+from emerging_optimizers.psgd.psgd_utils import norm_lower_bound_skew
+
+
+__all__ = [
+    "procrustes_step",
+]
+
+
+@torch.compile  # type: ignore[misc]
+def procrustes_step(Q: torch.Tensor, max_step_size: float = 0.125, eps: float = 1e-8) -> torch.Tensor:
+    r"""One step of an online solver for the orthogonal Procrustes problem.
+
+    The orthogonal Procrustes problem :math:`\min_U \| U Q - I \|_F` s.t. :math:`U^H U = I` is solved
+    approximately by rotating Q as :math:`\exp(a R) Q`, where :math:`R = Q^H - Q` is the generator and :math:`\|a R\| < 1`.
+
+    `max_step_size` should be less than :math:`1/4` as we only expand :math:`\exp(a R)` to its 2nd order term.
+
+    This method is a second order expansion of a Lie algebra parametrized rotation that
+    uses a simple approximate line search to find the optimal step size, from Xi-Lin Li.
+
+    Args:
+        Q: Tensor of shape (n, n), general square matrix to orthogonalize.
+        max_step_size: Maximum step size for the line search. Default is 1/8 (0.125).
+        eps: Small number for numerical stability.
+    """
+    # Note: this function is written in fp32 to avoid numerical instability while computing the Taylor expansion of the exponential map
+    with utils.fp32_matmul_precision("highest"):
+        R = Q.T - Q
+        R /= torch.clamp(norm_lower_bound_skew(R), min=eps)
+        RQ = R @ Q
+        # trace of RQ is always non-negative, since
+        # tr(RQ) = tr((Q^T - Q) Q) = ||Q||_F^2 - tr(Q^2), and tr(Q^2) = <Q^T, Q>_F <= ||Q||_F^2 by Cauchy-Schwarz
+        tr_RQ = torch.trace(RQ)
+        RRQ = R @ RQ
+        tr_RRQ = torch.trace(RRQ)
+        # clip step size to max_step_size, based on a 2nd order expansion.
+        _step_size = torch.clamp(-tr_RQ / tr_RRQ, min=0, max=max_step_size)
+        # If tr_RRQ >= 0, the quadratic approximation is not concave; fall back to max_step_size.
+        step_size = torch.where(tr_RRQ < 0, _step_size, max_step_size)
+        # rotate Q as exp(a R) Q ~ (I + a R + a^2 R^2/2) Q with an optimal step size by line search;
+        # for the 2nd order expansion, only expand exp(a R) to its 2nd term.
+        # Equivalent to: Q += step_size * (RQ + 0.5 * step_size * RRQ)
+        Q = torch.add(Q, torch.add(RQ, RRQ, alpha=0.5 * step_size), alpha=step_size)
+
+    return Q
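
Since :math:`R = Q^H - Q` vanishes exactly when Q is symmetric, repeated calls should rotate Q toward its symmetric polar factor while approximately preserving its singular values (the rotation is truncated at 2nd order). A hedged usage sketch; the import path is an assumption, as the new file's location is not shown in this view:

import torch

from emerging_optimizers.psgd import procrustes_step  # assumed import path

Q = torch.randn(32, 32)
for _ in range(200):
    Q = procrustes_step(Q)
print(torch.linalg.norm(Q.T - Q))  # the skew part should shrink toward 0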
