Support PolarGrad nuclear-norm scaling in MOP and make it the default scale_mode

skyw · skyw · commit 0eaad00517ca · 2025-12-11T10:26:26.000-08:00
Signed-off-by: Hao Wu <skyw@nvidia.com>
diff --git a/emerging_optimizers/orthogonalized_optimizers/adaptive_muon.py b/emerging_optimizers/orthogonalized_optimizers/adaptive_muon.py
@@ -26,10 +26,10 @@
 
 from emerging_optimizers import mixin as opt_mixin
 from emerging_optimizers import utils
-from emerging_optimizers.orthogonalized_optimizers.muon import Muon
+from emerging_optimizers.orthogonalized_optimizers import muon
 
 
-class AdaptiveMuon(Muon):
+class AdaptiveMuon(muon.Muon):
     """Adaptive Muon optimizer with adaptive second moment (AdaMuon/NorMuon variants).
 
     This class extends Muon by adding AdamW-style or NorMuon-style second moment
@@ -68,7 +68,7 @@ def __init__(
         fp32_matmul_prec: str,
         coefficient_type: str = "quintic",
         num_ns_steps: int = 5,
-        scale_mode: str = "spectral",
+        scale_mode: muon.MuonScaleT = "spectral",
         extra_scale_factor: float = 1.0,
         use_syrk: bool = False,
         moment2_method: Literal["adamuon", "normuon"] = "adamuon",
diff --git a/emerging_optimizers/orthogonalized_optimizers/mop.py b/emerging_optimizers/orthogonalized_optimizers/mop.py
@@ -14,7 +14,7 @@
 # limitations under the License.
 
 
-from typing import Optional
+from typing import Literal, Optional
 
 import torch
 from torch.optim.optimizer import ParamsT
@@ -36,7 +36,7 @@ class MOP(OrthogonalizedOptimizer):
 
     Args:
         {_args_doc}
-        scale_mode: The type of scale factor to use for the update. Defaults to "spectral" style scaling.
+        scale_mode: The type of scale factor to use for the update. Defaults to "nuclear_norm" style scaling.
         extra_scale_factor: The additional scale factor to use for the update.
     """
 
@@ -50,21 +50,25 @@ def __init__(
         use_nesterov: bool = False,
         weight_decay_method: WeightDecayT = "decoupled",
         fp32_matmul_prec: str = "highest",
-        scale_mode: str = "spectral",
+        scale_mode: muon.MuonScaleT | Literal["nuclear_norm"] = "nuclear_norm",
         extra_scale_factor: float = 1.0,
     ) -> None:
         def scaled_orthogonalize_fn(grad: torch.Tensor) -> torch.Tensor:
-            orth_grad, _ = polar_via_svd(grad, False)
+            orth_grad, _, S = polar_via_svd(grad, False)
 
-            scale_factor = muon.get_muon_scale_factor(grad.size(-2), grad.size(-1), mode=scale_mode)
+            if scale_mode != "nuclear_norm":
+                scale_factor = muon.get_muon_scale_factor(grad.size(-2), grad.size(-1), mode=scale_mode)
+            else:
+                # nuclear norm scaling suggested by PolarGrad paper (https://arxiv.org/pdf/2505.21799)
+                scale_factor = S.sum().sqrt()
             return orth_grad * scale_factor * extra_scale_factor
 
         super().__init__(
             params,
             lr,
             momentum_beta,
+            weight_decay,
             use_nesterov=use_nesterov,
-            weight_decay=weight_decay,
             weight_decay_method=weight_decay_method,
             fp32_matmul_prec=fp32_matmul_prec,
             scaled_orthogonalize_fn=scaled_orthogonalize_fn,
@@ -74,7 +78,9 @@ def scaled_orthogonalize_fn(grad: torch.Tensor) -> torch.Tensor:
 MOP.__doc__ = MOP.__doc__.format(_args_doc=_args_doc)  # type: ignore[union-attr]
 
 
-def polar_via_svd(A: torch.Tensor, return_p: bool = False) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+def polar_via_svd(
+    A: torch.Tensor, return_p: bool = False
+) -> tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor]:
     """Compute polar decomposition via SVD
 
     Args:
@@ -87,12 +93,13 @@ def polar_via_svd(A: torch.Tensor, return_p: bool = False) -> tuple[torch.Tensor
         A tuple containing:
             - The unitary part of the polar decomposition.
             - The positive-semidefinite part of the polar decomposition, if return_p is True.
+            - The singular values of the input tensor.
     """
     U_svd, S, Vh = torch.linalg.svd(A, full_matrices=False)
     U_polar = U_svd @ Vh
 
     if not return_p:
-        return U_polar, None
+        return U_polar, None, S
     else:
         p = Vh.mH @ torch.diag(S) @ Vh
-        return U_polar, p
+        return U_polar, p, S
diff --git a/emerging_optimizers/orthogonalized_optimizers/muon.py b/emerging_optimizers/orthogonalized_optimizers/muon.py
@@ -13,6 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from typing import Literal
+
 import torch
 from absl import logging
 from torch.optim.optimizer import ParamsT
@@ -23,6 +25,9 @@
 from emerging_optimizers.orthogonalized_optimizers.orthogonalized_optimizer import OrthogonalizedOptimizer, _args_doc
 
 
+MuonScaleT = Literal["shape_scaling", "spectral", "unit_rms_norm"]
+
+
 class Muon(OrthogonalizedOptimizer):
     """Muon: MomentUm Orthogonalized by Newton-schulz
 
@@ -72,7 +77,7 @@ def __init__(
         fp32_matmul_prec: str = "medium",
         coefficient_type: str = "quintic",
         num_ns_steps: int = 5,
-        scale_mode: str = "spectral",
+        scale_mode: MuonScaleT = "spectral",
         extra_scale_factor: float = 1.0,
         use_syrk: bool = False,
     ) -> None:
@@ -122,7 +127,7 @@ def scaled_orthogonalize_fn(grad: torch.Tensor) -> torch.Tensor:
 Muon.__doc__ = Muon.__doc__.format(_args_doc=_args_doc)  # type: ignore[union-attr]
 
 
-def get_muon_scale_factor(size_out: int, size_in: int, mode: str = "spectral") -> float:
+def get_muon_scale_factor(size_out: int, size_in: int, mode: MuonScaleT = "spectral") -> float:
     """Get the scale for the update.
 
     Default mode is "spectral", which is the mode that allows for learning rate transferability from AdamW.
diff --git a/tests/test_orthogonalized_optimizer.py b/tests/test_orthogonalized_optimizer.py
@@ -254,16 +254,18 @@ class MopTest(parameterized.TestCase):
         shape=[(5, 7), (33, 65), (127, 257)],
         weight_decay_method=["decoupled", "independent"],
         use_nesterov=[True, False],
-        extra_scale_factor=[1.0, 2.0],
+        scale_mode=["spectral", "nuclear_norm"],
+        extra_scale_factor=[1.0, 0.2],
     )
-    def test_smoke(self, shape, weight_decay_method, use_nesterov, extra_scale_factor) -> None:
+    def test_smoke(self, shape, weight_decay_method, use_nesterov, scale_mode, extra_scale_factor) -> None:
         test_param = nn.Parameter(torch.randint(-5, 5, shape, dtype=torch.float32, device="cuda"))
         test_param.grad = torch.randint_like(test_param, -5, 5)
 
         mop_opt = mop.MOP(
             [test_param],
             weight_decay_method=weight_decay_method,
             use_nesterov=use_nesterov,
+            scale_mode=scale_mode,
             extra_scale_factor=extra_scale_factor,
         )
         mop_opt.step()