Add Mop (#82)

skyw · web-flow · commit 255bfc66976c · 2025-12-09T22:57:00.000Z
* add mop

Signed-off-by: Hao Wu <skyw@nvidia.com>
diff --git a/docs/apidocs/orthogonalized-optimizers.md b/docs/apidocs/orthogonalized-optimizers.md
@@ -27,6 +27,12 @@ emerging_optimizers.orthogonalized_optimizers
 .. autoclass:: Scion
     :members:
 
+:hidden:`MOP`
+~~~~~~~~~~~~~~~
+
+.. autoclass:: MOP
+    :members:
+
 
 :hidden:`Newton-Schulz`
 ~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/emerging_optimizers/orthogonalized_optimizers/__init__.py b/emerging_optimizers/orthogonalized_optimizers/__init__.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from emerging_optimizers.orthogonalized_optimizers.adaptive_muon import *
+from emerging_optimizers.orthogonalized_optimizers.mop import *
 from emerging_optimizers.orthogonalized_optimizers.muon import *
 from emerging_optimizers.orthogonalized_optimizers.orthogonalized_optimizer import *
 from emerging_optimizers.orthogonalized_optimizers.scion import *
diff --git a/emerging_optimizers/orthogonalized_optimizers/mop.py b/emerging_optimizers/orthogonalized_optimizers/mop.py
@@ -0,0 +1,98 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import Optional
+
+import torch
+from torch.optim.optimizer import ParamsT
+
+from emerging_optimizers.mixin import WeightDecayT
+from emerging_optimizers.orthogonalized_optimizers import muon
+from emerging_optimizers.orthogonalized_optimizers.orthogonalized_optimizer import OrthogonalizedOptimizer, _args_doc
+
+
+__all__ = ["MOP"]
+
+
+class MOP(OrthogonalizedOptimizer):
+    """MOP: Momentum Orthogonalized by Polar decomposition
+
+    Each update tensor is replaced by the unitary factor of its polar decomposition (via SVD) and then rescaled.
+
+    warning:
+        This optimizer is experimental and not yet thoroughly tested.
+
+    Args:
+        {_args_doc}
+        scale_mode: Mode forwarded to ``muon.get_muon_scale_factor`` to scale the update. Defaults to "spectral".
+        extra_scale_factor: Extra multiplier applied to the update after the ``scale_mode`` scaling.
+    """
+
+    def __init__(
+        self,
+        params: ParamsT,
+        lr: float = 3e-4,
+        momentum_beta: float = 0.95,
+        weight_decay: float = 0.01,
+        *,
+        use_nesterov: bool = False,
+        weight_decay_method: WeightDecayT = "decoupled",
+        fp32_matmul_prec: str = "highest",
+        scale_mode: str = "spectral",
+        extra_scale_factor: float = 1.0,
+    ) -> None:
+        def scaled_orthogonalize_fn(grad: torch.Tensor) -> torch.Tensor:
+            polar_factor, _ = polar_via_svd(grad, return_p=False)
+            rescaled = polar_factor * muon.get_muon_scale_factor(grad.size(-2), grad.size(-1), mode=scale_mode)
+            return rescaled * extra_scale_factor
+
+        super().__init__(
+            params,
+            lr,
+            momentum_beta,
+            scaled_orthogonalize_fn=scaled_orthogonalize_fn,
+            fp32_matmul_prec=fp32_matmul_prec,
+            weight_decay_method=weight_decay_method,
+            weight_decay=weight_decay,
+            use_nesterov=use_nesterov,
+        )
+
+
+MOP.__doc__ = MOP.__doc__.format(_args_doc=_args_doc)  # type: ignore[union-attr]
+
+
+def polar_via_svd(A: torch.Tensor, return_p: bool = False) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+    """Compute the polar decomposition ``A = U @ P`` via SVD.
+
+    Args:
+        A: The input tensor of shape ``(..., m, n)`` to decompose; leading dimensions are treated as batch.
+        return_p: Whether to return the positive-semidefinite part of the polar decomposition. p is not needed
+            by the MOP optimizer, so by default it is not calculated to save computation. The option is provided to
+            return full polar decomposition to match the function name.
+
+    Returns:
+        A tuple containing:
+            - The unitary part of the polar decomposition, ``U_svd @ Vh``.
+            - The positive-semidefinite part, ``Vh.mH @ diag(S) @ Vh``, if return_p is True; otherwise None.
+    """
+    U_svd, S, Vh = torch.linalg.svd(A, full_matrices=False)
+    U_polar = U_svd @ Vh
+
+    if not return_p:
+        return U_polar, None
+    # Scale V's columns by S through broadcasting: equal to V @ diag(S), and valid for batched input too.
+    p = (Vh.mH * S.unsqueeze(-2)) @ Vh
+    return U_polar, p
diff --git a/tests/test_orthogonalized_optimizer.py b/tests/test_orthogonalized_optimizer.py
@@ -17,7 +17,7 @@
 import torch.nn as nn
 from absl.testing import absltest, parameterized
 
-from emerging_optimizers.orthogonalized_optimizers import muon, scion
+from emerging_optimizers.orthogonalized_optimizers import mop, muon, scion
 from emerging_optimizers.orthogonalized_optimizers.orthogonalized_optimizer import OrthogonalizedOptimizer
 
 
@@ -249,5 +249,25 @@ def test_smoke(self, shape) -> None:
         scion_opt.step()
 
 
+class MopTest(parameterized.TestCase):
+    @parameterized.product(
+        shape=[(5, 7), (33, 65), (127, 257)],
+        weight_decay_method=["decoupled", "independent"],
+        use_nesterov=[True, False],
+        extra_scale_factor=[1.0, 2.0],
+    )
+    def test_smoke(self, shape, weight_decay_method, use_nesterov, extra_scale_factor) -> None:
+        """Run one MOP step on an integer-valued CUDA parameter; passes if nothing raises."""
+        weight = nn.Parameter(torch.randint(-5, 5, shape, dtype=torch.float32, device="cuda"))
+        weight.grad = torch.randint_like(weight, -5, 5)
+        optimizer_kwargs = dict(
+            weight_decay_method=weight_decay_method,
+            use_nesterov=use_nesterov,
+            extra_scale_factor=extra_scale_factor,
+        )
+        optimizer = mop.MOP([weight], **optimizer_kwargs)
+        optimizer.step()
+
+
 if __name__ == "__main__":
     absltest.main()