+import math
import os
from typing import List, Optional



class Muon(BaseOptimizer):
-    r"""MomentUm Orthogonalized by Newton-schulz.
+    r"""Momentum Orthogonalized by Newton-schulz.

    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-processing step, in which
    each 2D parameter's update is replaced with the nearest orthogonal matrix. To efficiently orthogonalize each
    update, we use a Newton-Schulz iteration, which has the advantage that it can be stably run in bfloat16 on the GPU.

+    Muon is intended to optimize only the internal ≥2D parameters of a network. Embeddings, classifier heads, and
+    scalar or vector parameters should be optimized using AdamW.
+
    Some warnings:
    - We believe this optimizer is unlikely to work well for training with small batch size.
    - We believe it may not work well for fine-tuning pretrained models, but we haven't tested this.

    :param params: PARAMETERS. the parameters to be optimized by Muon.
    :param lr: float. learning rate.
    :param momentum: float. the momentum used by the internal SGD.
+    :param weight_decay: float. weight decay (L2 penalty).
+    :param weight_decouple: bool. the optimizer uses decoupled weight decay as in AdamW.
    :param betas: The betas for the internal AdamW.
    :param nesterov: bool. whether to use nesterov momentum.
-    :param ns_steps: int. the number of Newton-Schulz iterations to run. (6 is probably always enough)
-    :param adamw_params: The parameters to be optimized by AdamW. Any parameters in `muon_params` which are {0, 1}-D or
-        are detected as being the embed or lm_head will be optimized by AdamW as well.
-    :param adamw_lr: The learning rate for the internal AdamW.
-    :param adamw_wd: The weight decay for the internal AdamW.
-    :param adamw_eps: The epsilon for the internal AdamW.
+    :param ns_steps: int. the number of Newton-Schulz iterations to run. (5 is probably always enough)
+    :param use_adjusted_lr: bool. whether to use the adjusted learning rate from Moonlight.
+        reference: https://github.com/MoonshotAI/Moonlight/blob/master/examples/toy_train.py
+    :param adamw_params: Optional[PARAMETERS]. The parameters to be optimized by AdamW. Any parameters in `muon_params`
+        which are {0, 1}-D or are detected as being the embed or lm_head will be optimized by AdamW as well. It'd be
+        better to create a separate AdamW optimizer instead of using this.
+    :param adamw_lr: float. The learning rate for the internal AdamW.
+    :param adamw_wd: float. The weight decay for the internal AdamW.
+    :param adamw_eps: float. The epsilon for the internal AdamW.
    """

    def __init__(
        self,
        params: PARAMETERS,
        lr: float = 2e-2,
        momentum: float = 0.95,
-        betas: BETAS = (0.95, 0.95),
+        weight_decay: float = 1e-2,
+        weight_decouple: bool = True,
+        betas: BETAS = (0.9, 0.95),
        nesterov: bool = True,
-        ns_steps: int = 6,
+        ns_steps: int = 5,
+        use_adjusted_lr: bool = False,
        adamw_params: Optional[PARAMETERS] = None,
        adamw_lr: float = 3e-4,
-        adamw_wd: float = 0,
+        adamw_wd: float = 0.0,
        adamw_eps: float = 1e-8,
        **kwargs,
    ):
        self.validate_learning_rate(lr)
        self.validate_learning_rate(adamw_lr)
+        self.validate_non_negative(weight_decay, 'weight_decay')
        self.validate_range(momentum, 'momentum', 0.0, 1.0, range_type='[)')
        self.validate_positive(ns_steps, 'ns_steps')
        self.validate_betas(betas)
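As the docstring notes, Muon is meant only for the internal ≥2D weight matrices, with everything else handed to AdamW. A minimal, hypothetical usage sketch of that split (the module shapes, hyperparameter values, and import path below are illustrative, not taken from this commit):

```python
import torch
from torch import nn

from pytorch_optimizer import Muon  # assumed import path for the class in this diff

# Hypothetical toy model: an embedding, a hidden linear layer, and a classifier head.
embed = nn.Embedding(1000, 64)
hidden = nn.Linear(64, 64)
head = nn.Linear(64, 10)

# Internal >=2D weights go to Muon; the embedding, the head, and all 0/1-D
# parameters (e.g. biases) go to a separate AdamW, as recommended above.
muon_params = [hidden.weight]
adamw_params = list(embed.parameters()) + list(head.parameters()) + [hidden.bias]

optimizer = Muon(muon_params, lr=2e-2, weight_decay=1e-2)
adamw = torch.optim.AdamW(adamw_params, lr=3e-4, weight_decay=0.0)
```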
@@ -66,8 +79,11 @@ def __init__(
        defaults: DEFAULTS = {
            'lr': lr,
            'momentum': momentum,
+            'weight_decay': weight_decay,
+            'weight_decouple': weight_decouple,
            'nesterov': nesterov,
            'ns_steps': ns_steps,
+            'use_adjusted_lr': use_adjusted_lr,
            'adamw_lr': adamw_lr,
            'adamw_lr_ratio': adamw_lr / lr,
            'adamw_betas': betas,
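The defaults also store `adamw_lr_ratio = adamw_lr / lr`. Presumably this lets the internal AdamW branch follow any external learning-rate schedule applied to `group['lr']`; a hedged sketch of that idea (the helper below is hypothetical, and the actual consumption point in `step()` is outside this diff):

```python
def internal_adamw_lr(group: dict) -> float:
    # Hypothetical helper: if a scheduler rescales group['lr'], the AdamW branch's
    # learning rate is rescaled by the same factor via the stored ratio.
    return group['adamw_lr_ratio'] * group['lr']
```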
@@ -114,6 +130,11 @@ def reset(self):
                state['moment1'] = torch.zeros_like(p)
                state['moment2'] = torch.zeros_like(p)

+    @staticmethod
+    def adjust_lr_for_muon(lr: float, param_shape) -> float:
+        adjusted_ratio: float = 0.2 * math.sqrt(max(param_shape[0], param_shape[1]))
+        return lr * adjusted_ratio
+
    @torch.no_grad()
    def step(self, closure: CLOSURE = None) -> LOSS:
        loss: LOSS = None
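A quick worked example of `adjust_lr_for_muon` above, which scales the base learning rate by 0.2 * sqrt(max(m, n)) following the Moonlight reference (the matrix shape and base rate are made up for illustration):

```python
import math

# For a hypothetical 4096 x 1024 weight matrix with base lr = 2e-2:
#   adjusted lr = 2e-2 * 0.2 * sqrt(max(4096, 1024)) = 2e-2 * 0.2 * 64 = 0.256
base_lr = 2e-2
adjusted_lr = base_lr * 0.2 * math.sqrt(max(4096, 1024))
assert abs(adjusted_lr - 0.256) < 1e-12
```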
@@ -137,7 +158,6 @@ def step(self, closure: CLOSURE = None) -> LOSS:
            if len(params) == 0:
                continue

-            lr = group['lr']
            momentum = group['momentum']

            total_params: int = sum(p.numel() for p in params)
@@ -149,34 +169,42 @@ def step(self, closure: CLOSURE = None) -> LOSS:
                    curr_idx += p.numel()
                    continue

-                g = p.grad
-                if g.ndim > 2:
-                    g = g.view(g.size(0), -1)
+                grad = p.grad
+                if grad.ndim > 2:
+                    grad = grad.view(grad.size(0), -1)

                state = self.state[p]
                if 'momentum_buffer' not in state:
-                    state['momentum_buffer'] = torch.zeros_like(g)
+                    state['momentum_buffer'] = torch.zeros_like(grad)

                buf = state['momentum_buffer']
-                buf.mul_(momentum).add_(g)
+                buf.lerp_(grad, weight=1.0 - momentum)

-                if group['nesterov']:
-                    g.add_(buf, alpha=momentum)
-                else:
-                    g = buf
+                grad = grad.lerp_(buf, momentum) if group['nesterov'] else buf

-                g = zero_power_via_newton_schulz_5(g, num_steps=group['ns_steps'])
-                g.mul_(max(1.0, g.size(0) / g.size(1)) ** 0.5)
+                grad = zero_power_via_newton_schulz_5(grad, num_steps=group['ns_steps']).flatten()

-                updates_flat[curr_idx:curr_idx + p.numel()] = g.flatten()  # fmt: skip
+                updates_flat[curr_idx:curr_idx + p.numel()] = grad  # fmt: skip

            if self.world_size > 1:  # pragma: no cover
                all_reduce(updates_flat, op=ReduceOp.SUM)

            curr_idx: int = 0
            for p in params:
-                g = updates_flat[curr_idx:curr_idx + p.numel()].view_as(p).type_as(p)  # fmt: skip
-                p.add_(g, alpha=-lr)
+                g = updates_flat[curr_idx:curr_idx + p.numel()].view_as(p)  # fmt: skip
+
+                self.apply_weight_decay(
+                    p,
+                    grad=g,
+                    lr=group['lr'],
+                    weight_decay=group['weight_decay'],
+                    weight_decouple=group['weight_decouple'],
+                    fixed_decay=False,
+                )
+
+                lr: float = self.adjust_lr_for_muon(group['lr'], p.size()) if group['use_adjusted_lr'] else group['lr']
+
+                p.add_(g, alpha=-lr * (max(1.0, p.size(-2) / p.size(-1)) ** 0.5))
                curr_idx += p.numel()

            params = [p for p in group['params'] if p.grad is not None and not self.state[p]['use_muon']]
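`zero_power_via_newton_schulz_5` itself is not part of this diff. For context, a quintic Newton-Schulz orthogonalization typically looks like the sketch below; the coefficients (3.4445, -4.7750, 2.0315) are the ones used in the original Muon reference implementation and are assumed here rather than taken from this repository's helper. The hunk above then rescales the orthogonalized update by sqrt(max(1, rows / cols)) and applies it with `p.add_(g, alpha=-lr * ...)`.

```python
import torch


def newton_schulz_orthogonalize(g: torch.Tensor, num_steps: int = 5, eps: float = 1e-7) -> torch.Tensor:
    # Sketch of a quintic Newton-Schulz iteration: it pushes the singular values of g
    # toward 1, i.e. approximates the nearest (semi-)orthogonal matrix, and can run in bfloat16.
    a, b, c = 3.4445, -4.7750, 2.0315
    x = g.bfloat16()
    if x.size(-2) > x.size(-1):  # iterate on the wide orientation
        x = x.mT
    x = x / (x.norm(dim=(-2, -1), keepdim=True) + eps)  # Frobenius normalization bounds the spectral norm by 1
    for _ in range(num_steps):
        s = x @ x.mT
        x = a * x + (b * s + c * s @ s) @ x
    if g.size(-2) > g.size(-1):
        x = x.mT
    return x.to(g.dtype)
```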