File tree Expand file tree Collapse file tree 1 file changed +7
-6
lines changed
pytorch_optimizer/optimizer Expand file tree Collapse file tree 1 file changed +7
-6
lines changed Original file line number Diff line number Diff line change @@ -35,11 +35,12 @@ class Muon(BaseOptimizer):
3535 :param ns_steps: int. the number of Newton-Schulz iterations to run. (5 is probably always enough)
3636 :param use_adjusted_lr: bool. whether to use the adjusted learning rate proposed in Moonlight.
3737 reference: https://github.com/MoonshotAI/Moonlight/blob/master/examples/toy_train.py
38- :param adamw_params: The parameters to be optimized by AdamW. Any parameters in `muon_params` which are {0, 1}-D or
39- are detected as being the embed or lm_head will be optimized by AdamW as well.
40- :param adamw_lr: The learning rate for the internal AdamW.
41- :param adamw_wd: The weight decay for the internal AdamW.
42- :param adamw_eps: The epsilon for the internal AdamW.
38+ :param adamw_params: Optional[PARAMETERS]. The parameters to be optimized by AdamW. Any parameters in `muon_params`
39+ which are {0, 1}-D or are detected as being the embed or lm_head will be optimized by AdamW as well. It is
40+ better to create a separate AdamW optimizer instead of using this.
41+ :param adamw_lr: float. The learning rate for the internal AdamW.
42+ :param adamw_wd: float. The weight decay for the internal AdamW.
43+ :param adamw_eps: float. The epsilon for the internal AdamW.
4344 """
4445
4546 def __init__ (
@@ -55,7 +56,7 @@ def __init__(
5556 use_adjusted_lr : bool = False ,
5657 adamw_params : Optional [PARAMETERS ] = None ,
5758 adamw_lr : float = 3e-4 ,
58- adamw_wd : float = 0 ,
59+ adamw_wd : float = 0.0 ,
5960 adamw_eps : float = 1e-8 ,
6061 ** kwargs ,
6162 ):
You can’t perform that action at this time.
0 commit comments