3535 use_nesterov: Whether to use Nesterov-style momentum in the internal SGD.
3636 weight_decay: The weight decay used by the optimizer; defaults to decoupled weight decay.
3737 See Decoupled Weight Decay Regularization: https://arxiv.org/abs/1711.05101
38- use_decoupled_weight_decay : Whether to use decoupled weight decay, default to be True.
39- use_independent_weight_decay : Whether to use independent weight decay (https://arxiv.org/abs/2510.19093),
38+ use_decoupled_wd: Whether to use decoupled weight decay; defaults to True.
39+ use_independent_wd : Whether to use independent weight decay (https://arxiv.org/abs/2510.19093),
4040 defaults to False.
4141 fp32_matmul_prec: Precision of the matmul operations in optimizer states GEMM operations.
4242"""
@@ -101,8 +101,8 @@ def __init__(
101101 momentum_beta : float ,
102102 use_nesterov : bool ,
103103 weight_decay : float ,
104- use_decoupled_weight_decay : bool ,
105- use_independent_weight_decay : bool ,
104+ use_decoupled_wd : bool ,
105+ use_independent_wd : bool ,
106106 fp32_matmul_prec : str ,
107107 scaled_orthogonalize_fn : Callable | None = None ,
108108 ** kwargs : Any ,
@@ -117,8 +117,8 @@ def __init__(
117117 momentum_beta = momentum_beta ,
118118 use_nesterov = use_nesterov ,
119119 weight_decay = weight_decay ,
120- use_decoupled_wd = use_decoupled_weight_decay ,
121- use_independent_wd = use_independent_weight_decay ,
120+ use_decoupled_wd = use_decoupled_wd ,
121+ use_independent_wd = use_independent_wd ,
122122 ** kwargs ,
123123 )
124124
0 commit comments