4 files changed: +12 -8 lines changed

@@ -53,7 +53,8 @@ def __init__(
         :param weight_decouple: bool. the optimizer uses decoupled weight decay
             as in AdamW
         :param fixed_decay: bool.
-        :param eps: float. term added to the denominator to improve numerical stability
+        :param eps: float. term added to the denominator
+            to improve numerical stability
         :param weight_decay: float. weight decay (L2 penalty)
         :param amsbound: bool. whether to use the AMSBound variant
         """
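For readers unfamiliar with the `eps` parameter being re-documented here, a minimal sketch of an Adam-style update shows where it enters the denominator. The function and variable names below are illustrative only, not this repository's API.

```python
import torch

# Minimal Adam-style update sketch: `eps` keeps the denominator strictly
# positive so the division stays numerically stable. Names are illustrative.
def adam_style_update(param, grad, exp_avg, exp_avg_sq,
                      lr=1e-3, betas=(0.9, 0.999), eps=1e-8):
    beta1, beta2 = betas
    exp_avg.mul_(beta1).add_(grad, alpha=1.0 - beta1)                # first moment
    exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1.0 - beta2)   # second moment
    denom = exp_avg_sq.sqrt().add_(eps)   # the "term added to the denominator"
    param.addcdiv_(exp_avg, denom, value=-lr)
```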

@@ -47,7 +47,8 @@ def __init__(
         :param lr: float. learning rate.
         :param betas: BETAS. coefficients used for computing running averages
             of gradient and the squared hessian trace
-        :param eps: float. term added to the denominator to improve numerical stability
+        :param eps: float. term added to the denominator
+            to improve numerical stability
         :param weight_decay: float. weight decay (L2 penalty)
         :param hessian_power: float. exponent of the hessian trace
         :param update_each: int. compute the hessian trace approximation
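The docstring above refers to a Hessian trace approximation. For context, here is a hedged sketch of Hutchinson's estimator, the standard way such an estimate (in practice the per-parameter Hessian diagonal) is computed; it is a generic illustration, not this repository's implementation.

```python
import torch

# Hedged sketch of Hutchinson's estimator: E[z * (Hz)] with Rademacher z
# approximates the diagonal of the Hessian, which Hessian-based optimizers
# use in place of the squared gradient. Illustrative only.
def hutchinson_hessian_diagonal(loss, params, n_samples=1):
    grads = torch.autograd.grad(loss, params, create_graph=True)
    estimates = [torch.zeros_like(p) for p in params]
    for _ in range(n_samples):
        zs = [torch.randint_like(p, high=2) * 2.0 - 1.0 for p in params]  # +/-1 entries
        h_zs = torch.autograd.grad(grads, params, grad_outputs=zs, retain_graph=True)
        for est, h_z, z in zip(estimates, h_zs, zs):
            est.add_(h_z * z / n_samples)  # running average of z * (Hz)
    return estimates
```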

@@ -47,7 +47,8 @@ def __init__(
         :param lr: float. learning rate.
         :param betas: BETAS. coefficients used for computing running averages
             of gradient and the squared hessian trace
-        :param eps: float. term added to the denominator to improve numerical stability
+        :param eps: float. term added to the denominator
+            to improve numerical stability
         :param weight_decay: float. weight decay (L2 penalty)
         :param delta: float. threshold that determines
             whether a set of parameters is scale invariant or not

@@ -43,7 +43,8 @@ def __init__(
         :param lr: float. learning rate.
         :param betas: BETAS. coefficients used for computing running averages
             of gradient and the squared hessian trace
-        :param eps: float. term added to the denominator to improve numerical stability
+        :param eps: float. term added to the denominator
+            to improve numerical stability
         :param weight_decay: float. weight decay (L2 penalty)
         """

@@ -60,11 +61,11 @@ def __init__(
         super().__init__(params, defaults)

     def check_valid_parameters(self):
-        if 0.0 > self.lr:
+        if self.lr < 0.0:
             raise ValueError(f'Invalid learning rate : {self.lr}')
-        if 0.0 > self.eps:
+        if self.eps < 0.0:
             raise ValueError(f'Invalid eps : {self.eps}')
-        if 0.0 > self.weight_decay:
+        if self.weight_decay < 0.0:
             raise ValueError(f'Invalid weight_decay : {self.weight_decay}')
         if not 0.0 <= self.betas[0] < 1.0:
             raise ValueError(f'Invalid beta_0 : {self.betas[0]}')
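The flipped comparisons are behavior-preserving: `self.lr < 0.0` rejects exactly the same values as `0.0 > self.lr`, it just reads subject-first. A quick illustrative check with a hypothetical standalone guard:

```python
# Hypothetical standalone version of the guard above, for illustration only.
def validate_lr(lr: float) -> None:
    if lr < 0.0:  # equivalent to the old `if 0.0 > lr`
        raise ValueError(f'Invalid learning rate : {lr}')

validate_lr(1e-3)       # accepted silently
try:
    validate_lr(-1.0)
except ValueError as exc:
    print(exc)          # Invalid learning rate : -1.0
```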
@@ -87,7 +88,7 @@ def step(self, closure: CLOSURE = None) -> LOSS:
                 grad = p.grad.data
                 if grad.is_sparse:
                     raise RuntimeError(
-                        'diffGrad does not support sparse gradients, please consider SparseAdam instead'
+                        'diffGrad does not support sparse gradients'
                     )

                 state = self.state[p]
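For context on when the `grad.is_sparse` branch fires: sparse gradients typically come from an embedding layer created with `sparse=True`, which dense-only optimizers like this one reject. A minimal illustration, not taken from the repository's tests:

```python
import torch

# An embedding created with sparse=True produces sparse gradients, which
# would trip the RuntimeError above if handed to this optimizer.
emb = torch.nn.Embedding(num_embeddings=10, embedding_dim=4, sparse=True)
loss = emb(torch.tensor([1, 2, 3])).sum()
loss.backward()
print(emb.weight.grad.is_sparse)  # True
```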