4 files changed: +12 -8 lines changed

@@ -53,7 +53,8 @@ def __init__(
         :param weight_decouple: bool. the optimizer uses decoupled weight decay
             as in AdamW
         :param fixed_decay: bool.
-        :param eps: float. term added to the denominator to improve numerical stability
+        :param eps: float. term added to the denominator
+            to improve numerical stability
         :param weight_decay: float. weight decay (L2 penalty)
         :param amsbound: bool. whether to use the AMSBound variant
         """
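For readers unfamiliar with the `eps` parameter being re-documented here, a minimal sketch of an Adam-style update shows where it enters the denominator. The function and variable names below are illustrative only, not this repository's API.

```python
import torch

# Minimal Adam-style update sketch: `eps` keeps the denominator strictly
# positive so the division stays numerically stable. Names are illustrative.
def adam_style_update(param, grad, exp_avg, exp_avg_sq,
                      lr=1e-3, betas=(0.9, 0.999), eps=1e-8):
    beta1, beta2 = betas
    exp_avg.mul_(beta1).add_(grad, alpha=1.0 - beta1)                # first moment
    exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1.0 - beta2)   # second moment
    denom = exp_avg_sq.sqrt().add_(eps)   # the "term added to the denominator"
    param.addcdiv_(exp_avg, denom, value=-lr)
```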

@@ -47,7 +47,8 @@ def __init__(
         :param lr: float. learning rate.
         :param betas: BETAS. coefficients used for computing running averages
             of gradient and the squared hessian trace
-        :param eps: float. term added to the denominator to improve numerical stability
+        :param eps: float. term added to the denominator
+            to improve numerical stability
         :param weight_decay: float. weight decay (L2 penalty)
         :param hessian_power: float. exponent of the hessian trace
         :param update_each: int. compute the hessian trace approximation
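The docstring above refers to a Hessian trace approximation. For context, here is a hedged sketch of Hutchinson's estimator, the standard way such an estimate (in practice the per-parameter Hessian diagonal) is computed; it is a generic illustration, not this repository's implementation.

```python
import torch

# Hedged sketch of Hutchinson's estimator: E[z * (Hz)] with Rademacher z
# approximates the diagonal of the Hessian, which Hessian-based optimizers
# use in place of the squared gradient. Illustrative only.
def hutchinson_hessian_diagonal(loss, params, n_samples=1):
    grads = torch.autograd.grad(loss, params, create_graph=True)
    estimates = [torch.zeros_like(p) for p in params]
    for _ in range(n_samples):
        zs = [torch.randint_like(p, high=2) * 2.0 - 1.0 for p in params]  # +/-1 entries
        h_zs = torch.autograd.grad(grads, params, grad_outputs=zs, retain_graph=True)
        for est, h_z, z in zip(estimates, h_zs, zs):
            est.add_(h_z * z / n_samples)  # running average of z * (Hz)
    return estimates
```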

@@ -47,7 +47,8 @@ def __init__(
         :param lr: float. learning rate.
         :param betas: BETAS. coefficients used for computing running averages
             of gradient and the squared hessian trace
-        :param eps: float. term added to the denominator to improve numerical stability
+        :param eps: float. term added to the denominator
+            to improve numerical stability
         :param weight_decay: float. weight decay (L2 penalty)
         :param delta: float. threshold that determines
             whether a set of parameters is scale invariant or not

@@ -43,7 +43,8 @@ def __init__(
         :param lr: float. learning rate.
         :param betas: BETAS. coefficients used for computing running averages
             of gradient and the squared hessian trace
-        :param eps: float. term added to the denominator to improve numerical stability
+        :param eps: float. term added to the denominator
+            to improve numerical stability
         :param weight_decay: float. weight decay (L2 penalty)
         """

@@ -60,11 +61,11 @@ def __init__(
         super().__init__(params, defaults)

     def check_valid_parameters(self):
-        if 0.0 > self.lr:
+        if self.lr < 0.0:
             raise ValueError(f'Invalid learning rate : {self.lr}')
-        if 0.0 > self.eps:
+        if self.eps < 0.0:
             raise ValueError(f'Invalid eps : {self.eps}')
-        if 0.0 > self.weight_decay:
+        if self.weight_decay < 0.0:
             raise ValueError(f'Invalid weight_decay : {self.weight_decay}')
         if not 0.0 <= self.betas[0] < 1.0:
             raise ValueError(f'Invalid beta_0 : {self.betas[0]}')
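The flipped comparisons are behavior-preserving: `self.lr < 0.0` rejects exactly the same values as `0.0 > self.lr`, it just reads subject-first. A quick illustrative check with a hypothetical standalone guard:

```python
# Hypothetical standalone version of the guard above, for illustration only.
def validate_lr(lr: float) -> None:
    if lr < 0.0:  # equivalent to the old `if 0.0 > lr`
        raise ValueError(f'Invalid learning rate : {lr}')

validate_lr(1e-3)       # accepted silently
try:
    validate_lr(-1.0)
except ValueError as exc:
    print(exc)          # Invalid learning rate : -1.0
```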
@@ -87,7 +88,7 @@ def step(self, closure: CLOSURE = None) -> LOSS:
                 grad = p.grad.data
                 if grad.is_sparse:
                     raise RuntimeError(
-                        'diffGrad does not support sparse gradients, please consider SparseAdam instead'
+                        'diffGrad does not support sparse gradients'
                     )

                 state = self.state[p]
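For context on when the `grad.is_sparse` branch fires: sparse gradients typically come from an embedding layer created with `sparse=True`, which dense-only optimizers like this one reject. A minimal illustration, not taken from the repository's tests:

```python
import torch

# An embedding created with sparse=True produces sparse gradients, which
# would trip the RuntimeError above if handed to this optimizer.
emb = torch.nn.Embedding(num_embeddings=10, embedding_dim=4, sparse=True)
loss = emb(torch.tensor([1, 2, 3])).sum()
loss.backward()
print(emb.weight.grad.is_sparse)  # True
```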