docs: docstring

kozistr · kozistr · commit 37d2e030c784 · 2023-01-31T14:20:46.000+09:00
diff --git a/pytorch_optimizer/base/optimizer.py b/pytorch_optimizer/base/optimizer.py
@@ -7,6 +7,8 @@
 
 
 class BaseOptimizer(ABC):
+    r"""Base optimizer class."""
+
     @staticmethod
     def validate_learning_rate(learning_rate: float):
         if learning_rate < 0.0:
diff --git a/pytorch_optimizer/base/scheduler.py b/pytorch_optimizer/base/scheduler.py
@@ -6,7 +6,9 @@
 
 
 class BaseLinearWarmupScheduler(ABC):
-    r"""BaseLinearWarmupScheduler class. The LR Scheduler class based on this class has linear warmup strategy.
+    r"""BaseLinearWarmupScheduler class.
+
+        The LR Scheduler class based on this class has linear warmup strategy.
 
     :param optimizer: Optimizer. OPTIMIZER. It will set learning rate to all trainable parameters in optimizer.
     :param t_max: int. total steps to train.
diff --git a/pytorch_optimizer/lr_scheduler/chebyshev.py b/pytorch_optimizer/lr_scheduler/chebyshev.py
@@ -2,7 +2,7 @@
 
 
 def chebyshev_steps(small_m: float, big_m: float, num_epochs: int) -> np.ndarray:
-    r"""chebyshev_steps.
+    r"""Chebyshev steps.
 
     :param small_m: float. stands for 'm' notation.
     :param big_m:  float. stands for 'M' notation.
@@ -16,13 +16,15 @@ def chebyshev_steps(small_m: float, big_m: float, num_epochs: int) -> np.ndarray
 
 
 def chebyshev_perm(num_epochs: int) -> np.ndarray:
+    r"""Chebyshev permutation."""
     perm = np.array([0])
     while len(perm) < num_epochs:
         perm = np.vstack([perm, 2 * len(perm) - 1 - perm]).T.flatten()
     return perm
 
 
 def get_chebyshev_schedule(num_epochs: int) -> np.ndarray:
+    r"""Get Chebyshev schedules."""
     steps: np.ndarray = chebyshev_steps(0.1, 1, num_epochs - 2)
     perm: np.ndarray = chebyshev_perm(num_epochs - 2)
     return steps[perm]
diff --git a/pytorch_optimizer/lr_scheduler/linear_warmup.py b/pytorch_optimizer/lr_scheduler/linear_warmup.py
@@ -6,13 +6,17 @@
 
 
 class LinearScheduler(BaseLinearWarmupScheduler):
+    r"""Linear LR Scheduler w/ linear warmup."""
+
     def _step(self) -> float:
         return self.max_lr + (self.min_lr - self.max_lr) * (self.step_t - self.warmup_steps) / (
             self.total_steps - self.warmup_steps
         )
 
 
 class CosineScheduler(BaseLinearWarmupScheduler):
+    r"""Cosine LR Scheduler w/ linear warmup."""
+
     def _step(self) -> float:
         phase: float = (self.step_t - self.warmup_steps) / (self.total_steps - self.warmup_steps) * math.pi
         return self.min_lr + (self.max_lr - self.min_lr) * (np.cos(phase) + 1.0) / 2.0
diff --git a/pytorch_optimizer/lr_scheduler/proportion.py b/pytorch_optimizer/lr_scheduler/proportion.py
@@ -2,7 +2,8 @@
 
 
 class ProportionScheduler:
-    r"""ProportionScheduler (Rho Scheduler of GSAM)
+    r"""ProportionScheduler (Rho Scheduler of GSAM).
+
         This scheduler outputs a value that evolves proportional to lr_scheduler.
 
     :param lr_scheduler: learning rate scheduler.
diff --git a/pytorch_optimizer/optimizer/fp16.py b/pytorch_optimizer/optimizer/fp16.py
@@ -10,6 +10,7 @@
 
 class DynamicLossScaler:
     r"""Dynamically adjusts the loss scaling factor.
+
         Dynamic loss scalers are important in mixed-precision training.
         They help us avoid underflows and overflows in low-precision gradients.
 
@@ -50,8 +51,9 @@ def __init__(
 
     def update_scale(self, overflow: bool):
         r"""Update the loss scale.
-        If overflow exceeds our tolerance, we decrease the loss scale. If the number of
-        iterations since the last overflow exceeds the scale window, we increase the loss scale.
+
+            If overflow exceeds our tolerance, we decrease the loss scale.
+            If the number of iterations since the last overflow exceeds the scale window, we increase the loss scale.
 
         :param overflow: bool. adjust scales to prevent overflow.
         """
@@ -79,17 +81,31 @@ def update_scale(self, overflow: bool):
 
     def decrease_loss_scale(self):
         r"""Decrease the loss scale by self.scale_factor.
-        NOTE: the loss_scale will not go below self.threshold.
+
+        NOTE: the loss_scale will not go below `self.threshold`.
         """
         self.loss_scale /= self.scale_factor
         if self.threshold is not None:
             self.loss_scale = max(self.loss_scale, self.threshold)
 
 
 class SafeFP16Optimizer(Optimizer):
-    def __init__(self, optimizer: OPTIMIZER, aggregate_g_norms: bool = False):
+    r"""Safe FP16 Optimizer.
+
+    :param optimizer: OPTIMIZER. optimizer to wrap.
+    :param aggregate_g_norms: bool. passed as `sync` to `clip_grad_norm` when clipping the fp32 gradients.
+    :param min_loss_scale: float. loss scale floor. an overflow at or below it raises FloatingPointError.
+    """
+
+    def __init__(
+        self,
+        optimizer: OPTIMIZER,
+        aggregate_g_norms: bool = False,
+        min_loss_scale: float = 2 ** -5,
+    ):  # fmt: skip
         self.optimizer = optimizer
         self.aggregate_g_norms = aggregate_g_norms
+        self.min_loss_scale = min_loss_scale
 
         self.fp16_params = self.get_parameters(optimizer)
         self.fp32_params = self.build_fp32_params(self.fp16_params, flatten=False)
@@ -104,7 +120,6 @@ def __init__(self, optimizer: OPTIMIZER, aggregate_g_norms: bool = False):
         optimizer.param_groups[0]['params'] = self.fp32_params
 
         self.scaler: DynamicLossScaler = DynamicLossScaler(2.0 ** 15)  # fmt: skip
-        self.min_loss_scale: float = 2 ** -5  # fmt: skip
         self.needs_sync: bool = True
 
     @classmethod
@@ -151,6 +166,7 @@ def state_dict(self) -> Dict:
 
     def load_state_dict(self, state_dict: Dict):
         r"""Load an optimizer state dict.
+
             In general, we should prefer the configuration of the existing optimizer instance
             (e.g., learning rate) over that found in the state_dict. This allows us to
             resume training from a checkpoint using a new set of optimizer args.
@@ -162,9 +178,13 @@ def load_state_dict(self, state_dict: Dict):
         self.optimizer.load_state_dict(state_dict)
 
     def backward(self, loss, update_main_grads: bool = False):
-        r"""Computes the sum of gradients of the given tensor w.r.t. graph leaves.
-        Compared to :func:`fairseq.optim.FairseqOptimizer.backward`, this function
-        additionally dynamically scales the loss to avoid gradient underflow.
+        r"""Compute the sum of gradients of the given tensor w.r.t. graph leaves.
+
+            Compared to :func:`fairseq.optim.FairseqOptimizer.backward`, this function
+            additionally dynamically scales the loss to avoid gradient underflow.
+
+        :param loss: torch.Tensor. loss. multiplied by the current loss scale when a scaler is set.
+        :param update_main_grads: bool. update main gradient.
         """
         if self.scaler is not None:
             loss = loss * self.scaler.loss_scale
@@ -176,6 +196,7 @@ def backward(self, loss, update_main_grads: bool = False):
             self.update_main_grads()
 
     def sync_fp16_grads_to_fp32(self, multiply_grads: float = 1.0):
+        r"""Sync fp16 to fp32 gradients."""
         if self.needs_sync:
             if self.scaler is not None:
                 # correct for dynamic loss scaler
@@ -195,7 +216,7 @@ def sync_fp16_grads_to_fp32(self, multiply_grads: float = 1.0):
             self.needs_sync = False
 
     def multiply_grads(self, c: float):
-        r"""Multiplies grads by a constant c."""
+        r"""Multiply grads by a constant c."""
         if self.needs_sync:
             self.sync_fp16_grads_to_fp32(c)
         else:
@@ -206,7 +227,7 @@ def update_main_grads(self):
         self.sync_fp16_grads_to_fp32()
 
     def clip_main_grads(self, max_norm: float):
-        r"""Clips gradient norm and updates dynamic loss scaler."""
+        r"""Clip gradient norm and update dynamic loss scaler."""
         self.sync_fp16_grads_to_fp32()
 
         grad_norm = clip_grad_norm(self.fp32_params, max_norm, sync=self.aggregate_g_norms)
@@ -221,8 +242,8 @@ def clip_main_grads(self, max_norm: float):
             if overflow:
                 self.zero_grad()
                 if self.scaler.loss_scale <= self.min_loss_scale:
-                    # Use FloatingPointError as an uncommon error that parent
-                    # functions can safely catch to stop training.
+                    # Use FloatingPointError as an uncommon error
+                    # that parent functions can safely catch to stop training.
                     self.scaler.loss_scale = prev_scale
 
                     raise FloatingPointError(
@@ -235,7 +256,7 @@ def clip_main_grads(self, max_norm: float):
         return grad_norm
 
     def step(self, closure: CLOSURE = None):
-        r"""Performs a single optimization step."""
+        r"""Perform a single optimization step."""
         self.sync_fp16_grads_to_fp32()
         self.optimizer.step(closure)
 
@@ -246,17 +267,19 @@ def step(self, closure: CLOSURE = None):
             p.data.copy_(p32)
 
     def zero_grad(self):
-        r"""Clears the gradients of all optimized parameters."""
+        r"""Clear the gradients of all optimized parameters."""
         for p in self.fp16_params:
             p.grad = None
         for p32 in self.fp32_params:
             p32.grad.zero_()
         self.needs_sync = False
 
     def get_lr(self) -> float:
+        r"""Get learning rate."""
         return self.optimizer.get_lr()
 
     def set_lr(self, lr: float):
+        r"""Set learning rate."""
         self.optimizer.set_lr(lr)
 
     @property
diff --git a/pytorch_optimizer/optimizer/gsam.py b/pytorch_optimizer/optimizer/gsam.py
@@ -178,10 +178,11 @@ def maybe_no_sync(self):
 
     @torch.no_grad()
     def set_closure(self, loss_fn: nn.Module, inputs: torch.Tensor, targets: torch.Tensor, **kwargs):
-        r"""set closure
-        create self.forward_backward_func, which is a function such that self.forward_backward_func() automatically
-        performs forward and backward passes. This function does not take any arguments, and the inputs and
-        targets data should be pre-set in the definition of partial-function.
+        r"""Set closure.
+
+            Create `self.forward_backward_func`, which is a function such that `self.forward_backward_func()`
+            automatically performs forward and backward passes. This function does not take any arguments,
+            and the inputs and targets data should be pre-set in the definition of partial-function.
 
         :param loss_fn: nn.Module. loss function.
         :param inputs: torch.Tensor. inputs.
diff --git a/pytorch_optimizer/optimizer/lamb.py b/pytorch_optimizer/optimizer/lamb.py
@@ -9,8 +9,9 @@
 
 
 class Lamb(Optimizer, BaseOptimizer):
-    r"""Large Batch Optimization for Deep Learning. This Lamb implementation is based on the paper v3,
-        which does not use de-biasing.
+    r"""Large Batch Optimization for Deep Learning.
+
+        This Lamb implementation is based on the paper v3, which does not use de-biasing.
 
     :param params: PARAMETERS. iterable of parameters to optimize or dicts defining parameter groups.
     :param lr: float. learning rate.
diff --git a/pytorch_optimizer/optimizer/ranger21.py b/pytorch_optimizer/optimizer/ranger21.py
@@ -15,6 +15,7 @@
 
 class Ranger21(Optimizer, BaseOptimizer):
     r"""Integrating the latest deep learning components into a single optimizer.
+
         Here's the components
             * uses the AdamW optimizer as its core (or, optionally, MadGrad)
             * Adaptive gradient clipping
diff --git a/pytorch_optimizer/optimizer/shampoo.py b/pytorch_optimizer/optimizer/shampoo.py
@@ -9,6 +9,7 @@
 
 class Shampoo(Optimizer, BaseOptimizer):
     r"""Preconditioned Stochastic Tensor Optimization.
+
         Reference : https://github.com/google-research/google-research/blob/master/scalable_shampoo/pytorch/shampoo.py.
 
     :param params: PARAMETERS. iterable of parameters to optimize or dicts defining parameter groups.