
Commit 2e41dfa

refactor: types
1 parent a6f2e5d

8 files changed, 75 additions and 49 deletions

pytorch_optimizer/adamp.py

Lines changed: 11 additions & 7 deletions
@@ -1,24 +1,26 @@
 import math
-from typing import Any, Callable, Dict, List, Optional, Tuple
+from typing import Callable, List, Tuple

 import torch
 import torch.nn.functional as F
 from torch.optim.optimizer import Optimizer

+from pytorch_optimizer.types import BETAS, CLOSURE, DEFAULT_PARAMETERS, LOSS
+

 class AdamP(Optimizer):
     def __init__(
         self,
         params,
         lr: float = 1e-3,
-        betas: Tuple[float, float] = (0.9, 0.999),
+        betas: BETAS = (0.9, 0.999),
         eps: float = 1e-8,
         weight_decay: float = 0.0,
         delta: float = 0.1,
         wd_ratio: float = 0.1,
         nesterov: bool = False,
     ):
-        defaults: Dict[str, Any] = dict(
+        defaults: DEFAULT_PARAMETERS = dict(
             lr=lr,
             betas=betas,
             eps=eps,
@@ -39,7 +41,10 @@ def layer_view(x: torch.Tensor) -> torch.Tensor:

     @staticmethod
     def cosine_similarity(
-        x: torch.Tensor, y: torch.Tensor, eps: float, view_func: Callable
+        x: torch.Tensor,
+        y: torch.Tensor,
+        eps: float,
+        view_func: Callable[[torch.Tensor], torch.Tensor],
     ):
         x = view_func(x)
         y = view_func(y)
@@ -74,8 +79,8 @@ def projection(

         return perturb, wd

-    def step(self, closure: Optional[Callable] = None) -> float:
-        loss: Optional[float] = None
+    def step(self, closure: CLOSURE = None) -> LOSS:
+        loss: LOSS = None
         if closure is not None:
             loss = closure()

@@ -114,7 +119,6 @@ def step(self, closure: Optional[Callable] = None) -> float:
             else:
                 perturb = exp_avg / denom

-            # Projection
             wd_ratio: float = 1
             if len(p.shape) > 1:
                 perturb, wd_ratio = self.projection(
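
The tightened annotation Callable[[torch.Tensor], torch.Tensor] describes view helpers such as the layer_view referenced in the hunk header above. A minimal sketch of a conforming view function and a cosine-similarity call, where the bodies are illustrative assumptions and not part of this commit:

import torch
import torch.nn.functional as F


def layer_view(x: torch.Tensor) -> torch.Tensor:
    # Flatten the parameter tensor to a single row; this satisfies the
    # Callable[[torch.Tensor], torch.Tensor] contract in the new signature.
    return x.view(1, -1)


# Hypothetical call mirroring the inputs of AdamP.cosine_similarity.
x, y = torch.randn(4, 3, 3), torch.randn(4, 3, 3)
sim = F.cosine_similarity(layer_view(x), layer_view(y), dim=1, eps=1e-8)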

pytorch_optimizer/lookahead.py

Lines changed: 21 additions & 13 deletions
@@ -1,19 +1,27 @@
 from collections import defaultdict
-from typing import Callable, Dict, List, Optional
+from typing import Dict

 import torch
 from torch.optim import Optimizer

+from pytorch_optimizer.types import (
+    CLOSURE,
+    LOSS,
+    PARAM_GROUP,
+    PARAM_GROUPS,
+    STATE,
+)
+

 class Lookahead(Optimizer):
     def __init__(self, optimizer: Optimizer, k: int = 5, alpha: float = 0.5):
         self.optimizer = optimizer
         self.k = k
         self.alpha = alpha

-        self.param_groups: List[Dict] = self.optimizer.param_groups
-        self.fast_state: Dict = self.optimizer.state
-        self.state = defaultdict(dict)
+        self.param_groups: PARAM_GROUPS = self.optimizer.param_groups
+        self.fast_state: STATE = self.optimizer.state
+        self.state: STATE = defaultdict(dict)

         for group in self.param_groups:
             group['counter'] = 0
@@ -32,8 +40,8 @@ def update_lookahead(self):
         for group in self.param_groups:
             self.update(group)

-    def step(self, closure: Optional[Callable] = None) -> float:
-        loss: float = self.optimizer.step(closure)
+    def step(self, closure: CLOSURE = None) -> LOSS:
+        loss: LOSS = self.optimizer.step(closure)
         for group in self.param_groups:
             if group['counter'] == 0:
                 self.update(group)
@@ -42,12 +50,12 @@ def step(self, closure: Optional[Callable] = None) -> float:
             group['counter'] = 0
         return loss

-    def state_dict(self) -> Dict[str, torch.Tensor]:
-        fast_state_dict = self.optimizer.state_dict()
+    def state_dict(self) -> STATE:
+        fast_state_dict: STATE = self.optimizer.state_dict()
         fast_state = fast_state_dict['state']
         param_groups = fast_state_dict['param_groups']

-        slow_state: Dict[int, torch.Tensor] = {
+        slow_state: STATE = {
             (id(k) if isinstance(k, torch.Tensor) else k): v
             for k, v in self.state.items()
         }
@@ -58,12 +66,12 @@ def state_dict(self) -> Dict[str, torch.Tensor]:
             'param_groups': param_groups,
         }

-    def load_state_dict(self, state_dict: Dict[str, torch.Tensor]):
-        slow_state_dict: Dict[str, torch.Tensor] = {
+    def load_state_dict(self, state_dict: STATE):
+        slow_state_dict: STATE = {
             'state': state_dict['slow_state'],
             'param_groups': state_dict['param_groups'],
         }
-        fast_state_dict: Dict[str, torch.Tensor] = {
+        fast_state_dict: STATE = {
             'state': state_dict['fast_state'],
             'param_groups': state_dict['param_groups'],
         }
@@ -72,6 +80,6 @@ def load_state_dict(self, state_dict: Dict[str, torch.Tensor]):
         self.optimizer.load_state_dict(fast_state_dict)
         self.fast_state = self.optimizer.state

-    def add_param_group(self, param_group: Dict):
+    def add_param_group(self, param_group: PARAM_GROUP):
         param_group['counter'] = 0
         self.optimizer.add_param_group(param_group)
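
As a quick orientation for the annotations above (PARAM_GROUPS, STATE, CLOSURE, LOSS), here is a minimal, hypothetical usage sketch of wrapping an inner optimizer with Lookahead; the model and data are placeholders and not part of the commit:

import torch
from pytorch_optimizer.adamp import AdamP
from pytorch_optimizer.lookahead import Lookahead

model = torch.nn.Linear(10, 1)
base = AdamP(model.parameters(), lr=1e-3, betas=(0.9, 0.999))

# Lookahead keeps fast weights in the wrapped optimizer's state (STATE) and
# syncs slow weights every k steps across its param_groups (PARAM_GROUPS).
optimizer = Lookahead(base, k=5, alpha=0.5)

loss = torch.nn.functional.mse_loss(model(torch.randn(8, 10)), torch.zeros(8, 1))
loss.backward()
optimizer.step()  # step(closure: CLOSURE = None) -> LOSS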

pytorch_optimizer/madgrad.py

Lines changed: 5 additions & 6 deletions
@@ -1,9 +1,10 @@
 import math
-from typing import Any, Callable, Dict, Optional

 import torch
 from torch.optim import Optimizer

+from pytorch_optimizer.types import CLOSURE, DEFAULT_PARAMETERS, LOSS
+

 class MADGRAD(Optimizer):
     """
@@ -26,7 +27,7 @@ def __init__(

         self.check_valid_parameters()

-        defaults: Dict[str, Any] = dict(
+        defaults: DEFAULT_PARAMETERS = dict(
             lr=lr, eps=eps, momentum=momentum, weight_decay=weight_decay
         )
         super().__init__(params, defaults)
@@ -49,15 +50,13 @@ def supports_memory_efficient_fp16(self) -> bool:
     def supports_flat_params(self) -> bool:
         return True

-    def step(
-        self, closure: Optional[Callable[[], float]] = None
-    ) -> Optional[float]:
+    def step(self, closure: CLOSURE = None) -> LOSS:
         """Performs a single optimization step.
         Arguments:
             closure (callable, optional): A closure that reevaluates the model
                 and returns the loss.
         """
-        loss: Optional[float] = None
+        loss: LOSS = None
         if closure is not None:
             loss = closure()

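
The CLOSURE and LOSS aliases formalize the closure contract described in the docstring above. A minimal, hypothetical sketch of such a closure; the model, data, and stand-in optimizer are placeholders, not part of this commit:

import torch

model = torch.nn.Linear(4, 1)
inputs, targets = torch.randn(16, 4), torch.randn(16, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)  # stand-in optimizer


def closure() -> torch.Tensor:
    # Re-evaluates the model and returns the loss, matching the
    # "closure (callable, optional)" contract in the docstring.
    optimizer.zero_grad()
    loss = torch.nn.functional.mse_loss(model(inputs), targets)
    loss.backward()
    return loss


loss = optimizer.step(closure)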

pytorch_optimizer/radam.py

Lines changed: 7 additions & 5 deletions
@@ -1,9 +1,11 @@
 import math
-from typing import Any, Callable, Dict, Optional, Tuple
+from typing import Dict

 import torch
 from torch.optim.optimizer import Optimizer

+from pytorch_optimizer.types import BETAS, CLOSURE, DEFAULT_PARAMETERS, LOSS
+

 class RAdam(Optimizer):
     """
@@ -15,7 +17,7 @@ def __init__(
         self,
         params,
         lr: float = 1e-3,
-        betas: Tuple[float, float] = (0.9, 0.999),
+        betas: BETAS = (0.9, 0.999),
         eps: float = 1e-8,
         weight_decay: float = 0.0,
         n_sma_threshold: int = 5,
@@ -42,7 +44,7 @@ def __init__(
         ):
             param['buffer'] = [[None, None, None] for _ in range(10)]

-        defaults: Dict[str, Any] = dict(
+        defaults: DEFAULT_PARAMETERS = dict(
             lr=lr,
             betas=betas,
             eps=eps,
@@ -67,8 +69,8 @@ def check_valid_parameters(self):
     def __setstate__(self, state: Dict):
         super().__setstate__(state)

-    def step(self, closure: Optional[Callable] = None) -> float:
-        loss: Optional[float] = None
+    def step(self, closure: CLOSURE = None) -> LOSS:
+        loss: LOSS = None
         if closure is not None:
             loss = closure()


pytorch_optimizer/ranger.py

Lines changed: 14 additions & 8 deletions
@@ -1,9 +1,17 @@
 import math
-from typing import Any, Callable, Dict, List, Optional, Tuple
+from typing import Dict

 import torch
 from torch.optim.optimizer import Optimizer

+from pytorch_optimizer.types import (
+    BETAS,
+    BUFFER,
+    CLOSURE,
+    DEFAULT_PARAMETERS,
+    LOSS,
+)
+

 class Ranger(Optimizer):
     """
@@ -21,7 +29,7 @@ def __init__(
         alpha: float = 0.5,
         k: int = 6,
         n_sma_threshold: int = 5,
-        betas: Tuple[float, float] = (0.95, 0.999),
+        betas: BETAS = (0.95, 0.999),
         eps: float = 1e-5,
         weight_decay: float = 0.0,
         use_gc: bool = True,
@@ -37,13 +45,11 @@ def __init__(
         self.use_gc = use_gc

         self.gc_gradient_threshold: int = 3 if gc_conv_only else 1
-        self.buffer: List[List[Optional[torch.Tensor]]] = [
-            [None, None, None] for _ in range(10)
-        ]
+        self.buffer: BUFFER = [[None, None, None] for _ in range(10)]

         self.check_valid_parameters()

-        defaults: Dict[str, Any] = dict(
+        defaults: DEFAULT_PARAMETERS = dict(
             lr=lr,
             alpha=alpha,
             k=k,
@@ -72,8 +78,8 @@ def check_valid_parameters(self):
     def __setstate__(self, state: Dict):
         super().__setstate__(state)

-    def step(self, _: Optional[Callable] = None) -> float:
-        loss: Optional[float] = None
+    def step(self, _: CLOSURE = None) -> LOSS:
+        loss: LOSS = None

         for group in self.param_groups:
             for p in group['params']:

pytorch_optimizer/ranger21.py

Lines changed: 6 additions & 5 deletions
@@ -9,7 +9,7 @@

 import collections
 import math
-from typing import Any, Callable, Dict, List, Optional, Tuple
+from typing import Dict, List, Optional

 import numpy as np
 import torch
@@ -19,6 +19,7 @@
 from pytorch_optimizer.agc import agc
 from pytorch_optimizer.chebyshev_schedule import get_chebyshev_schedule
 from pytorch_optimizer.gc import centralize_gradient
+from pytorch_optimizer.types import BETAS, CLOSURE, DEFAULT_PARAMETERS, LOSS
 from pytorch_optimizer.utils import normalize_gradient, unit_norm


@@ -43,7 +44,7 @@ def __init__(
         use_adaptive_gradient_clipping: bool = True,
         agc_clipping_value: float = 1e-2,
         agc_eps: float = 1e-3,
-        betas: Tuple[float, float] = (0.9, 0.999),
+        betas: BETAS = (0.9, 0.999),
         momentum_type: str = 'pnm',
         pnm_momentum_factor: float = 1.0,
         momentum: float = 0.9,
@@ -62,7 +63,7 @@ def __init__(
         warmup_pct_default: float = 0.22,
         logging_active: bool = True,
     ):
-        defaults: Dict[str, Any] = dict(
+        defaults: DEFAULT_PARAMETERS = dict(
             lr=lr,
             momentum=momentum,
             betas=betas,
@@ -313,8 +314,8 @@ def get_state_values(group, state):
         return beta1, beta2, mean_avg, variance_avg

     @torch.no_grad()
-    def step(self, closure: Optional[Callable] = None):
-        loss = None
+    def step(self, closure: CLOSURE = None) -> LOSS:
+        loss: LOSS = None
         if closure is not None and isinstance(closure, collections.Callable):
             with torch.enable_grad():
                 loss = closure()

pytorch_optimizer/sgdp.py

Lines changed: 10 additions & 5 deletions
@@ -1,10 +1,12 @@
 import math
-from typing import Any, Callable, Dict, List, Optional, Tuple
+from typing import Callable, List, Tuple

 import torch
 import torch.nn.functional as F
 from torch.optim.optimizer import Optimizer

+from pytorch_optimizer.types import CLOSURE, DEFAULT_PARAMETERS, LOSS
+

 class SGDP(Optimizer):
     def __init__(
@@ -19,7 +21,7 @@ def __init__(
         delta: float = 0.1,
         wd_ratio: float = 0.1,
     ):
-        defaults: Dict[str, Any] = dict(
+        defaults: DEFAULT_PARAMETERS = dict(
             lr=lr,
             momentum=momentum,
             dampening=dampening,
@@ -41,7 +43,10 @@ def layer_view(x: torch.Tensor) -> torch.Tensor:

     @staticmethod
     def cosine_similarity(
-        x: torch.Tensor, y: torch.Tensor, eps: float, view_func: Callable
+        x: torch.Tensor,
+        y: torch.Tensor,
+        eps: float,
+        view_func: Callable[[torch.Tensor], torch.Tensor],
     ):
         x = view_func(x)
         y = view_func(y)
@@ -76,8 +81,8 @@ def projection(

         return perturb, wd

-    def step(self, closure: Optional[Callable] = None) -> float:
-        loss: Optional[float] = None
+    def step(self, closure: CLOSURE = None) -> LOSS:
+        loss: LOSS = None
         if closure is not None:
             loss = closure()


pytorch_optimizer/types.py

Lines changed: 1 addition & 0 deletions
@@ -10,3 +10,4 @@
 PARAM_GROUP = Dict
 PARAM_GROUPS = List[PARAM_GROUP]
 STATE = Dict[str, Any]
+BUFFER = List[List[Optional[torch.Tensor]]]
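
For reference, a plausible reconstruction of the full pytorch_optimizer/types.py after this commit. Only the BUFFER line and the PARAM_GROUP/PARAM_GROUPS/STATE context lines are confirmed by the diff; the remaining definitions are inferred from how the aliases are used in the files above and should be read as assumptions:

from typing import Any, Callable, Dict, List, Optional, Tuple

import torch

CLOSURE = Optional[Callable[[], float]]      # optional loss-reevaluating closure (inferred)
LOSS = Optional[float]                       # value returned by step() (inferred)
BETAS = Tuple[float, float]                  # replaces Tuple[float, float] annotations (inferred)
DEFAULT_PARAMETERS = Dict[str, Any]          # per-optimizer defaults dict (inferred)
PARAM_GROUP = Dict                           # shown as diff context
PARAM_GROUPS = List[PARAM_GROUP]             # shown as diff context
STATE = Dict[str, Any]                       # shown as diff context
BUFFER = List[List[Optional[torch.Tensor]]]  # added by this commit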
