
Commit a3c6561

refactor: staticmethod to utils
1 parent 8f5892c commit a3c6561

3 files changed: +29 / -47 lines


pytorch_optimizer/adamp.py

Lines changed: 4 additions & 23 deletions
@@ -1,13 +1,13 @@
 import math
-from typing import Callable, List, Tuple
+from typing import List, Tuple

 import torch
-import torch.nn.functional as F
 from torch.optim.optimizer import Optimizer

 from pytorch_optimizer.base_optimizer import BaseOptimizer
 from pytorch_optimizer.gc import centralize_gradient
 from pytorch_optimizer.types import BETAS, CLOSURE, DEFAULTS, LOSS, PARAMETERS
+from pytorch_optimizer.utils import channel_view, cosine_similarity_by_view, layer_view


 class AdamP(Optimizer, BaseOptimizer):
@@ -80,25 +80,6 @@ def validate_parameters(self):
         self.validate_weight_decay_ratio(self.wd_ratio)
         self.validate_epsilon(self.eps)

-    @staticmethod
-    def channel_view(x: torch.Tensor) -> torch.Tensor:
-        return x.view(x.size()[0], -1)
-
-    @staticmethod
-    def layer_view(x: torch.Tensor) -> torch.Tensor:
-        return x.view(1, -1)
-
-    @staticmethod
-    def cosine_similarity(
-        x: torch.Tensor,
-        y: torch.Tensor,
-        eps: float,
-        view_func: Callable[[torch.Tensor], torch.Tensor],
-    ) -> torch.Tensor:
-        x = view_func(x)
-        y = view_func(y)
-        return F.cosine_similarity(x, y, dim=1, eps=eps).abs_()
-
     def projection(
         self,
         p,
@@ -110,8 +91,8 @@ def projection(
     ) -> Tuple[torch.Tensor, float]:
         wd: float = 1.0
         expand_size: List[int] = [-1] + [1] * (len(p.shape) - 1)
-        for view_func in (self.channel_view, self.layer_view):
-            cosine_sim = self.cosine_similarity(grad, p, eps, view_func)
+        for view_func in (channel_view, layer_view):
+            cosine_sim = cosine_similarity_by_view(grad, p, eps, view_func)

             if cosine_sim.max() < delta / math.sqrt(view_func(p).size()[1]):
                 p_n = p / view_func(p).norm(dim=1).view(expand_size).add_(eps)
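For context on the two views used in the projection check: channel_view keeps the leading output-channel dimension and flattens the rest, while layer_view flattens the whole tensor into a single row, so the cosine-similarity test can be evaluated per channel or for the layer as a whole. A minimal shape sketch (the tensor shape below is only an illustrative example, not taken from this repo):

import torch

p = torch.randn(8, 3, 3, 3)           # hypothetical conv weight with 8 output channels
print(p.view(p.size()[0], -1).shape)   # channel_view: torch.Size([8, 27]), one row per output channel
print(p.view(1, -1).shape)             # layer_view:   torch.Size([1, 216]), one row for the whole layer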

pytorch_optimizer/sgdp.py

Lines changed: 4 additions & 23 deletions
@@ -1,12 +1,12 @@
 import math
-from typing import Callable, List, Tuple
+from typing import List, Tuple

 import torch
-from torch.nn import functional as F
 from torch.optim.optimizer import Optimizer

 from pytorch_optimizer.base_optimizer import BaseOptimizer
 from pytorch_optimizer.types import CLOSURE, DEFAULTS, LOSS, PARAMETERS
+from pytorch_optimizer.utils import channel_view, cosine_similarity_by_view, layer_view


 class SGDP(Optimizer, BaseOptimizer):
@@ -74,25 +74,6 @@ def validate_parameters(self):
         self.validate_weight_decay_ratio(self.wd_ratio)
         self.validate_epsilon(self.eps)

-    @staticmethod
-    def channel_view(x: torch.Tensor) -> torch.Tensor:
-        return x.view(x.size()[0], -1)
-
-    @staticmethod
-    def layer_view(x: torch.Tensor) -> torch.Tensor:
-        return x.view(1, -1)
-
-    @staticmethod
-    def cosine_similarity(
-        x: torch.Tensor,
-        y: torch.Tensor,
-        eps: float,
-        view_func: Callable[[torch.Tensor], torch.Tensor],
-    ):
-        x = view_func(x)
-        y = view_func(y)
-        return F.cosine_similarity(x, y, dim=1, eps=eps).abs_()
-
     def projection(
         self,
         p,
@@ -104,8 +85,8 @@ def projection(
     ) -> Tuple[torch.Tensor, float]:
         wd: float = 1.0
         expand_size: List[int] = [-1] + [1] * (len(p.shape) - 1)
-        for view_func in (self.channel_view, self.layer_view):
-            cosine_sim = self.cosine_similarity(grad, p, eps, view_func)
+        for view_func in (channel_view, layer_view):
+            cosine_sim = cosine_similarity_by_view(grad, p, eps, view_func)

             if cosine_sim.max() < delta / math.sqrt(view_func(p).size()[1]):
                 p_n = p / view_func(p).norm(dim=1).view(expand_size).add_(eps)

pytorch_optimizer/utils.py

Lines changed: 21 additions & 1 deletion
@@ -1,10 +1,11 @@
 import math
-from typing import List, Optional, Tuple, Union
+from typing import Callable, List, Optional, Tuple, Union

 import numpy as np
 import torch
 from torch import nn
 from torch.distributed import all_reduce
+from torch.nn import functional as F
 from torch.nn.utils import clip_grad_norm_

 from pytorch_optimizer.types import PARAMETERS
@@ -50,6 +51,25 @@ def un_flatten_grad(grads: torch.Tensor, shapes: List[int]) -> List[torch.Tensor]:
     return un_flatten_grad


+def channel_view(x: torch.Tensor) -> torch.Tensor:
+    return x.view(x.size()[0], -1)
+
+
+def layer_view(x: torch.Tensor) -> torch.Tensor:
+    return x.view(1, -1)
+
+
+def cosine_similarity_by_view(
+    x: torch.Tensor,
+    y: torch.Tensor,
+    eps: float,
+    view_func: Callable[[torch.Tensor], torch.Tensor],
+) -> torch.Tensor:
+    x = view_func(x)
+    y = view_func(y)
+    return F.cosine_similarity(x, y, dim=1, eps=eps).abs_()
+
+
 def clip_grad_norm(parameters: PARAMETERS, max_norm: float = 0, sync: bool = False) -> Union[torch.Tensor, float]:
     """Clips grad norms.
     During combination with FSDP, will also ensure that grad norms are aggregated
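With the helpers consolidated in pytorch_optimizer.utils, AdamP and SGDP call them identically as plain functions. A minimal usage sketch mirroring the projection check shown in the diffs above (the tensor shapes and the eps and delta values are placeholder assumptions, not defaults from this repo):

import math
import torch
from pytorch_optimizer.utils import channel_view, cosine_similarity_by_view, layer_view

p = torch.randn(8, 3, 3, 3)      # stand-in parameter tensor
grad = torch.randn_like(p)       # stand-in gradient
eps, delta = 1e-8, 0.1           # placeholder values

for view_func in (channel_view, layer_view):
    cosine_sim = cosine_similarity_by_view(grad, p, eps, view_func)
    # The optimizers only project when grad and p are nearly orthogonal under this view.
    if cosine_sim.max() < delta / math.sqrt(view_func(p).size()[1]):
        print(f"{view_func.__name__}: projection branch would fire")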
