
Commit 8f5892c

refactor: PCGrad
1 parent 9737e2b commit 8f5892c

2 files changed: 19 additions, 18 deletions

pytorch_optimizer/pcgrad.py

Lines changed: 4 additions & 18 deletions
@@ -2,12 +2,12 @@
 from copy import deepcopy
 from typing import Iterable, List, Tuple

-import numpy as np
 import torch
 from torch import nn
 from torch.optim.optimizer import Optimizer

 from pytorch_optimizer.base_optimizer import BaseOptimizer
+from pytorch_optimizer.utils import flatten_grad, un_flatten_grad


 class PCGrad(BaseOptimizer):
@@ -41,20 +41,6 @@ def validate_parameters(self):
     def reset(self):
         pass

-    @staticmethod
-    def flatten_grad(grads: List[torch.Tensor]) -> torch.Tensor:
-        return torch.cat([g.flatten() for g in grads])
-
-    @staticmethod
-    def un_flatten_grad(grads: torch.Tensor, shapes: List[int]) -> List[torch.Tensor]:
-        idx: int = 0
-        un_flatten_grad: List[torch.Tensor] = []
-        for shape in shapes:
-            length = np.prod(shape)
-            un_flatten_grad.append(grads[idx : idx + length].view(shape).clone())
-            idx += length
-        return un_flatten_grad
-
     def zero_grad(self):
         return self.optimizer.zero_grad(set_to_none=True)

@@ -97,8 +83,8 @@ def pack_grad(self, objectives: Iterable) -> Tuple[List[torch.Tensor], List[List

             grad, shape, has_grad = self.retrieve_grad()

-            grads.append(self.flatten_grad(grad))
-            has_grads.append(self.flatten_grad(has_grad))
+            grads.append(flatten_grad(grad))
+            has_grads.append(flatten_grad(has_grad))
             shapes.append(shape)

         return grads, shapes, has_grads
@@ -136,6 +122,6 @@ def pc_backward(self, objectives: Iterable[nn.Module]):
         """
         grads, shapes, has_grads = self.pack_grad(objectives)
         pc_grad = self.project_conflicting(grads, has_grads)
-        pc_grad = self.un_flatten_grad(pc_grad, shapes[0])
+        pc_grad = un_flatten_grad(pc_grad, shapes[0])

         self.set_grad(pc_grad)
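
A minimal usage sketch of the class after this refactor, for orientation. Only zero_grad and pc_backward appear in the hunks above; the constructor call and the final step() delegation are assumptions about the surrounding class (consistent with the self.optimizer calls visible in the diff), and the model and losses are purely illustrative.

import torch
from torch import nn

from pytorch_optimizer import PCGrad  # assumed top-level export

model = nn.Linear(8, 2)
# Assumption: PCGrad wraps a base optimizer, matching the self.optimizer calls above.
optimizer = PCGrad(torch.optim.SGD(model.parameters(), lr=1e-2))

x, y = torch.randn(16, 8), torch.randn(16, 2)
y_pred = model(x)

# Two objectives on the same prediction; pc_backward packs and projects their
# gradients before writing the result back to the parameters.
loss1 = nn.functional.mse_loss(y_pred, y)
loss2 = nn.functional.l1_loss(y_pred, y)

optimizer.zero_grad()
optimizer.pc_backward([loss1, loss2])
optimizer.step()  # assumption: step() delegates to the wrapped optimizer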

pytorch_optimizer/utils.py

Lines changed: 15 additions & 0 deletions
@@ -1,6 +1,7 @@
 import math
 from typing import List, Optional, Tuple, Union

+import numpy as np
 import torch
 from torch import nn
 from torch.distributed import all_reduce
@@ -35,6 +36,20 @@ def normalize_gradient(x: torch.Tensor, use_channels: bool = False, epsilon: flo
     return x


+def flatten_grad(grads: List[torch.Tensor]) -> torch.Tensor:
+    return torch.cat([g.flatten() for g in grads])
+
+
+def un_flatten_grad(grads: torch.Tensor, shapes: List[int]) -> List[torch.Tensor]:
+    idx: int = 0
+    un_flatten_grad: List[torch.Tensor] = []
+    for shape in shapes:
+        length = np.prod(shape)
+        un_flatten_grad.append(grads[idx : idx + length].view(shape).clone())
+        idx += length
+    return un_flatten_grad
+
+
 def clip_grad_norm(parameters: PARAMETERS, max_norm: float = 0, sync: bool = False) -> Union[torch.Tensor, float]:
     """Clips grad norms.
     During combination with FSDP, will also ensure that grad norms are aggregated
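
A quick round-trip check of the two helpers in their new home, pytorch_optimizer.utils. The tensors here only stand in for the per-parameter gradients that PCGrad.retrieve_grad collects; flatten_grad concatenates them into one 1-D tensor and un_flatten_grad restores the original shapes.

import torch

from pytorch_optimizer.utils import flatten_grad, un_flatten_grad

# Illustrative per-parameter gradients with different shapes.
grads = [torch.randn(3, 4), torch.randn(5)]
shapes = [g.shape for g in grads]

flat = flatten_grad(grads)                 # one 1-D tensor of length 3 * 4 + 5 = 17
restored = un_flatten_grad(flat, shapes)   # tensors with the original shapes

assert flat.numel() == 17
assert all(torch.equal(a, b) for a, b in zip(grads, restored))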
