Commit d5f5d0c

Merge pull request #32 from kozistr/feature/pc-grad
[Feature] Implement PCGrad
2 parents 4d328fa + 7bcc873 commit d5f5d0c

File tree

3 files changed: +167 -12 lines


README.rst

Lines changed: 27 additions & 11 deletions
@@ -73,17 +73,17 @@ of the ideas are applied in ``Ranger21`` optimizer.
 
 Also, most of the captures are taken from ``Ranger21`` paper.
 
-+------------------------------------------+-------------------------------------+--------------------------------------------+
-| `Adaptive Gradient Clipping`_            | `Gradient Centralization`_          | `Softplus Transformation`_                 |
-+------------------------------------------+-------------------------------------+--------------------------------------------+
-| `Gradient Normalization`_                | `Norm Loss`_                        | `Positive-Negative Momentum`_              |
-+------------------------------------------+-------------------------------------+--------------------------------------------+
-| `Linear learning rate warmup`_           | `Stable weight decay`_              | `Explore-exploit learning rate schedule`_  |
-+------------------------------------------+-------------------------------------+--------------------------------------------+
-| `Lookahead`_                             | `Chebyshev learning rate schedule`_ | `(Adaptive) Sharpness-Aware Minimization`_ |
-+------------------------------------------+-------------------------------------+--------------------------------------------+
-| `On the Convergence of Adam and Beyond`_ |                                     |                                            |
-+------------------------------------------+-------------------------------------+--------------------------------------------+
++------------------------------------------+---------------------------------------------+--------------------------------------------+
+| `Adaptive Gradient Clipping`_            | `Gradient Centralization`_                  | `Softplus Transformation`_                 |
++------------------------------------------+---------------------------------------------+--------------------------------------------+
+| `Gradient Normalization`_                | `Norm Loss`_                                | `Positive-Negative Momentum`_              |
++------------------------------------------+---------------------------------------------+--------------------------------------------+
+| `Linear learning rate warmup`_           | `Stable weight decay`_                      | `Explore-exploit learning rate schedule`_  |
++------------------------------------------+---------------------------------------------+--------------------------------------------+
+| `Lookahead`_                             | `Chebyshev learning rate schedule`_         | `(Adaptive) Sharpness-Aware Minimization`_ |
++------------------------------------------+---------------------------------------------+--------------------------------------------+
+| `On the Convergence of Adam and Beyond`_ | `Gradient Surgery for Multi-Task Learning`_ |                                            |
++------------------------------------------+---------------------------------------------+--------------------------------------------+
 
 Adaptive Gradient Clipping
 --------------------------
@@ -195,6 +195,11 @@ On the Convergence of Adam and Beyond
 
 - paper : `paper <https://openreview.net/forum?id=ryQu7f-RZ>`__
 
+Gradient Surgery for Multi-Task Learning
+----------------------------------------
+
+- paper : `paper <https://arxiv.org/abs/2001.06782>`__
+
 Citations
 ---------
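
A note on the section added above: the PCGrad rule from the linked paper projects one task gradient onto the normal plane of another whenever the two conflict (negative inner product). The snippet below is only an illustration of that rule, not part of this commit, and the ``project`` helper is a hypothetical name::

    import torch

    def project(g_i: torch.Tensor, g_j: torch.Tensor) -> torch.Tensor:
        # if g_i conflicts with g_j, remove from g_i the component pointing against g_j
        dot = torch.dot(g_i, g_j)
        if dot < 0:
            g_i = g_i - dot * g_j / (g_j.norm() ** 2)
        return g_i

    # toy check with two conflicting 2-D gradients
    g1, g2 = torch.tensor([1.0, 0.0]), torch.tensor([-1.0, 1.0])
    print(project(g1, g2))                 # tensor([0.5000, 0.5000])
    print(torch.dot(project(g1, g2), g2))  # tensor(0.) : the conflict is removed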

@@ -430,6 +435,17 @@ On the Convergence of Adam and Beyond
         year={2019}
     }
 
+Gradient Surgery for Multi-Task Learning
+
+::
+
+    @article{yu2020gradient,
+        title={Gradient surgery for multi-task learning},
+        author={Yu, Tianhe and Kumar, Saurabh and Gupta, Abhishek and Levine, Sergey and Hausman, Karol and Finn, Chelsea},
+        journal={arXiv preprint arXiv:2001.06782},
+        year={2020}
+    }
+
 Author
 ------
 
pytorch_optimizer/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -10,10 +10,11 @@
 from pytorch_optimizer.gc import centralize_gradient
 from pytorch_optimizer.lookahead import Lookahead
 from pytorch_optimizer.madgrad import MADGRAD
+from pytorch_optimizer.pcgrad import PCGrad
 from pytorch_optimizer.radam import RAdam
 from pytorch_optimizer.ranger import Ranger
 from pytorch_optimizer.ranger21 import Ranger21
 from pytorch_optimizer.sam import SAM
 from pytorch_optimizer.sgdp import SGDP
 
-__VERSION__ = '0.0.10'
+__VERSION__ = '0.1.0'
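
With this change ``PCGrad`` is re-exported from the package root and the version is bumped to ``0.1.0``. A quick sanity check, assuming the new release is installed::

    import pytorch_optimizer
    from pytorch_optimizer import PCGrad

    print(pytorch_optimizer.__VERSION__)  # '0.1.0'
    print(PCGrad.__module__)              # 'pytorch_optimizer.pcgrad'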

pytorch_optimizer/pcgrad.py

Lines changed: 138 additions & 0 deletions
@@ -0,0 +1,138 @@
import random
from copy import deepcopy
from typing import Iterable, List

import numpy as np
import torch
from torch.optim.optimizer import Optimizer


class PCGrad:
    """
    Reference : https://github.com/WeiChengTseng/Pytorch-PCGrad
    Example :
        from pytorch_optimizer import AdamP, PCGrad
        ...
        model = YourModel()
        optimizer = PCGrad(AdamP(model.parameters()))

        loss1_fn, loss2_fn = nn.L1Loss(), nn.MSELoss()
        ...
        for input, output in data:
            optimizer.zero_grad()
            y_pred = model(input)
            loss1, loss2 = loss1_fn(y_pred, output), loss2_fn(y_pred, output)
            optimizer.pc_backward([loss1, loss2])
            optimizer.step()
    """

    def __init__(self, optimizer: Optimizer, reduction: str = 'mean'):
        self.optimizer = optimizer
        self.reduction = reduction
        self.check_valid_parameters()

    def check_valid_parameters(self):
        if self.reduction not in ('mean', 'sum'):
            raise ValueError(f'invalid reduction : {self.reduction}')

    @staticmethod
    def flatten_grad(grads: List[torch.Tensor]) -> torch.Tensor:
        return torch.cat([g.flatten() for g in grads])

    @staticmethod
    def un_flatten_grad(grads: torch.Tensor, shapes: List[torch.Size]) -> List[torch.Tensor]:
        un_flatten_grad = []
        idx: int = 0
        for shape in shapes:
            length = int(np.prod(shape))
            un_flatten_grad.append(grads[idx : idx + length].view(shape).clone())
            idx += length
        return un_flatten_grad

    def zero_grad(self):
        return self.optimizer.zero_grad(set_to_none=True)

    def step(self):
        return self.optimizer.step()

    def set_grad(self, grads: List[torch.Tensor]):
        idx: int = 0
        for group in self.optimizer.param_groups:
            for p in group['params']:
                p.grad = grads[idx]
                idx += 1

    def pc_backward(self, objectives: Iterable[torch.Tensor]):
        """Calculate the gradient of the parameters
        :param objectives: Iterable[torch.Tensor]. a list of objectives (losses)
        :return:
        """
        grads, shapes, has_grads = self.pack_grad(objectives)
        pc_grad = self.project_conflicting(grads, has_grads)
        pc_grad = self.un_flatten_grad(pc_grad, shapes[0])
        self.set_grad(pc_grad)

    def project_conflicting(self, grads, has_grads) -> torch.Tensor:
        """
        :param grads: a list of the gradients of the parameters, one flattened tensor per objective
        :param has_grads: a list of masks representing whether the parameter has a gradient
        :return:
        """
        # a parameter entry is "shared" if it received a gradient from every objective
        shared = torch.stack(has_grads).prod(0).bool()

        pc_grad = deepcopy(grads)
        for g_i in pc_grad:
            random.shuffle(grads)
            for g_j in grads:
                g_i_g_j = torch.dot(g_i, g_j)
                if g_i_g_j < 0:
                    # conflicting gradients : project g_i onto the normal plane of g_j
                    g_i -= g_i_g_j * g_j / (g_j.norm() ** 2)

        merged_grad = torch.zeros_like(grads[0]).to(grads[0].device)

        # reduce the projected gradients over objectives on the shared entries
        shared_pc_grads = torch.stack([g[shared] for g in pc_grad])
        if self.reduction == 'mean':
            merged_grad[shared] = shared_pc_grads.mean(dim=0)
        else:  # self.reduction == 'sum'
            merged_grad[shared] = shared_pc_grads.sum(dim=0)

        merged_grad[~shared] = torch.stack([g[~shared] for g in pc_grad]).sum(dim=0)

        return merged_grad

    def retrieve_grad(self):
        """Get the gradient of the parameters of the network with specific objective
        :return:
        """
        grad, shape, has_grad = [], [], []
        for group in self.optimizer.param_groups:
            for p in group['params']:
                if p.grad is None:
                    shape.append(p.shape)
                    grad.append(torch.zeros_like(p).to(p.device))
                    has_grad.append(torch.zeros_like(p).to(p.device))
                    continue

                shape.append(p.grad.shape)
                grad.append(p.grad.clone())
                has_grad.append(torch.ones_like(p).to(p.device))

        return grad, shape, has_grad

    def pack_grad(self, objectives: Iterable[torch.Tensor]):
        """Pack the gradient of the parameters of the network for each objective
        :param objectives: Iterable[torch.Tensor]. a list of objectives (losses)
        :return:
        """
        grads, shapes, has_grads = [], [], []
        for objective in objectives:
            self.zero_grad()

            objective.backward(retain_graph=True)

            grad, shape, has_grad = self.retrieve_grad()

            grads.append(self.flatten_grad(grad))
            has_grads.append(self.flatten_grad(has_grad))
            shapes.append(shape)

        return grads, shapes, has_grads
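
For reference, a minimal end-to-end sketch in the spirit of the class docstring: a toy two-output model trained on two objectives, with the wrapped optimizer stepping on the surgically combined gradients. The model, data, and loop are made up for illustration; only ``AdamP``, ``PCGrad``, and ``pc_backward`` come from the package::

    import torch
    from torch import nn

    from pytorch_optimizer import AdamP, PCGrad

    model = nn.Sequential(nn.Linear(4, 16), nn.ReLU(), nn.Linear(16, 2))
    optimizer = PCGrad(AdamP(model.parameters()))

    loss1_fn, loss2_fn = nn.L1Loss(), nn.MSELoss()
    x, y = torch.randn(8, 4), torch.randn(8, 2)  # dummy batch shared by both objectives

    for _ in range(10):
        optimizer.zero_grad()
        y_pred = model(x)
        loss1, loss2 = loss1_fn(y_pred, y), loss2_fn(y_pred, y)
        optimizer.pc_backward([loss1, loss2])  # project conflicting task gradients
        optimizer.step()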
