Merge pull request #132 from kozistr/fix/device

kozistr · web-flow · commit be0351d0a99b · 2023-04-22T21:13:31.000+09:00
[Fix] variables are not located on the same device as the gradients
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pytorch_optimizer"
-version = "2.6.0"
+version = "2.6.1"
 description = "optimizer & lr scheduler implementations in PyTorch with clean-code, strict types. Also, including useful optimization ideas."
 license = "Apache-2.0"
 authors = ["kozistr <kozistr@gmail.com>"]
diff --git a/pytorch_optimizer/optimizer/adafactor.py b/pytorch_optimizer/optimizer/adafactor.py
@@ -81,8 +81,8 @@ def reset(self):
                 state['exp_avg'] = torch.zeros_like(p)
 
                 if factored:
-                    state['exp_avg_sq_row'] = torch.zeros(grad_shape[:-1], dtype=grad.dtype)
-                    state['exp_avg_sq_col'] = torch.zeros(grad_shape[:-2] + grad_shape[-1:], dtype=grad.dtype)
+                    state['exp_avg_sq_row'] = torch.zeros(grad_shape[:-1]).to(grad)
+                    state['exp_avg_sq_col'] = torch.zeros(grad_shape[:-2] + grad_shape[-1:]).to(grad)
                 else:
                     state['exp_avg_sq'] = torch.zeros_like(grad)
 
@@ -114,8 +114,8 @@ def approximate_sq_grad(
         exp_avg_sq_col: torch.Tensor,
         output: torch.Tensor,
     ):
-        r"""Get approximate squared gradient."""
-        r_factor: torch.Tensor = (exp_avg_sq_row / exp_avg_sq_row.mean(dim=-1)).rsqrt_().unsqueeze(-1)
+        r"""Get approximation of EMA of squared gradient."""
+        r_factor: torch.Tensor = (exp_avg_sq_row / exp_avg_sq_row.mean(dim=-1, keepdim=True)).rsqrt_().unsqueeze(-1)
         c_factor: torch.Tensor = exp_avg_sq_col.unsqueeze(-2).rsqrt()
         torch.mul(r_factor, c_factor, out=output)
 
@@ -145,8 +145,10 @@ def step(self, closure: CLOSURE = None) -> LOSS:
                     state['exp_avg'] = torch.zeros_like(p)
 
                     if factored:
-                        state['exp_avg_sq_row'] = torch.zeros(grad_shape[:-1], dtype=grad.dtype)
-                        state['exp_avg_sq_col'] = torch.zeros(grad_shape[:-2] + grad_shape[-1:], dtype=grad.dtype)
+                        state['exp_avg_sq_row'] = torch.zeros(grad_shape[:-1], dtype=grad.dtype, device=grad.device)
+                        state['exp_avg_sq_col'] = torch.zeros(
+                            grad_shape[:-2] + grad_shape[-1:], dtype=grad.dtype, device=grad.device
+                        )
                     else:
                         state['exp_avg_sq'] = torch.zeros_like(grad)
 
@@ -166,11 +168,9 @@ def step(self, closure: CLOSURE = None) -> LOSS:
                     exp_avg_sq_row.mul_(beta2_t).add_(update.mean(dim=-1), alpha=1.0 - beta2_t)
                     exp_avg_sq_col.mul_(beta2_t).add_(update.mean(dim=-2), alpha=1.0 - beta2_t)
 
-                    # Approximation of exponential moving average of square of gradient
-                    self.approximate_sq_grad(exp_avg_sq_row, exp_avg_sq_col, update)
+                    self.approximate_sq_grad(exp_avg_sq_row, exp_avg_sq_col, output=update)
                 else:
                     exp_avg_sq = state['exp_avg_sq']
-
                     exp_avg_sq.mul_(beta2_t).add_(update, alpha=1.0 - beta2_t)
                     torch.rsqrt(exp_avg_sq, out=update)
 
diff --git a/pytorch_optimizer/optimizer/sm3.py b/pytorch_optimizer/optimizer/sm3.py
@@ -98,12 +98,14 @@ def step(self, closure: CLOSURE = None) -> LOSS:
                     state['momentum_buffer'] = torch.zeros_like(p)
 
                     if grad.is_sparse:
-                        state['accumulator_0'] = torch.zeros(shape[0])
+                        state['accumulator_0'] = torch.zeros(shape[0], dtype=grad.dtype, device=grad.device)
                     elif rank == 0:
-                        state['accumulator_0'] = torch.zeros(shape)
+                        state['accumulator_0'] = torch.zeros_like(p)
                     else:
                         for i in range(rank):
-                            state[f'accumulator_{i}'] = torch.zeros([1] * i + [shape[i]] + [1] * (rank - 1 - i))
+                            state[f'accumulator_{i}'] = torch.zeros(
+                                [1] * i + [shape[i]] + [1] * (rank - 1 - i), dtype=grad.dtype, device=grad.device
+                            )
 
                 state['step'] += 1