update: initialize optimizer state tensors with the gradient's dtype and device (Adafactor, SM3)

kozistr · kozistr · commit 3172b69aa710 · 2023-04-22T21:07:45.000+09:00
diff --git a/pytorch_optimizer/optimizer/adafactor.py b/pytorch_optimizer/optimizer/adafactor.py
@@ -145,8 +145,10 @@ def step(self, closure: CLOSURE = None) -> LOSS:
                     state['exp_avg'] = torch.zeros_like(p)
 
                     if factored:
-                        state['exp_avg_sq_row'] = torch.zeros(grad_shape[:-1]).to(grad)
-                        state['exp_avg_sq_col'] = torch.zeros(grad_shape[:-2] + grad_shape[-1:]).to(grad)
+                        state['exp_avg_sq_row'] = torch.zeros(grad_shape[:-1], dtype=grad.dtype, device=grad.device)
+                        state['exp_avg_sq_col'] = torch.zeros(
+                            grad_shape[:-2] + grad_shape[-1:], dtype=grad.dtype, device=grad.device
+                        )
                     else:
                         state['exp_avg_sq'] = torch.zeros_like(grad)
 
diff --git a/pytorch_optimizer/optimizer/sm3.py b/pytorch_optimizer/optimizer/sm3.py
@@ -98,13 +98,13 @@ def step(self, closure: CLOSURE = None) -> LOSS:
                     state['momentum_buffer'] = torch.zeros_like(p)
 
                     if grad.is_sparse:
-                        state['accumulator_0'] = torch.zeros(shape[0], device=grad.device)
+                        state['accumulator_0'] = torch.zeros(shape[0], dtype=grad.dtype, device=grad.device)
                     elif rank == 0:
                         state['accumulator_0'] = torch.zeros_like(p)
                     else:
                         for i in range(rank):
                             state[f'accumulator_{i}'] = torch.zeros(
-                                [1] * i + [shape[i]] + [1] * (rank - 1 - i), device=grad.device
+                                [1] * i + [shape[i]] + [1] * (rank - 1 - i), dtype=grad.dtype, device=grad.device
                             )
 
                 state['step'] += 1