Skip to content

Commit 27db169

Browse files
committed
update: p to grad
1 parent 64f3412 commit 27db169

File tree

11 files changed

+22
-43
lines changed

11 files changed

+22
-43
lines changed

pytorch_optimizer/optimizer/sgd.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -389,7 +389,7 @@ def step(self, closure: CLOSURE = None) -> LOSS:
389389
state = self.state[p]
390390
if momentum > 0.0:
391391
if len(state) == 0:
392-
state['momentum_buffer'] = torch.zeros_like(p)
392+
state['momentum_buffer'] = torch.zeros_like(grad)
393393

394394
buf = state['momentum_buffer']
395395
buf.mul_(momentum).add_(grad, alpha=1.0 - momentum)

pytorch_optimizer/optimizer/sgdp.py

Lines changed: 2 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -62,11 +62,7 @@ def __str__(self) -> str:
6262

6363
@torch.no_grad()
6464
def reset(self):
65-
for group in self.param_groups:
66-
for p in group['params']:
67-
state = self.state[p]
68-
69-
state['momentum'] = torch.zeros_like(p)
65+
pass
7066

7167
@torch.no_grad()
7268
def step(self, closure: CLOSURE = None) -> LOSS:
@@ -87,7 +83,7 @@ def step(self, closure: CLOSURE = None) -> LOSS:
8783

8884
state = self.state[p]
8985
if len(state) == 0:
90-
state['momentum'] = torch.zeros_like(p)
86+
state['momentum'] = torch.zeros_like(grad)
9187

9288
buf = state['momentum']
9389
buf.mul_(momentum).add_(grad, alpha=1.0 - group['dampening'])

pytorch_optimizer/optimizer/shampoo.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -303,7 +303,7 @@ def step(self, closure: CLOSURE = None) -> LOSS:
303303

304304
state = self.state[p]
305305
if len(state) == 0:
306-
state['momentum'] = torch.zeros_like(p)
306+
state['momentum'] = torch.zeros_like(grad)
307307
state['pre_conditioner'] = PreConditioner(
308308
p,
309309
beta2,

pytorch_optimizer/optimizer/sm3.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -91,12 +91,12 @@ def step(self, closure: CLOSURE = None) -> LOSS:
9191
state = self.state[p]
9292
if len(state) == 0:
9393
state['step'] = 0
94-
state['momentum_buffer'] = torch.zeros_like(p)
94+
state['momentum_buffer'] = torch.zeros_like(grad)
9595

9696
if grad.is_sparse:
9797
state['accumulator_0'] = torch.zeros(shape[0], dtype=grad.dtype, device=grad.device)
9898
elif rank == 0:
99-
state['accumulator_0'] = torch.zeros_like(p)
99+
state['accumulator_0'] = torch.zeros_like(grad)
100100
else:
101101
for i in range(rank):
102102
state[f'accumulator_{i}'] = torch.zeros(

pytorch_optimizer/optimizer/soap.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -161,7 +161,7 @@ def get_orthogonal_matrix_qr(self, state, max_precondition_dim: int = 10000, mer
161161
# Compute QR decomposition
162162
# We cast to float32 because:
163163
# - torch.linalg.qr does not have support for types like bfloat16 as of PyTorch 2.5.1
164-
# - the correctness / numerical stability of the Q orthogonalization is important for the stability
164+
# - the correctness / numerical stability of the Q orthogonality is important for the stability
165165
# of the optimizer
166166
q, _ = torch.linalg.qr(power_iter.to(torch.float32))
167167
q = q.to(power_iter.dtype)

pytorch_optimizer/optimizer/sophia.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -113,8 +113,8 @@ def step(self, closure: CLOSURE = None, hessian: Optional[List[torch.Tensor]] =
113113

114114
state = self.state[p]
115115
if len(state) == 0:
116-
state['momentum'] = torch.zeros_like(p)
117-
state['hessian_moment'] = torch.zeros_like(p)
116+
state['momentum'] = torch.zeros_like(grad)
117+
state['hessian_moment'] = torch.zeros_like(grad)
118118

119119
self.apply_weight_decay(
120120
p=p,

pytorch_optimizer/optimizer/srmm.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -72,8 +72,8 @@ def step(self, closure: CLOSURE = None) -> LOSS:
7272

7373
state = self.state[p]
7474
if len(state) == 0:
75-
state['mov_avg_grad'] = torch.zeros_like(p)
76-
state['mov_avg_param'] = torch.zeros_like(p)
75+
state['mov_avg_grad'] = torch.zeros_like(grad)
76+
state['mov_avg_param'] = torch.zeros_like(grad)
7777

7878
mov_avg_grad, mov_avg_param = state['mov_avg_grad'], state['mov_avg_param']
7979

pytorch_optimizer/optimizer/swats.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -110,17 +110,17 @@ def step(self, closure: CLOSURE = None) -> LOSS:
110110
state = self.state[p]
111111

112112
if len(state) == 0:
113-
state['exp_avg'] = torch.zeros_like(p)
114-
state['exp_avg_sq'] = torch.zeros_like(p)
113+
state['exp_avg'] = torch.zeros_like(grad)
114+
state['exp_avg_sq'] = torch.zeros_like(grad)
115115
state['exp_avg2'] = torch.zeros((1,), dtype=grad.dtype, device=grad.device)
116116
if group['ams_bound']:
117-
state['max_exp_avg_sq'] = torch.zeros_like(p)
117+
state['max_exp_avg_sq'] = torch.zeros_like(grad)
118118
if group['adanorm']:
119119
state['exp_grad_norm'] = torch.zeros((1,), dtype=grad.dtype, device=grad.device)
120120

121121
self.apply_weight_decay(
122122
p=p,
123-
grad=p.grad,
123+
grad=grad,
124124
lr=group['lr'],
125125
weight_decay=group['weight_decay'],
126126
weight_decouple=group['weight_decouple'],

pytorch_optimizer/optimizer/tam.py

Lines changed: 2 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -54,13 +54,7 @@ def __str__(self) -> str:
5454

5555
@torch.no_grad()
5656
def reset(self):
57-
for group in self.param_groups:
58-
group['step'] = 0
59-
for p in group['params']:
60-
state = self.state[p]
61-
62-
state['s'] = torch.zeros_like(p)
63-
state['momentum_buffer'] = torch.zeros_like(p)
57+
pass
6458

6559
@torch.no_grad()
6660
def step(self, closure: CLOSURE = None) -> LOSS:
@@ -157,14 +151,7 @@ def __str__(self) -> str:
157151

158152
@torch.no_grad()
159153
def reset(self):
160-
for group in self.param_groups:
161-
group['step'] = 0
162-
for p in group['params']:
163-
state = self.state[p]
164-
165-
state['s'] = torch.zeros_like(p)
166-
state['exp_avg'] = torch.zeros_like(p)
167-
state['exp_avg_sq'] = torch.zeros_like(p)
154+
pass
168155

169156
@torch.no_grad()
170157
def step(self, closure: CLOSURE = None) -> LOSS:

pytorch_optimizer/optimizer/tiger.py

Lines changed: 2 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -45,11 +45,7 @@ def __str__(self) -> str:
4545

4646
@torch.no_grad()
4747
def reset(self):
48-
for group in self.param_groups:
49-
for p in group['params']:
50-
state = self.state[p]
51-
52-
state['exp_avg'] = torch.zeros_like(p)
48+
pass
5349

5450
@torch.no_grad()
5551
def step(self, closure: CLOSURE = None) -> LOSS:
@@ -71,7 +67,7 @@ def step(self, closure: CLOSURE = None) -> LOSS:
7167
state = self.state[p]
7268

7369
if len(state) == 0:
74-
state['exp_avg'] = torch.zeros_like(p)
70+
state['exp_avg'] = torch.zeros_like(grad)
7571

7672
self.apply_weight_decay(
7773
p=p,

0 commit comments

Comments (0)