
Commit 3ef7c75

Merge pull request #367 from kozistr/fix/schedulefree-wrapper

[Fix] ScheduleFreeWrapper

2 parents: aca2ef2 + 9efb597

3 files changed: +17 −11 lines

docs/changelogs/v3.5.0.md
Lines changed: 1 addition & 0 deletions

@@ -25,3 +25,4 @@
 
 * bias_correction2 in ScheduleFreeRAdam optimizer. (#354)
 * potential bug in SPAM optimizer. (#365)
+* initialize the `z` state within the `step()` of the ScheduleFreeWrapper. (#363, #366)

pytorch_optimizer/optimizer/schedulefree.py
Lines changed: 6 additions & 9 deletions
@@ -505,13 +505,7 @@ def __init__(
         self._optimizer_step_post_hooks: Dict[int, Callable] = {}
 
         self.state: STATE = defaultdict(dict)
-
-        for group in self.param_groups:
-            for p in group['params']:
-                state = self.state[p]
-                state['z'] = torch.clone(p)
-
-        self.defaults = self.optimizer.defaults
+        self.defaults: DEFAULTS = self.optimizer.defaults
 
     def __str__(self) -> str:
         return 'ScheduleFree'
@@ -594,6 +588,9 @@ def step(self, closure: CLOSURE = None) -> LOSS:
 
                 state = self.state[p]
 
+                if 'z' not in state:
+                    state['z'] = p.clone()
+
                 z = state['z']
 
                 self.apply_weight_decay(
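The hunk above is the heart of the fix: `z` is no longer cloned eagerly for every parameter in `__init__`, but created on first use inside `step()`. A minimal sketch of this lazy-initialization pattern, using a hypothetical stripped-down wrapper (`LazyWrapper` is illustrative, not the library's full `ScheduleFreeWrapper`):

```python
# Hypothetical, stripped-down illustration of lazy state initialization;
# not the library's actual ScheduleFreeWrapper.
from collections import defaultdict
from typing import Dict

import torch


class LazyWrapper:
    def __init__(self, optimizer: torch.optim.Optimizer) -> None:
        self.optimizer = optimizer
        self.state: Dict[torch.Tensor, Dict[str, torch.Tensor]] = defaultdict(dict)

    @torch.no_grad()
    def step(self) -> None:
        for group in self.optimizer.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue

                state = self.state[p]

                # create `z` on first use: a state restored through
                # load_state_dict() before the first step is preserved, and
                # parameters that never receive gradients are never cloned
                if 'z' not in state:
                    state['z'] = p.clone()

        self.optimizer.step()
```

The same guard is what lets the reworked test below load a backed-up state into a freshly constructed wrapper.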
@@ -633,7 +630,7 @@ def step(self, closure: CLOSURE = None) -> LOSS:
             weight: float = (group['step'] ** group['lr']) * (lr_max ** self.weight_lr_power)  # fmt: skip
             weight_sum = group['weight_sum'] = group.get('weight_sum', 0.0) + weight
 
-            ckeckpoint: float = weight / weight_sum if weight_sum != 0.0 else 0.0
+            checkpoint: float = weight / weight_sum if weight_sum != 0.0 else 0.0
 
             for p in group['params']:
                 if p.grad is None:
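Besides renaming the misspelled `ckeckpoint`, it helps to see what this factor does: each step contributes `weight = step ** lr * lr_max ** weight_lr_power` to a running `weight_sum`, and `checkpoint = weight / weight_sum` is the interpolation fraction used below. A quick numeric sketch with illustrative values (not the library's defaults):

```python
# Illustrative values only: with a small lr, step ** lr stays near 1.0,
# so the per-step weights are nearly equal and checkpoint decays like 1/step.
lr, lr_max, weight_lr_power = 1e-3, 1e-3, 2.0

weight_sum = 0.0
for step in range(1, 4):
    weight = (step ** lr) * (lr_max ** weight_lr_power)
    weight_sum += weight
    checkpoint = weight / weight_sum if weight_sum != 0.0 else 0.0
    print(step, round(checkpoint, 4))  # 1 1.0, 2 0.5002, 3 0.3335
```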
@@ -645,7 +642,7 @@ def step(self, closure: CLOSURE = None) -> LOSS:
 
                 self.swap(z, p)
 
-                p.lerp_(end=z, weight=ckeckpoint)
+                p.lerp_(end=z, weight=checkpoint)
 
                 p.lerp_(end=state['z'], weight=1.0 - self.momentum)
 
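For reference, `Tensor.lerp_(end, weight)` computes `self + weight * (end - self)` in place, so the renamed `checkpoint` is the fraction of `z` blended into `p` (and `1.0 - self.momentum` plays the same role in the second call). A tiny sketch:

```python
import torch

p = torch.zeros(3)
z = torch.ones(3)

# p <- p + 0.25 * (z - p): an in-place convex combination
p.lerp_(end=z, weight=0.25)
print(p)  # tensor([0.2500, 0.2500, 0.2500])
```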
tests/test_optimizers.py
Lines changed: 10 additions & 2 deletions
@@ -1026,9 +1026,17 @@ def test_schedulefree_wrapper():
     _ = optimizer.__getstate__()
     _ = optimizer.param_groups
 
-    optimizer.load_state_dict(optimizer.state_dict())
+    optimizer.step()
+
+    backup_state = optimizer.state_dict()
+
+    optimizer = ScheduleFreeWrapper(load_optimizer('adamw')(model.parameters(), lr=1e-3, weight_decay=1e-3))
+    optimizer.reset()
+    optimizer.zero_grad()
+    optimizer.train()
+
+    optimizer.load_state_dict(backup_state)
 
-    optimizer.optimizer.step()
     optimizer.step()
 
     optimizer.eval()
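Because `z` now only exists after the first `step()`, the reworked test steps once before saving, then restores the backup into a freshly built wrapper. A hedged usage sketch of the same round-trip (assuming the package's public `ScheduleFreeWrapper` export; the model and hyperparameters are placeholders):

```python
import torch
from pytorch_optimizer import ScheduleFreeWrapper

model = torch.nn.Linear(4, 1)
optimizer = ScheduleFreeWrapper(torch.optim.AdamW(model.parameters(), lr=1e-3))
optimizer.train()

model(torch.randn(2, 4)).sum().backward()
optimizer.step()  # `z` is created here, so the state dict below is non-empty

backup = optimizer.state_dict()

# a fresh wrapper starts with empty state; loading the backup restores `z`,
# and the `if 'z' not in state` guard keeps step() from re-cloning it
optimizer = ScheduleFreeWrapper(torch.optim.AdamW(model.parameters(), lr=1e-3))
optimizer.train()
optimizer.load_state_dict(backup)
optimizer.step()
```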
