@@ -187,6 +187,27 @@ def test_optim_default_dtype_bf16(self, optim_name, device):
         finally:
             torch.set_default_dtype(old_dtype)
 
+    @parametrize("optim_name", ["Adam8bit", "Adam4bit", "AdamFp8"])
+    @parametrize("device", _DEVICES)
+    def test_param_groups(self, optim_name, device):
+        if optim_name.endswith("Fp8") and device == "cuda":
+            if torch.cuda.get_device_capability() < (8, 9):
+                pytest.skip("FP8 CUDA requires compute capability >= 8.9")
+
+        model = nn.Sequential(nn.Linear(32, 256), nn.ReLU(), nn.Linear(256, 32))
+        model.to(device=device)
+        param_groups = [
+            dict(params=list(model[0].parameters()), lr=1e-4),
+            dict(params=list(model[2].parameters()), lr=1e-5),
+        ]
+        optimizer = getattr(optim, optim_name)(param_groups)
+
+        x = torch.randn(4, 32, device=device)
+        loss = model(x).sum()
+        loss.backward()
+        optimizer.step()
+        optimizer.zero_grad()
+
     # aten.slice is required for dcp.load() when world size changes i.e. re-sharding
     # however, it's cumbersome to test it directly, since we would need to run distributed
     # test 2 times with different world size, and persist checkpoint across the 2 runs.