update orth optimizer tests

skyw · skyw · commit fb57fe53296b · 2025-11-05T15:41:06.000-08:00
Signed-off-by: Hao Wu <skyw@nvidia.com>
diff --git a/pyproject.toml b/pyproject.toml
@@ -178,9 +178,12 @@ source = ["emerging_optimizers/", "/workspace/emerging_optimizers"]
 [tool.coverage.report]
 exclude_lines = [
     "raise ValueError",
-    "except ImportError"
+    "except ImportError",
 ]
 exclude_also = [
-    "@triton"
+    "@triton",
+    ".*sm_version",
+    "if closure",
+    "loss = closure"
 ]
 
diff --git a/tests/test_orthogonalized_optimizer.py b/tests/test_orthogonalized_optimizer.py
@@ -22,6 +22,29 @@
 
 
 class OrthogonalizedOptimizerTest(parameterized.TestCase):
+    @parameterized.product(
+        use_independent_wd=[True, False],
+        use_decoupled_wd=[True, False],
+        shape=[(5, 7), (33, 65), (127, 257)],
+        use_nesterov=[True, False],
+        fp32_matmul_prec=["highest", "medium", "low"],
+    )
+    def test_smoke(self, use_independent_wd, use_decoupled_wd, shape, use_nesterov, fp32_matmul_prec) -> None:
+        test_param = nn.Parameter(torch.randint(-5, 5, shape, dtype=torch.float32, device="cuda"))
+        test_param.grad = torch.randint_like(test_param, -5, 5)
+
+        orthogonalized_opt = OrthogonalizedOptimizer(
+            [test_param],
+            lr=2,
+            momentum_beta=0,
+            weight_decay=0.5,
+            use_nesterov=use_nesterov,
+            use_decoupled_wd=use_decoupled_wd,
+            use_independent_wd=use_independent_wd,
+            fp32_matmul_prec=fp32_matmul_prec,
+        )
+        orthogonalized_opt.step()
+
     @parameterized.parameters(
         {"shape": (5, 7)},
         {"shape": (33, 65)},
@@ -195,23 +218,22 @@ def test_use_independent_wd(self) -> None:
 
         # Test with independent weight decay: with lr=0, weight decay should still be applied
         # With lr=0, no gradient update occurs, so param should be exactly (1-wd)*param
-        indep_param = nn.Parameter(torch.randint(-5, 5, shape, dtype=torch.float32, device="cuda"))
-        indep_param_initial = indep_param.data.clone()
-        indep_param.grad = torch.randint_like(indep_param, -5, 5)
+        test_param = nn.Parameter(torch.randint(-5, 5, shape, dtype=torch.float32, device="cuda"))
+        test_param.grad = torch.randint_like(test_param, -5, 5)
+        # With independent weight decay and lr=0, param should be exactly (1-wd)*param
+        expected_param = (1 - weight_decay) * test_param.data
 
         muon_opt_indep = muon.Muon(
-            [indep_param],
+            [test_param],
             lr=0.0,  # Zero learning rate
             weight_decay=weight_decay,
             use_independent_wd=True,
             momentum_beta=0.0,
         )
         muon_opt_indep.step()
 
-        # With independent weight decay and lr=0, param should be exactly (1-wd)*param
-        expected_param = (1 - weight_decay) * indep_param_initial
         torch.testing.assert_close(
-            indep_param.data,
+            test_param,
             expected_param,
             atol=0,
             rtol=0,

Original file line number	Diff line number	Diff line change
`@@ -178,9 +178,12 @@ source = ["emerging_optimizers/", "/workspace/emerging_optimizers"]`
`178`	`178`	`[tool.coverage.report]`
`179`	`179`	`exclude_lines = [`
`180`	`180`	`"raise ValueError",`
`181`		`- "except ImportError"`
	`181`	`+ "except ImportError",`
`182`	`182`	`]`
`183`	`183`	`exclude_also = [`
`184`		`- "@triton"`
	`184`	`+ "@triton",`
	`185`	`+ ".*sm_version",`
	`186`	`+ "if closure",`
	`187`	`+ "loss = closure"`
`185`	`188`	`]`
`186`	`189`