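Update SM version guard: move the check from tsyrk_ex into newton_schulz and skip the Triton SYRK tests on unsupported GPUs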

skyw · skyw · commit c8f80aec3028 · 2025-10-08T20:40:26.000-07:00
Signed-off-by: Hao Wu <skyw@nvidia.com>
diff --git a/emerging_optimizers/orthogonalized_optimizers/muon_utils.py b/emerging_optimizers/orthogonalized_optimizers/muon_utils.py
@@ -146,7 +146,12 @@ def newton_schulz(
         X = X.to(torch.bfloat16)
         logging.log_first_n(logging.INFO, "Using BF16 I/O kernels for Newton-Schulz iteration.", 1)
         if use_syrk:
-            ns_step_fn = newton_schulz_step_tsyrk
+            sm_version = torch.cuda.get_device_capability()
+            if sm_version in ((8, 0), (9, 0), (10, 0), (11, 0)):
+                logging.log_first_n(
+                    logging.INFO, f"Using Triton SYRK kernels for Newton-Schulz iteration on SM {sm_version}.", 1
+                )
+                ns_step_fn = newton_schulz_step_tsyrk
 
     for i in range(steps):
         a, b, c = coefficient_sets[i % len(coefficient_sets)]
diff --git a/emerging_optimizers/triton_kernels/syrk.py b/emerging_optimizers/triton_kernels/syrk.py
@@ -315,10 +315,6 @@ def tsyrk_ex(
     Returns:
         Output tensor of shape (N, N)
     """
-    sm_version = torch.cuda.get_device_capability()
-    assert sm_version in ((8, 0), (9, 0), (10, 0), (11, 0)), (
-        f"Correctness of Triton kernel on SM {sm_version} can not be guaranteed."
-    )
     assert a.dtype == torch.bfloat16, "Input tensor must be bfloat16"
     assert a.dim() == 2, "Input tensor must be 2D"
     assert a.is_contiguous() or a.T.is_contiguous(), "invalid input tensor layout. a or a.T must be contiguous."
diff --git a/tests/test_muon_utils.py b/tests/test_muon_utils.py
@@ -22,6 +22,9 @@
 from emerging_optimizers.orthogonalized_optimizers import muon, muon_utils
 
 
+_SM_VERSION = torch.cuda.get_device_capability() if torch.cuda.is_available() else None
+
+
 def newton_schulz_ref(x: torch.Tensor, coefficient_sets: list[tuple[float, float, float]]) -> torch.Tensor:
     """Reference Newton-Schulz iteration to compute the zeroth power / orthogonalization of x."""
     # Muon is not for 1d parameters
@@ -208,14 +211,11 @@ def test_qkv_split_shapes_validation(self):
         self.assertIn("tuple of 3 integers", str(cm.exception))
 
 
+@absltest.skipIf(
+    _SM_VERSION is None or _SM_VERSION not in ((8, 0), (9, 0), (10, 0), (11, 0)),
+    f"Correctness of Triton kernel on SM {_SM_VERSION} cannot be guaranteed.",
+)
 class TestNewtonSchulzStepWithTsyrk(parameterized.TestCase):
-    def setUp(self):
-        self.prev_precision = torch.get_float32_matmul_precision()
-        torch.set_float32_matmul_precision("highest")
-
-    def tearDown(self):
-        torch.set_float32_matmul_precision(self.prev_precision)
-
     @parameterized.parameters(
         (32, 32),
         (32, 64),