
Commit e30133a

Disabled TF32 on Ampere+ devices to stabilize numeric accuracy (#2579)
1 parent 25ce595 commit e30133a

File tree

6 files changed: +26 −12 lines


thunder/tests/conftest.py

Lines changed: 8 additions & 0 deletions
@@ -77,3 +77,11 @@ def pytest_collection_modifyitems(items):
 
 def pytest_addoption(parser):
     parser.addoption("--gpu-mem-limit", type=float)
+
+
+@pytest.fixture
+def turn_off_tf32_and_set_seed(monkeypatch):
+    monkeypatch.setenv("NVIDIA_TF32_OVERRIDE", "0")
+    torch.manual_seed(42)
+    yield
+    torch.seed()
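
For context: NVIDIA_TF32_OVERRIDE=0 is the NVIDIA library-level switch that keeps cuBLAS and cuDNN float32 math in full FP32; since the libraries read it when they initialize, the fixture is most reliable when it runs before any CUDA work has touched them. If only PyTorch's own TF32 knobs were needed, a fixture along these lines would be an alternative. This is a sketch, not part of the commit:

import pytest
import torch


@pytest.fixture
def disable_tf32_via_backend_flags():
    # Remember the current settings so the fixture leaves no side effects.
    prev_matmul = torch.backends.cuda.matmul.allow_tf32
    prev_cudnn = torch.backends.cudnn.allow_tf32
    torch.backends.cuda.matmul.allow_tf32 = False  # full-FP32 matmuls
    torch.backends.cudnn.allow_tf32 = False  # full-FP32 convolutions
    torch.manual_seed(42)  # deterministic inputs, as in the fixture above
    yield
    torch.backends.cuda.matmul.allow_tf32 = prev_matmul
    torch.backends.cudnn.allow_tf32 = prev_cudnn
    torch.seed()  # re-seed nondeterministically, mirroring the teardown above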

thunder/tests/distributed/test_tensor_parallel.py

Lines changed: 5 additions & 0 deletions
@@ -131,6 +131,10 @@ def forward(self, x):
             actual=tp_jitted_model.get_parameter("embed.weight").grad,
         )
 
+    # Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
+    # can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
+    # This is expected due to the reduced precision of TF32 matrix multiplications.
+    @pytest.mark.usefixtures("turn_off_tf32_and_set_seed")
     @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="")
     @common_utils.parametrize("bias", (True, False))
     def test_both_column_and_row(self, bias):
@@ -154,6 +158,7 @@ def forward(self, x):
                 return h
 
         device = torch.device("cuda", self.rank)
+
         x = torch.randint(0, num_embeddings - 1, (16, 16), device=device)
         x_ref = x.clone().detach()
 
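The note added above is easy to reproduce in isolation. A minimal sketch, assuming an Ampere-or-newer GPU; exact magnitudes vary with shapes and seeds:

import torch

a = torch.randn(1024, 1024, device="cuda")
b = torch.randn(1024, 1024, device="cuda")

torch.backends.cuda.matmul.allow_tf32 = True
tf32_out = a @ b

torch.backends.cuda.matmul.allow_tf32 = False
fp32_out = a @ b

# On Ampere+ hardware the gap is commonly on the order of 1e-3.
print((tf32_out - fp32_out).abs().max())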
thunder/tests/test_grad.py

Lines changed: 4 additions & 1 deletion
@@ -1487,8 +1487,11 @@ def test_populate_grads_block(executor, device, dtype):
     assert_close(torch_grads, thunder_grads, atol=1e-2, rtol=1e-2)
 
 
+# Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
+# can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
+# This is expected due to the reduced precision of TF32 matrix multiplications.
 @instantiate(dtypes=(thunder.float32,))
-def test_populate_grads_nanogpt(executor, device, dtype):
+def test_populate_grads_nanogpt(executor, device, dtype, turn_off_tf32_and_set_seed):
     import sys
 
     if sys.platform == "win32":
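
The commit title scopes the change to Ampere+ devices because TF32 tensor cores first appear at CUDA compute capability 8.0; earlier GPUs always run float32 matmuls in full precision. A hypothetical guard, not part of this commit, that identifies the affected devices:

import torch


def is_tf32_capable() -> bool:
    # TF32 requires compute capability >= 8.0 (Ampere and newer).
    if not torch.cuda.is_available():
        return False
    major, _minor = torch.cuda.get_device_capability()
    return major >= 8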

thunder/tests/test_jit_general.py

Lines changed: 8 additions & 2 deletions
@@ -649,6 +649,9 @@ def test_nanogpt():
     assert_close(result, module(*args, **kwargs))
 
 
+# Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
+# can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
+# This is expected due to the reduced precision of TF32 matrix multiplications.
 @skipif_not_pytorch_2_1
 @pytest.mark.parametrize(
     "name",
@@ -668,7 +671,7 @@ def test_nanogpt():
     "device",
     ("cpu", "cuda", "meta"),
 )
-def test_litgpt_variants(name, device):
+def test_litgpt_variants(name, device, turn_off_tf32_and_set_seed):
     from thunder.tests.litgpt_model import Config
     from litgpt.model import GPT
 
@@ -704,6 +707,9 @@ def test_litgpt_variants(name, device):
         torch.testing.assert_close(param1.grad, param2.grad, rtol=1e-2, atol=1e-2)
 
 
+# Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
+# can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
+# This is expected due to the reduced precision of TF32 matrix multiplications.
 @skipif_not_pytorch_2_1
 @pytest.mark.parametrize(
     "name",
@@ -724,7 +730,7 @@ def test_litgpt_variants(name, device):
     "device",
     ("cpu", "cuda"),
 )
-def test_litgpt_variants_kvcache(name, device):
+def test_litgpt_variants_kvcache(name, device, turn_off_tf32_and_set_seed):
     from thunder.tests.litgpt_model import Config
     from litgpt.model import GPT
     import torch._dynamo  # this monkeypatches torch.manual_seed
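
For scale: torch.testing.assert_close uses default tolerances of rtol=1.3e-6 and atol=1e-5 for float32, far tighter than the ~1e-3 drift the notes describe, which is why the affected tests either disable TF32 or pass explicit rtol/atol. A small illustration:

import torch

a = torch.tensor([1.0000])
b = torch.tensor([1.0010])  # a 1e-3 gap, typical of TF32 matmul error

torch.testing.assert_close(a, b, rtol=1e-2, atol=1e-2)  # passes with loosened tolerances
torch.testing.assert_close(a, b)  # raises AssertionError under float32 defaults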

thunder/tests/test_networks.py

Lines changed: 1 addition & 1 deletion
@@ -36,7 +36,7 @@
 
 # see https://docs.pytest.org/en/stable/how-to/capture-warnings.html#recwarn for the recwarn fixture
 @instantiate(dtypes=(thunder.float32,), executors=all_test_executors_and_dynamo)
-def test_nanogpt_complete(executor, device, dtype, recwarn):
+def test_nanogpt_complete(executor, device, dtype, recwarn, turn_off_tf32_and_set_seed):
     tdtype = ttorch.to_torch_dtype(dtype)
     make = partial(make_tensor, dtype=torch.int64, device=device)
 
thunder/tests/test_update_aliases.py

Lines changed: 0 additions & 8 deletions
@@ -82,14 +82,6 @@ def inplace_masked_fill_sample_generator(op, device, dtype, requires_grad, **kwargs):
     _inplace_opinfos.append(inplace_opinfo)
 
 
-@pytest.fixture
-def turn_off_tf32_and_set_seed(monkeypatch):
-    monkeypatch.setenv("NVIDIA_TF32_OVERRIDE", "0")
-    torch.manual_seed(42)
-    yield
-    torch.seed()
-
-
 @instantiate(
     dtypes=(thunder.float32, thunder.float64),
     devicetypes=(devices.DeviceType.CUDA,),
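
This deletion is the counterpart of the conftest.py addition above: the fixture moves unchanged, so every module under thunder/tests can use it without importing it. A sketch of the two opt-in styles this commit uses; the test names here are hypothetical:

import pytest


def test_with_parameter(turn_off_tf32_and_set_seed):
    # Style 1: request the fixture as a parameter, as in test_grad.py
    # and test_jit_general.py.
    ...


@pytest.mark.usefixtures("turn_off_tf32_and_set_seed")
def test_with_marker():
    # Style 2: the usefixtures marker, as in test_tensor_parallel.py,
    # when the test body does not need the fixture's value.
    ...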
