6 changes: 6 additions & 0 deletions thunder/tests/distributed/test_tensor_parallel.py
@@ -154,6 +154,12 @@ def forward(self, x):
return h

device = torch.device("cuda", self.rank)

# Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
# can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
# This is expected due to the reduced precision of TF32 matrix multiplications.
torch.backends.cuda.matmul.allow_tf32 = False

x = torch.randint(0, num_embeddings - 1, (16, 16), device=device)
x_ref = x.clone().detach()

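For context on the TF32 note added above, here is a minimal standalone sketch (not part of the diff; the sizes and names are illustrative) of how the matmul flag changes float32 results on an Ampere-or-newer GPU:

import torch

if torch.cuda.is_available():
    a = torch.randn(1024, 1024, device="cuda")
    b = torch.randn(1024, 1024, device="cuda")

    # With TF32 enabled, float32 matmuls may be routed through tensor cores
    # with a truncated mantissa.
    torch.backends.cuda.matmul.allow_tf32 = True
    tf32_result = a @ b

    # With TF32 disabled, the same matmul runs in full float32 precision.
    torch.backends.cuda.matmul.allow_tf32 = False
    fp32_result = a @ b

    # On TF32-capable hardware this difference is typically on the order of
    # 1e-3, larger than the default float32 tolerances used by
    # torch.testing.assert_close.
    print((tf32_result - fp32_result).abs().max().item())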
6 changes: 6 additions & 0 deletions thunder/tests/test_grad.py
@@ -1500,6 +1500,12 @@ def test_populate_grads_nanogpt(executor, device, dtype):

from thunder.benchmarks import NanoGPTBenchmark, NanoGPTConfig

# Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
# can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
# This is expected due to the reduced precision of TF32 matrix multiplications.
if torch.device(device).type == "cuda":
    torch.backends.cuda.matmul.allow_tf32 = False

# NOTE Currently setting dropout to zero for reproducibility
config = NanoGPTConfig(dropout=0, n_layer=2, n_head=1, n_embd=64)

12 changes: 12 additions & 0 deletions thunder/tests/test_jit_general.py
@@ -672,6 +672,12 @@ def test_litgpt_variants(name, device):
from thunder.tests.litgpt_model import Config
from litgpt.model import GPT

# Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
# can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
# This is expected due to the reduced precision of TF32 matrix multiplications.
if device == "cuda":
torch.backends.cuda.matmul.allow_tf32 = False

if device == "cuda" and not torch.cuda.is_available():
pytest.skip("CUDA not available")

@@ -734,6 +740,12 @@ def test_litgpt_variants_kvcache(name, device):
if IS_WINDOWS:
    pytest.skip("slow on windows")

# Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
# can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
# This is expected due to the reduced precision of TF32 matrix multiplications.
if device == "cuda":
torch.backends.cuda.matmul.allow_tf32 = False

device = torch.device(device)
x = torch.randint(0, 200, (1, 2), device=device)
config = Config.from_name(name)
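As a hedged aside, not something this diff does: because torch.backends.cuda.matmul.allow_tf32 is a process-global flag, a pytest fixture could save and restore it around each test so the override does not leak into tests that run afterwards. The fixture name below is hypothetical:

import pytest
import torch

@pytest.fixture
def no_tf32_matmul():
    # Remember the current setting, disable TF32 for the test, then restore it.
    previous = torch.backends.cuda.matmul.allow_tf32
    torch.backends.cuda.matmul.allow_tf32 = False
    try:
        yield
    finally:
        torch.backends.cuda.matmul.allow_tf32 = previous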
6 changes: 6 additions & 0 deletions thunder/tests/test_networks.py
@@ -42,6 +42,12 @@ def test_nanogpt_complete(executor, device, dtype, recwarn):
tdtype = ttorch.to_torch_dtype(dtype)
make = partial(make_tensor, dtype=torch.int64, device=device)

# Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
# can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
# This is expected due to the reduced precision of TF32 matrix multiplications.
if torch.device(device).type == "cuda":
    torch.backends.cuda.matmul.allow_tf32 = False

# Creates a nanoGPT model with a smaller size than any of the default options for testing
# NOTE Sets dropout to zero for reproducibility
config = nanogpt_model.GPTConfig(dropout=0, block_size=512, n_layer=6, n_head=6, n_embd=768)
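Another hedged alternative, not taken by this diff, is to leave TF32 enabled and widen the comparison tolerances to absorb the roughly 1e-3 differences instead; the tensors below are placeholders for Thunder and eager outputs:

import torch

actual = torch.randn(16, 16)
expected = actual + 1e-4 * torch.randn(16, 16)

# Explicit atol/rtol around 1e-3 accommodate TF32-sized deviations that the
# default float32 tolerances would reject.
torch.testing.assert_close(actual, expected, atol=1e-3, rtol=1e-3)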