6 changes: 6 additions & 0 deletions thunder/tests/distributed/test_tensor_parallel.py
@@ -154,6 +154,12 @@ def forward(self, x):
return h

device = torch.device("cuda", self.rank)

# Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
# can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
# This is expected due to the reduced precision of TF32 matrix multiplications.
torch.backends.cuda.matmul.allow_tf32 = False

x = torch.randint(0, num_embeddings - 1, (16, 16), device=device)
x_ref = x.clone().detach()

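For context on the TF32 note added above, here is a minimal standalone sketch (not part of the diff; the sizes and names are illustrative) of how the matmul flag changes float32 results on an Ampere-or-newer GPU:

import torch

if torch.cuda.is_available():
    a = torch.randn(1024, 1024, device="cuda")
    b = torch.randn(1024, 1024, device="cuda")

    # With TF32 enabled, float32 matmuls may be routed through tensor cores
    # with a truncated mantissa.
    torch.backends.cuda.matmul.allow_tf32 = True
    tf32_result = a @ b

    # With TF32 disabled, the same matmul runs in full float32 precision.
    torch.backends.cuda.matmul.allow_tf32 = False
    fp32_result = a @ b

    # On TF32-capable hardware this difference is typically on the order of
    # 1e-3, larger than the default float32 tolerances used by
    # torch.testing.assert_close.
    print((tf32_result - fp32_result).abs().max().item())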
6 changes: 6 additions & 0 deletions thunder/tests/test_grad.py
@@ -1500,6 +1500,12 @@ def test_populate_grads_nanogpt(executor, device, dtype):

from thunder.benchmarks import NanoGPTBenchmark, NanoGPTConfig

# Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
# can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
# This is expected due to the reduced precision of TF32 matrix multiplications.
if torch.device(device).type == "cuda":
    torch.backends.cuda.matmul.allow_tf32 = False

# NOTE Currently setting dropout to zero for reproducibility
config = NanoGPTConfig(dropout=0, n_layer=2, n_head=1, n_embd=64)

12 changes: 12 additions & 0 deletions thunder/tests/test_jit_general.py
@@ -672,6 +672,12 @@ def test_litgpt_variants(name, device):
from thunder.tests.litgpt_model import Config
from litgpt.model import GPT

# Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
# can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
# This is expected due to the reduced precision of TF32 matrix multiplications.
if device == "cuda":
torch.backends.cuda.matmul.allow_tf32 = False

if device == "cuda" and not torch.cuda.is_available():
pytest.skip("CUDA not available")

@@ -734,6 +740,12 @@ def test_litgpt_variants_kvcache(name, device):
if IS_WINDOWS:
    pytest.skip("slow on windows")

# Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
# can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
# This is expected due to the reduced precision of TF32 matrix multiplications.
if device == "cuda":
torch.backends.cuda.matmul.allow_tf32 = False

device = torch.device(device)
x = torch.randint(0, 200, (1, 2), device=device)
config = Config.from_name(name)
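As a hedged aside, not something this diff does: because torch.backends.cuda.matmul.allow_tf32 is a process-global flag, a pytest fixture could save and restore it around each test so the override does not leak into tests that run afterwards. The fixture name below is hypothetical:

import pytest
import torch

@pytest.fixture
def no_tf32_matmul():
    # Remember the current setting, disable TF32 for the test, then restore it.
    previous = torch.backends.cuda.matmul.allow_tf32
    torch.backends.cuda.matmul.allow_tf32 = False
    try:
        yield
    finally:
        torch.backends.cuda.matmul.allow_tf32 = previous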
6 changes: 6 additions & 0 deletions thunder/tests/test_networks.py
@@ -42,6 +42,12 @@ def test_nanogpt_complete(executor, device, dtype, recwarn):
tdtype = ttorch.to_torch_dtype(dtype)
make = partial(make_tensor, dtype=torch.int64, device=device)

# Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
# can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
# This is expected due to the reduced precision of TF32 matrix multiplications.
if torch.device(device).type == "cuda":
    torch.backends.cuda.matmul.allow_tf32 = False

# Creates a nanoGPT model with a smaller size than any of the default options for testing
# NOTE Sets dropout to zero for reproducibility
config = nanogpt_model.GPTConfig(dropout=0, block_size=512, n_layer=6, n_head=6, n_embd=768)
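Another hedged alternative, not taken by this diff, is to leave TF32 enabled and widen the comparison tolerances to absorb the roughly 1e-3 differences instead; the tensors below are placeholders for Thunder and eager outputs:

import torch

actual = torch.randn(16, 16)
expected = actual + 1e-4 * torch.randn(16, 16)

# Explicit atol/rtol around 1e-3 accommodate TF32-sized deviations that the
# default float32 tolerances would reject.
torch.testing.assert_close(actual, expected, atol=1e-3, rtol=1e-3)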