diff --git a/thunder/tests/conftest.py b/thunder/tests/conftest.py
index af29e3fdec..36d3aec0b1 100644
--- a/thunder/tests/conftest.py
+++ b/thunder/tests/conftest.py
@@ -77,3 +77,11 @@ def pytest_collection_modifyitems(items):
 
 def pytest_addoption(parser):
     parser.addoption("--gpu-mem-limit", type=float)
+
+
+@pytest.fixture
+def turn_off_tf32_and_set_seed(monkeypatch):
+    monkeypatch.setenv("NVIDIA_TF32_OVERRIDE", "0")
+    torch.manual_seed(42)
+    yield
+    torch.seed()
diff --git a/thunder/tests/distributed/test_tensor_parallel.py b/thunder/tests/distributed/test_tensor_parallel.py
index ea734d3623..8217f80bea 100644
--- a/thunder/tests/distributed/test_tensor_parallel.py
+++ b/thunder/tests/distributed/test_tensor_parallel.py
@@ -131,6 +131,10 @@ def forward(self, x):
             actual=tp_jitted_model.get_parameter("embed.weight").grad,
         )
 
+    # Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
+    # can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
+    # This is expected due to the reduced precision of TF32 matrix multiplications.
+    @pytest.mark.usefixtures("turn_off_tf32_and_set_seed")
     @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="")
     @common_utils.parametrize("bias", (True, False))
     def test_both_column_and_row(self, bias):
@@ -154,6 +158,7 @@ def forward(self, x):
                 return h
 
         device = torch.device("cuda", self.rank)
+
         x = torch.randint(0, num_embeddings - 1, (16, 16), device=device)
         x_ref = x.clone().detach()
 
diff --git a/thunder/tests/test_grad.py b/thunder/tests/test_grad.py
index e6f294acf3..1a95ad0f8b 100644
--- a/thunder/tests/test_grad.py
+++ b/thunder/tests/test_grad.py
@@ -1487,8 +1487,11 @@ def test_populate_grads_block(executor, device, dtype):
     assert_close(torch_grads, thunder_grads, atol=1e-2, rtol=1e-2)
 
 
+# Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
+# can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
+# This is expected due to the reduced precision of TF32 matrix multiplications.
 @instantiate(dtypes=(thunder.float32,))
-def test_populate_grads_nanogpt(executor, device, dtype):
+def test_populate_grads_nanogpt(executor, device, dtype, turn_off_tf32_and_set_seed):
     import sys
 
     if sys.platform == "win32":
diff --git a/thunder/tests/test_jit_general.py b/thunder/tests/test_jit_general.py
index cf54b741e3..67ded25a8b 100644
--- a/thunder/tests/test_jit_general.py
+++ b/thunder/tests/test_jit_general.py
@@ -649,6 +649,9 @@ def test_nanogpt():
     assert_close(result, module(*args, **kwargs))
 
 
+# Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
+# can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
+# This is expected due to the reduced precision of TF32 matrix multiplications.
 @skipif_not_pytorch_2_1
 @pytest.mark.parametrize(
     "name",
@@ -668,7 +671,7 @@
     "device",
     ("cpu", "cuda", "meta"),
 )
-def test_litgpt_variants(name, device):
+def test_litgpt_variants(name, device, turn_off_tf32_and_set_seed):
     from thunder.tests.litgpt_model import Config
     from litgpt.model import GPT
 
@@ -704,6 +707,9 @@ def test_litgpt_variants(name, device):
         torch.testing.assert_close(param1.grad, param2.grad, rtol=1e-2, atol=1e-2)
 
 
+# Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
+# can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
+# This is expected due to the reduced precision of TF32 matrix multiplications.
 @skipif_not_pytorch_2_1
 @pytest.mark.parametrize(
     "name",
@@ -724,7 +730,7 @@ def test_litgpt_variants(name, device):
     "device",
     ("cpu", "cuda"),
 )
-def test_litgpt_variants_kvcache(name, device):
+def test_litgpt_variants_kvcache(name, device, turn_off_tf32_and_set_seed):
     from thunder.tests.litgpt_model import Config
     from litgpt.model import GPT
     import torch._dynamo  # this monkeypatches torch.manual_seed
diff --git a/thunder/tests/test_networks.py b/thunder/tests/test_networks.py
index 23892ad45c..6b3f257930 100644
--- a/thunder/tests/test_networks.py
+++ b/thunder/tests/test_networks.py
@@ -36,7 +36,7 @@
 
 # see https://docs.pytest.org/en/stable/how-to/capture-warnings.html#recwarn for the recwarn fixture
 @instantiate(dtypes=(thunder.float32,), executors=all_test_executors_and_dynamo)
-def test_nanogpt_complete(executor, device, dtype, recwarn):
+def test_nanogpt_complete(executor, device, dtype, recwarn, turn_off_tf32_and_set_seed):
     tdtype = ttorch.to_torch_dtype(dtype)
     make = partial(make_tensor, dtype=torch.int64, device=device)
 
diff --git a/thunder/tests/test_update_aliases.py b/thunder/tests/test_update_aliases.py
index 6eaa2fdfba..d87bf2db4d 100644
--- a/thunder/tests/test_update_aliases.py
+++ b/thunder/tests/test_update_aliases.py
@@ -82,14 +82,6 @@ def inplace_masked_fill_sample_generator(op, device, dtype, requires_grad, **kwa
     _inplace_opinfos.append(inplace_opinfo)
 
 
-@pytest.fixture
-def turn_off_tf32_and_set_seed(monkeypatch):
-    monkeypatch.setenv("NVIDIA_TF32_OVERRIDE", "0")
-    torch.manual_seed(42)
-    yield
-    torch.seed()
-
-
 @instantiate(
     dtypes=(thunder.float32, thunder.float64),
     devicetypes=(devices.DeviceType.CUDA,),
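Note: below is a minimal, self-contained sketch (not part of the diff) of how the relocated fixture is consumed. The fixture body is copied from the conftest.py hunk above; the two toy test functions are hypothetical names invented for illustration. In the real tree the fixture lives in thunder/tests/conftest.py, so tests pick it up without importing it, either by naming it as a parameter (as test_populate_grads_nanogpt and test_litgpt_variants now do) or via @pytest.mark.usefixtures (as test_both_column_and_row now does). pytest runs everything before the yield as setup and everything after it as teardown, and monkeypatch restores NVIDIA_TF32_OVERRIDE on its own.

# Minimal usage sketch (hypothetical tests, not part of this diff).
# The fixture body is the one moved into thunder/tests/conftest.py above;
# it is redefined here only so this file runs standalone.
import os

import pytest
import torch


@pytest.fixture
def turn_off_tf32_and_set_seed(monkeypatch):
    # Setup: ask NVIDIA's math libraries to keep float32 matmuls in full
    # precision, and pin the RNG so tensor inputs are reproducible.
    monkeypatch.setenv("NVIDIA_TF32_OVERRIDE", "0")
    torch.manual_seed(42)
    yield
    # Teardown: re-seed from a nondeterministic source so later tests are
    # not silently pinned to seed 42; monkeypatch undoes the env var itself.
    torch.seed()


# Style 1: request the fixture by naming it as a parameter
# (how test_populate_grads_nanogpt and test_litgpt_variants opt in).
def test_env_override_is_set(turn_off_tf32_and_set_seed):
    assert os.environ["NVIDIA_TF32_OVERRIDE"] == "0"


# Style 2: apply the fixture without changing the test signature
# (how test_both_column_and_row opts in).
@pytest.mark.usefixtures("turn_off_tf32_and_set_seed")
def test_seeded_rng_is_deterministic():
    first = torch.randn(3)  # drawn from the stream seeded with 42
    torch.manual_seed(42)   # reset to the same seed
    assert torch.equal(first, torch.randn(3))

Design note: re-seeding with torch.seed() in teardown is a lightweight way to avoid leaking determinism into later tests; a stricter alternative would snapshot and restore the RNG state with torch.random.get_rng_state() / torch.random.set_rng_state().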