From a5c6a4841de91485282b3e062bc66d55c1583084 Mon Sep 17 00:00:00 2001
From: Kaixi Matteo Chen
Date: Wed, 1 Oct 2025 15:11:10 +0000
Subject: [PATCH 1/7] Disabled TF32 on Ampere+ devices to stabilize numeric
 accuracy

---
 thunder/tests/distributed/test_tensor_parallel.py |  6 ++++++
 thunder/tests/test_grad.py                        |  6 ++++++
 thunder/tests/test_jit_general.py                 | 12 ++++++++++++
 thunder/tests/test_networks.py                    |  6 ++++++
 4 files changed, 30 insertions(+)

diff --git a/thunder/tests/distributed/test_tensor_parallel.py b/thunder/tests/distributed/test_tensor_parallel.py
index ea734d3623..2c26e318a3 100644
--- a/thunder/tests/distributed/test_tensor_parallel.py
+++ b/thunder/tests/distributed/test_tensor_parallel.py
@@ -154,6 +154,12 @@ def forward(self, x):
                 return h
 
         device = torch.device("cuda", self.rank)
+
+        # Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
+        # can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
+        # This is expected due to the reduced precision of TF32 matrix multiplications.
+        torch.backends.cuda.matmul.fp32_precision = 'ieee'
+
         x = torch.randint(0, num_embeddings - 1, (16, 16), device=device)
         x_ref = x.clone().detach()
 
diff --git a/thunder/tests/test_grad.py b/thunder/tests/test_grad.py
index e6f294acf3..770e3f6169 100644
--- a/thunder/tests/test_grad.py
+++ b/thunder/tests/test_grad.py
@@ -1500,6 +1500,12 @@ def test_populate_grads_nanogpt(executor, device, dtype):
 
     from thunder.benchmarks import NanoGPTBenchmark, NanoGPTConfig
 
+    # Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
+    # can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
+    # This is expected due to the reduced precision of TF32 matrix multiplications.
+    if torch.device(device).type == "cuda":
+        torch.backends.cuda.matmul.fp32_precision = 'ieee'
+
     # NOTE Currently setting dropout to zero for reproducibility
     config = NanoGPTConfig(dropout=0, n_layer=2, n_head=1, n_embd=64)
 
diff --git a/thunder/tests/test_jit_general.py b/thunder/tests/test_jit_general.py
index cf54b741e3..a22b89dd75 100644
--- a/thunder/tests/test_jit_general.py
+++ b/thunder/tests/test_jit_general.py
@@ -672,6 +672,12 @@ def test_litgpt_variants(name, device):
     from thunder.tests.litgpt_model import Config
     from litgpt.model import GPT
 
+    # Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
+    # can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
+    # This is expected due to the reduced precision of TF32 matrix multiplications.
+    if device == "cuda":
+        torch.backends.cuda.matmul.fp32_precision = 'ieee'
+
     if device == "cuda" and not torch.cuda.is_available():
         pytest.skip("CUDA not available")
 
@@ -734,6 +740,12 @@ def test_litgpt_variants_kvcache(name, device):
     if IS_WINDOWS:
         pytest.skip("slow on windows")
 
+    # Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
+    # can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
+    # This is expected due to the reduced precision of TF32 matrix multiplications.
+    if device == "cuda":
+        torch.backends.cuda.matmul.fp32_precision = 'ieee'
+
     device = torch.device(device)
     x = torch.randint(0, 200, (1, 2), device=device)
     config = Config.from_name(name)
 
diff --git a/thunder/tests/test_networks.py b/thunder/tests/test_networks.py
index dc0b29eca7..32b520ec49 100644
--- a/thunder/tests/test_networks.py
+++ b/thunder/tests/test_networks.py
@@ -42,6 +42,12 @@ def test_nanogpt_complete(executor, device, dtype, recwarn):
     tdtype = ttorch.to_torch_dtype(dtype)
     make = partial(make_tensor, dtype=torch.int64, device=device)
 
+    # Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
+    # can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
+    # This is expected due to the reduced precision of TF32 matrix multiplications.
+    if torch.device(device).type == "cuda":
+        torch.backends.cuda.matmul.fp32_precision = 'ieee'
+
     # Creates a nanoGPT model with a smaller size than any of the default options for testing
     # NOTE Sets dropout to zero for reproducibility
     config = nanogpt_model.GPTConfig(dropout=0, block_size=512, n_layer=6, n_head=6, n_embd=768)

From 66f1d0b9c9cc6b3ad95076c1e4deca725f956e5c Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 1 Oct 2025 15:28:18 +0000
Subject: [PATCH 2/7] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 thunder/tests/distributed/test_tensor_parallel.py | 2 +-
 thunder/tests/test_grad.py                        | 2 +-
 thunder/tests/test_jit_general.py                 | 4 ++--
 thunder/tests/test_networks.py                    | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/thunder/tests/distributed/test_tensor_parallel.py b/thunder/tests/distributed/test_tensor_parallel.py
index 2c26e318a3..f8f474ecf8 100644
--- a/thunder/tests/distributed/test_tensor_parallel.py
+++ b/thunder/tests/distributed/test_tensor_parallel.py
@@ -158,7 +158,7 @@ def forward(self, x):
         # Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
         # can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
         # This is expected due to the reduced precision of TF32 matrix multiplications.
-        torch.backends.cuda.matmul.fp32_precision = 'ieee'
+        torch.backends.cuda.matmul.fp32_precision = "ieee"
 
         x = torch.randint(0, num_embeddings - 1, (16, 16), device=device)
         x_ref = x.clone().detach()
diff --git a/thunder/tests/test_grad.py b/thunder/tests/test_grad.py
index 770e3f6169..4729473133 100644
--- a/thunder/tests/test_grad.py
+++ b/thunder/tests/test_grad.py
@@ -1504,7 +1504,7 @@ def test_populate_grads_nanogpt(executor, device, dtype):
     # can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
     # This is expected due to the reduced precision of TF32 matrix multiplications.
     if torch.device(device).type == "cuda":
-        torch.backends.cuda.matmul.fp32_precision = 'ieee'
+        torch.backends.cuda.matmul.fp32_precision = "ieee"
 
     # NOTE Currently setting dropout to zero for reproducibility
     config = NanoGPTConfig(dropout=0, n_layer=2, n_head=1, n_embd=64)
diff --git a/thunder/tests/test_jit_general.py b/thunder/tests/test_jit_general.py
index a22b89dd75..242ce39f73 100644
--- a/thunder/tests/test_jit_general.py
+++ b/thunder/tests/test_jit_general.py
@@ -676,7 +676,7 @@ def test_litgpt_variants(name, device):
     # can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
     # This is expected due to the reduced precision of TF32 matrix multiplications.
     if device == "cuda":
-        torch.backends.cuda.matmul.fp32_precision = 'ieee'
+        torch.backends.cuda.matmul.fp32_precision = "ieee"
 
     if device == "cuda" and not torch.cuda.is_available():
         pytest.skip("CUDA not available")
@@ -744,7 +744,7 @@ def test_litgpt_variants_kvcache(name, device):
     # can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
     # This is expected due to the reduced precision of TF32 matrix multiplications.
     if device == "cuda":
-        torch.backends.cuda.matmul.fp32_precision = 'ieee'
+        torch.backends.cuda.matmul.fp32_precision = "ieee"
 
     device = torch.device(device)
     x = torch.randint(0, 200, (1, 2), device=device)
diff --git a/thunder/tests/test_networks.py b/thunder/tests/test_networks.py
index 32b520ec49..7bdf42f71e 100644
--- a/thunder/tests/test_networks.py
+++ b/thunder/tests/test_networks.py
@@ -46,7 +46,7 @@ def test_nanogpt_complete(executor, device, dtype, recwarn):
     # can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
     # This is expected due to the reduced precision of TF32 matrix multiplications.
     if torch.device(device).type == "cuda":
-        torch.backends.cuda.matmul.fp32_precision = 'ieee'
+        torch.backends.cuda.matmul.fp32_precision = "ieee"
 
     # Creates a nanoGPT model with a smaller size than any of the default options for testing
     # NOTE Sets dropout to zero for reproducibility

From d115e4a5cf9ebd04bded1c24cfb08b036be5496f Mon Sep 17 00:00:00 2001
From: Kaixi Matteo Chen
Date: Wed, 1 Oct 2025 15:43:35 +0000
Subject: [PATCH 3/7] Reverted to old TF32 API

---
 thunder/tests/distributed/test_tensor_parallel.py | 2 +-
 thunder/tests/test_grad.py                        | 2 +-
 thunder/tests/test_jit_general.py                 | 4 ++--
 thunder/tests/test_networks.py                    | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/thunder/tests/distributed/test_tensor_parallel.py b/thunder/tests/distributed/test_tensor_parallel.py
index f8f474ecf8..c13aaa2a84 100644
--- a/thunder/tests/distributed/test_tensor_parallel.py
+++ b/thunder/tests/distributed/test_tensor_parallel.py
@@ -158,7 +158,7 @@ def forward(self, x):
         # Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
         # can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
         # This is expected due to the reduced precision of TF32 matrix multiplications.
-        torch.backends.cuda.matmul.fp32_precision = "ieee"
+        torch.backends.cuda.matmul.allow_tf32 = False
 
         x = torch.randint(0, num_embeddings - 1, (16, 16), device=device)
         x_ref = x.clone().detach()
diff --git a/thunder/tests/test_grad.py b/thunder/tests/test_grad.py
index 4729473133..5cef6e1e51 100644
--- a/thunder/tests/test_grad.py
+++ b/thunder/tests/test_grad.py
@@ -1504,7 +1504,7 @@ def test_populate_grads_nanogpt(executor, device, dtype):
     # can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
     # This is expected due to the reduced precision of TF32 matrix multiplications.
     if torch.device(device).type == "cuda":
-        torch.backends.cuda.matmul.fp32_precision = "ieee"
+        torch.backends.cuda.matmul.allow_tf32 = False
 
     # NOTE Currently setting dropout to zero for reproducibility
     config = NanoGPTConfig(dropout=0, n_layer=2, n_head=1, n_embd=64)
diff --git a/thunder/tests/test_jit_general.py b/thunder/tests/test_jit_general.py
index 242ce39f73..bf403e544c 100644
--- a/thunder/tests/test_jit_general.py
+++ b/thunder/tests/test_jit_general.py
@@ -676,7 +676,7 @@ def test_litgpt_variants(name, device):
     # can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
     # This is expected due to the reduced precision of TF32 matrix multiplications.
     if device == "cuda":
-        torch.backends.cuda.matmul.fp32_precision = "ieee"
+        torch.backends.cuda.matmul.allow_tf32 = False
 
     if device == "cuda" and not torch.cuda.is_available():
         pytest.skip("CUDA not available")
@@ -744,7 +744,7 @@ def test_litgpt_variants_kvcache(name, device):
     # can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
     # This is expected due to the reduced precision of TF32 matrix multiplications.
     if device == "cuda":
-        torch.backends.cuda.matmul.fp32_precision = "ieee"
+        torch.backends.cuda.matmul.allow_tf32 = False
 
     device = torch.device(device)
     x = torch.randint(0, 200, (1, 2), device=device)
diff --git a/thunder/tests/test_networks.py b/thunder/tests/test_networks.py
index 7bdf42f71e..c454f8c6e3 100644
--- a/thunder/tests/test_networks.py
+++ b/thunder/tests/test_networks.py
@@ -46,7 +46,7 @@ def test_nanogpt_complete(executor, device, dtype, recwarn):
     # can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
     # This is expected due to the reduced precision of TF32 matrix multiplications.
     if torch.device(device).type == "cuda":
-        torch.backends.cuda.matmul.fp32_precision = "ieee"
+        torch.backends.cuda.matmul.allow_tf32 = False
 
     # Creates a nanoGPT model with a smaller size than any of the default options for testing
     # NOTE Sets dropout to zero for reproducibility

From 878185223bd68a8321452b65fc22d24180335d78 Mon Sep 17 00:00:00 2001
From: Kaixi Matteo Chen
Date: Thu, 2 Oct 2025 09:27:26 +0000
Subject: [PATCH 4/7] Switched to pytest fixture

---
 .../tests/distributed/test_tensor_parallel.py | 10 ++++----
 thunder/tests/test_grad.py                    | 13 ++++------
 thunder/tests/test_jit_general.py             | 24 +++++++------------
 thunder/tests/test_networks.py                |  9 ++-----
 thunder/tests/test_update_aliases.py          |  9 +------
 thunder/tests/utils.py                        |  8 +++++++
 6 files changed, 30 insertions(+), 43 deletions(-)

diff --git a/thunder/tests/distributed/test_tensor_parallel.py b/thunder/tests/distributed/test_tensor_parallel.py
index c13aaa2a84..acf71a7ff3 100644
--- a/thunder/tests/distributed/test_tensor_parallel.py
+++ b/thunder/tests/distributed/test_tensor_parallel.py
@@ -12,6 +12,7 @@
 import thunder.executors
 from thunder.tests.distributed.helper import ToyModel, DistributedParallelTestCase
 from thunder.tests.distributed.modules import ParallelMLP
+from thunder.tests.utils import turn_off_tf32_and_set_seed
 
 from torch.testing._internal import common_utils
 
@@ -131,6 +132,10 @@ def forward(self, x):
             actual=tp_jitted_model.get_parameter("embed.weight").grad,
         )
 
+    # Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
+    # can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
+    # This is expected due to the reduced precision of TF32 matrix multiplications.
+    @pytest.mark.usefixtures("turn_off_tf32_and_set_seed")
     @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="")
     @common_utils.parametrize("bias", (True, False))
     def test_both_column_and_row(self, bias):
@@ -155,11 +160,6 @@ def forward(self, x):
 
         device = torch.device("cuda", self.rank)
 
-        # Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
-        # can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
-        # This is expected due to the reduced precision of TF32 matrix multiplications.
-        torch.backends.cuda.matmul.allow_tf32 = False
-
         x = torch.randint(0, num_embeddings - 1, (16, 16), device=device)
         x_ref = x.clone().detach()
diff --git a/thunder/tests/test_grad.py b/thunder/tests/test_grad.py
index 5cef6e1e51..24effb6615 100644
--- a/thunder/tests/test_grad.py
+++ b/thunder/tests/test_grad.py
@@ -30,7 +30,7 @@
 )
 from thunder.tests.make_tensor import make_tensor, make_tensor_like
 from thunder.tests.opinfos import get_opinfo, opinfos, tensor_creation_ops
-from thunder.tests.utils import is_output_differentiable, filter_differentiable_outputs
+from thunder.tests.utils import is_output_differentiable, filter_differentiable_outputs, turn_off_tf32_and_set_seed
 
 # TODO: Move this to thunder.tests.opinfos
 op_skip = {
@@ -1487,8 +1487,11 @@ def test_populate_grads_block(executor, device, dtype):
     assert_close(torch_grads, thunder_grads, atol=1e-2, rtol=1e-2)
 
 
+# Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
+# can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
+# This is expected due to the reduced precision of TF32 matrix multiplications.
 @instantiate(dtypes=(thunder.float32,))
-def test_populate_grads_nanogpt(executor, device, dtype):
+def test_populate_grads_nanogpt(executor, device, dtype, turn_off_tf32_and_set_seed):
     import sys
 
     if sys.platform == "win32":
@@ -1500,12 +1503,6 @@ def test_populate_grads_nanogpt(executor, device, dtype):
 
     from thunder.benchmarks import NanoGPTBenchmark, NanoGPTConfig
 
-    # Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
-    # can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
-    # This is expected due to the reduced precision of TF32 matrix multiplications.
-    if torch.device(device).type == "cuda":
-        torch.backends.cuda.matmul.allow_tf32 = False
-
     # NOTE Currently setting dropout to zero for reproducibility
     config = NanoGPTConfig(dropout=0, n_layer=2, n_head=1, n_embd=64)
diff --git a/thunder/tests/test_jit_general.py b/thunder/tests/test_jit_general.py
index bf403e544c..df22cbfd44 100644
--- a/thunder/tests/test_jit_general.py
+++ b/thunder/tests/test_jit_general.py
@@ -14,6 +14,7 @@
 import thunder
 from thunder.tests.framework import requiresCUDA, IS_WINDOWS
+from thunder.tests.utils import turn_off_tf32_and_set_seed
 from thunder.core.options import CACHE_OPTIONS
 import thunder.core.prims as prims
 from thunder import pytorch_executor, nvfuser_executor
@@ -649,6 +650,9 @@ def test_nanogpt():
     assert_close(result, module(*args, **kwargs))
 
 
+# Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
+# can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
+# This is expected due to the reduced precision of TF32 matrix multiplications.
 @skipif_not_pytorch_2_1
 @pytest.mark.parametrize(
     "name",
@@ -668,16 +672,10 @@ def test_nanogpt():
     "device",
     ("cpu", "cuda", "meta"),
 )
-def test_litgpt_variants(name, device):
+def test_litgpt_variants(name, device, turn_off_tf32_and_set_seed):
     from thunder.tests.litgpt_model import Config
     from litgpt.model import GPT
 
-    # Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
-    # can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
-    # This is expected due to the reduced precision of TF32 matrix multiplications.
-    if device == "cuda":
-        torch.backends.cuda.matmul.allow_tf32 = False
-
     if device == "cuda" and not torch.cuda.is_available():
         pytest.skip("CUDA not available")
 
@@ -709,7 +707,9 @@ def test_litgpt_variants(name, device):
         assert param1.grad is not None
         torch.testing.assert_close(param1.grad, param2.grad, rtol=1e-2, atol=1e-2)
 
-
+# Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
+# can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
+# This is expected due to the reduced precision of TF32 matrix multiplications.
 @skipif_not_pytorch_2_1
 @pytest.mark.parametrize(
     "name",
@@ -731,21 +731,15 @@ def test_litgpt_variants(name, device):
     "device",
     ("cpu", "cuda"),
 )
-def test_litgpt_variants_kvcache(name, device):
+def test_litgpt_variants_kvcache(name, device, turn_off_tf32_and_set_seed):
     from thunder.tests.litgpt_model import Config
     from litgpt.model import GPT
     import torch._dynamo  # this monkeypatches torch.manual_seed
 
     if IS_WINDOWS:
         pytest.skip("slow on windows")
 
-    # Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
-    # can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
-    # This is expected due to the reduced precision of TF32 matrix multiplications.
-    if device == "cuda":
-        torch.backends.cuda.matmul.allow_tf32 = False
-
     device = torch.device(device)
     x = torch.randint(0, 200, (1, 2), device=device)
     config = Config.from_name(name)
diff --git a/thunder/tests/test_networks.py b/thunder/tests/test_networks.py
index c454f8c6e3..d96b04d260 100644
--- a/thunder/tests/test_networks.py
+++ b/thunder/tests/test_networks.py
@@ -10,6 +10,7 @@
 import torch
 import torch.nn as nn
 from torch.testing import assert_close, make_tensor
+from thunder.tests.utils import turn_off_tf32_and_set_seed
 
 import thunder
 import thunder.torch as ttorch
@@ -38,16 +39,10 @@
 # see https://docs.pytest.org/en/stable/how-to/capture-warnings.html#recwarn for the recwarn fixture
 @instantiate(dtypes=(thunder.float32,), executors=all_test_executors_and_dynamo)
-def test_nanogpt_complete(executor, device, dtype, recwarn):
+def test_nanogpt_complete(executor, device, dtype, recwarn, turn_off_tf32_and_set_seed):
     tdtype = ttorch.to_torch_dtype(dtype)
     make = partial(make_tensor, dtype=torch.int64, device=device)
 
-    # Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
-    # can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
-    # This is expected due to the reduced precision of TF32 matrix multiplications.
-    if torch.device(device).type == "cuda":
-        torch.backends.cuda.matmul.allow_tf32 = False
-
     # Creates a nanoGPT model with a smaller size than any of the default options for testing
     # NOTE Sets dropout to zero for reproducibility
     config = nanogpt_model.GPTConfig(dropout=0, block_size=512, n_layer=6, n_head=6, n_embd=768)
diff --git a/thunder/tests/test_update_aliases.py b/thunder/tests/test_update_aliases.py
index 6eaa2fdfba..3b61717911 100644
--- a/thunder/tests/test_update_aliases.py
+++ b/thunder/tests/test_update_aliases.py
@@ -12,6 +12,7 @@
 from thunder.core.symbol import Symbol
 import thunder.core.devices as devices
 from thunder.tests.opinfos import opinfos, OpInfo, make_number, SampleInput
+from thunder.tests.utils import turn_off_tf32_and_set_seed
 from thunder.tests.make_tensor import make_tensor, make_tensor_like
 from thunder.tests.framework import (
     instantiate,
@@ -82,14 +83,6 @@ def inplace_masked_fill_sample_generator(op, device, dtype, requires_grad, **kwa
     _inplace_opinfos.append(inplace_opinfo)
 
 
-@pytest.fixture
-def turn_off_tf32_and_set_seed(monkeypatch):
-    monkeypatch.setenv("NVIDIA_TF32_OVERRIDE", "0")
-    torch.manual_seed(42)
-    yield
-    torch.seed()
-
-
 @instantiate(
     dtypes=(thunder.float32, thunder.float64),
     devicetypes=(devices.DeviceType.CUDA,),
diff --git a/thunder/tests/utils.py b/thunder/tests/utils.py
index f68d516314..9bca755ae6 100644
--- a/thunder/tests/utils.py
+++ b/thunder/tests/utils.py
@@ -53,3 +53,11 @@ def wrapped_fn(*args, **kwargs):
             fn(*args, **kwargs)
 
     return wrapped_fn
+
+
+@pytest.fixture
+def turn_off_tf32_and_set_seed(monkeypatch):
+    monkeypatch.setenv("NVIDIA_TF32_OVERRIDE", "0")
+    torch.manual_seed(42)
+    yield
+    torch.seed()
\ No newline at end of file

From 87d44e4bd59b822eeb88bccb10a028620539202f Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 2 Oct 2025 09:27:51 +0000
Subject: [PATCH 5/7] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 thunder/tests/distributed/test_tensor_parallel.py | 1 -
 thunder/tests/test_grad.py                        | 2 +-
 thunder/tests/test_jit_general.py                 | 2 +-
 thunder/tests/test_networks.py                    | 1 -
 thunder/tests/test_update_aliases.py              | 1 -
 thunder/tests/utils.py                            | 2 +-
 6 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/thunder/tests/distributed/test_tensor_parallel.py b/thunder/tests/distributed/test_tensor_parallel.py
index acf71a7ff3..8217f80bea 100644
--- a/thunder/tests/distributed/test_tensor_parallel.py
+++ b/thunder/tests/distributed/test_tensor_parallel.py
@@ -12,7 +12,6 @@
 import thunder.executors
 from thunder.tests.distributed.helper import ToyModel, DistributedParallelTestCase
 from thunder.tests.distributed.modules import ParallelMLP
-from thunder.tests.utils import turn_off_tf32_and_set_seed
 
 from torch.testing._internal import common_utils
 
diff --git a/thunder/tests/test_grad.py b/thunder/tests/test_grad.py
index 24effb6615..1a95ad0f8b 100644
--- a/thunder/tests/test_grad.py
+++ b/thunder/tests/test_grad.py
@@ -30,7 +30,7 @@
 )
 from thunder.tests.make_tensor import make_tensor, make_tensor_like
 from thunder.tests.opinfos import get_opinfo, opinfos, tensor_creation_ops
-from thunder.tests.utils import is_output_differentiable, filter_differentiable_outputs, turn_off_tf32_and_set_seed
+from thunder.tests.utils import is_output_differentiable, filter_differentiable_outputs
 
 # TODO: Move this to thunder.tests.opinfos
 op_skip = {
diff --git a/thunder/tests/test_jit_general.py b/thunder/tests/test_jit_general.py
index df22cbfd44..67ded25a8b 100644
--- a/thunder/tests/test_jit_general.py
+++ b/thunder/tests/test_jit_general.py
@@ -14,7 +14,6 @@
 import thunder
 from thunder.tests.framework import requiresCUDA, IS_WINDOWS
-from thunder.tests.utils import turn_off_tf32_and_set_seed
 from thunder.core.options import CACHE_OPTIONS
 import thunder.core.prims as prims
 from thunder import pytorch_executor, nvfuser_executor
@@ -707,6 +706,7 @@ def test_litgpt_variants(name, device, turn_off_tf32_and_set_seed):
     assert param1.grad is not None
     torch.testing.assert_close(param1.grad, param2.grad, rtol=1e-2, atol=1e-2)
 
+
 # Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
 # can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
 # This is expected due to the reduced precision of TF32 matrix multiplications.
diff --git a/thunder/tests/test_networks.py b/thunder/tests/test_networks.py
index d96b04d260..365750e547 100644
--- a/thunder/tests/test_networks.py
+++ b/thunder/tests/test_networks.py
@@ -10,7 +10,6 @@
 import torch
 import torch.nn as nn
 from torch.testing import assert_close, make_tensor
-from thunder.tests.utils import turn_off_tf32_and_set_seed
 
 import thunder
 import thunder.torch as ttorch
diff --git a/thunder/tests/test_update_aliases.py b/thunder/tests/test_update_aliases.py
index 3b61717911..d87bf2db4d 100644
--- a/thunder/tests/test_update_aliases.py
+++ b/thunder/tests/test_update_aliases.py
@@ -12,7 +12,6 @@
 from thunder.core.symbol import Symbol
 import thunder.core.devices as devices
 from thunder.tests.opinfos import opinfos, OpInfo, make_number, SampleInput
-from thunder.tests.utils import turn_off_tf32_and_set_seed
 from thunder.tests.make_tensor import make_tensor, make_tensor_like
 from thunder.tests.framework import (
     instantiate,
diff --git a/thunder/tests/utils.py b/thunder/tests/utils.py
index 9bca755ae6..12a9760beb 100644
--- a/thunder/tests/utils.py
+++ b/thunder/tests/utils.py
@@ -60,4 +60,4 @@ def turn_off_tf32_and_set_seed(monkeypatch):
     monkeypatch.setenv("NVIDIA_TF32_OVERRIDE", "0")
     torch.manual_seed(42)
     yield
-    torch.seed()
\ No newline at end of file
+    torch.seed()

From 87e2324c793f59fe55aab232a7c0fcdd61e937df Mon Sep 17 00:00:00 2001
From: Kaixi Matteo Chen
Date: Thu, 2 Oct 2025 09:52:55 +0000
Subject: [PATCH 6/7] Moved fixture location

---
 thunder/tests/conftest.py | 7 +++++++
 thunder/tests/utils.py    | 8 --------
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/thunder/tests/conftest.py b/thunder/tests/conftest.py
index af29e3fdec..8db84c9adf 100644
--- a/thunder/tests/conftest.py
+++ b/thunder/tests/conftest.py
@@ -77,3 +77,10 @@ def pytest_collection_modifyitems(items):
 
 def pytest_addoption(parser):
     parser.addoption("--gpu-mem-limit", type=float)
+
+@pytest.fixture
+def turn_off_tf32_and_set_seed(monkeypatch):
+    monkeypatch.setenv("NVIDIA_TF32_OVERRIDE", "0")
+    torch.manual_seed(42)
+    yield
+    torch.seed()
diff --git a/thunder/tests/utils.py b/thunder/tests/utils.py
index 12a9760beb..f68d516314 100644
--- a/thunder/tests/utils.py
+++ b/thunder/tests/utils.py
@@ -53,11 +53,3 @@ def wrapped_fn(*args, **kwargs):
             fn(*args, **kwargs)
 
     return wrapped_fn
-
-
-@pytest.fixture
-def turn_off_tf32_and_set_seed(monkeypatch):
-    monkeypatch.setenv("NVIDIA_TF32_OVERRIDE", "0")
-    torch.manual_seed(42)
-    yield
-    torch.seed()

From 1532dd3f0451a6cf31d747726342664f413652fc Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 2 Oct 2025 09:54:29 +0000
Subject: [PATCH 7/7] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 thunder/tests/conftest.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/thunder/tests/conftest.py b/thunder/tests/conftest.py
index 8db84c9adf..36d3aec0b1 100644
--- a/thunder/tests/conftest.py
+++ b/thunder/tests/conftest.py
@@ -78,6 +78,7 @@ def pytest_collection_modifyitems(items):
 def pytest_addoption(parser):
     parser.addoption("--gpu-mem-limit", type=float)
 
+
 @pytest.fixture
 def turn_off_tf32_and_set_seed(monkeypatch):
     monkeypatch.setenv("NVIDIA_TF32_OVERRIDE", "0")
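
Usage sketch (illustrative only, not part of the patches above): with the fixture in
thunder/tests/conftest.py as of patch 6, a test opts in either by naming the fixture as a
parameter or via pytest.mark.usefixtures, as test_both_column_and_row does in patch 4. The
test below is hypothetical and assumes a CUDA device; the tolerances are chosen for
illustration, not taken from the series.

    import torch

    def test_fp32_matmul_accuracy(turn_off_tf32_and_set_seed):
        # The fixture has already exported NVIDIA_TF32_OVERRIDE=0 and called
        # torch.manual_seed(42), so the inputs below are reproducible run to run.
        a = torch.randn(128, 128, device="cuda")
        b = torch.randn(128, 128, device="cuda")
        # With the TF32 override off, an fp32 matmul tracks a float64 reference
        # closely; with TF32 forced on, the max abs difference can reach ~1e-3,
        # which is the failure mode the comments in the patches describe.
        reference = (a.double() @ b.double()).float()
        torch.testing.assert_close(a @ b, reference, rtol=1e-5, atol=1e-5)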