From a5c6a4841de91485282b3e062bc66d55c1583084 Mon Sep 17 00:00:00 2001
From: Kaixi Matteo Chen
Date: Wed, 1 Oct 2025 15:11:10 +0000
Subject: [PATCH 1/7] Disabled TF32 on Ampere+ devices to stabilize numeric
 accuracy

---
 thunder/tests/distributed/test_tensor_parallel.py |  6 ++++++
 thunder/tests/test_grad.py                        |  6 ++++++
 thunder/tests/test_jit_general.py                 | 12 ++++++++++++
 thunder/tests/test_networks.py                    |  6 ++++++
 4 files changed, 30 insertions(+)

diff --git a/thunder/tests/distributed/test_tensor_parallel.py b/thunder/tests/distributed/test_tensor_parallel.py
index ea734d3623..2c26e318a3 100644
--- a/thunder/tests/distributed/test_tensor_parallel.py
+++ b/thunder/tests/distributed/test_tensor_parallel.py
@@ -154,6 +154,12 @@ def forward(self, x):
                 return h
 
         device = torch.device("cuda", self.rank)
+
+        # Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
+        # can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
+        # This is expected due to the reduced precision of TF32 matrix multiplications.
+        torch.backends.cuda.matmul.fp32_precision = 'ieee'
+
         x = torch.randint(0, num_embeddings - 1, (16, 16), device=device)
         x_ref = x.clone().detach()
 
diff --git a/thunder/tests/test_grad.py b/thunder/tests/test_grad.py
index e6f294acf3..770e3f6169 100644
--- a/thunder/tests/test_grad.py
+++ b/thunder/tests/test_grad.py
@@ -1500,6 +1500,12 @@ def test_populate_grads_nanogpt(executor, device, dtype):
 
     from thunder.benchmarks import NanoGPTBenchmark, NanoGPTConfig
 
+    # Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
+    # can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
+    # This is expected due to the reduced precision of TF32 matrix multiplications.
+    if torch.device(device).type == "cuda":
+        torch.backends.cuda.matmul.fp32_precision = 'ieee'
+
     # NOTE Currently setting dropout to zero for reproducibility
     config = NanoGPTConfig(dropout=0, n_layer=2, n_head=1, n_embd=64)
 
diff --git a/thunder/tests/test_jit_general.py b/thunder/tests/test_jit_general.py
index cf54b741e3..a22b89dd75 100644
--- a/thunder/tests/test_jit_general.py
+++ b/thunder/tests/test_jit_general.py
@@ -672,6 +672,12 @@ def test_litgpt_variants(name, device):
     from thunder.tests.litgpt_model import Config
     from litgpt.model import GPT
 
+    # Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
+    # can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
+    # This is expected due to the reduced precision of TF32 matrix multiplications.
+    if device == "cuda":
+        torch.backends.cuda.matmul.fp32_precision = 'ieee'
+
     if device == "cuda" and not torch.cuda.is_available():
         pytest.skip("CUDA not available")
 
@@ -734,6 +740,12 @@ def test_litgpt_variants_kvcache(name, device):
     if IS_WINDOWS:
         pytest.skip("slow on windows")
 
+    # Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
+    # can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
+    # This is expected due to the reduced precision of TF32 matrix multiplications.
+    if device == "cuda":
+        torch.backends.cuda.matmul.fp32_precision = 'ieee'
+
     device = torch.device(device)
     x = torch.randint(0, 200, (1, 2), device=device)
     config = Config.from_name(name)
 
diff --git a/thunder/tests/test_networks.py b/thunder/tests/test_networks.py
index dc0b29eca7..32b520ec49 100644
--- a/thunder/tests/test_networks.py
+++ b/thunder/tests/test_networks.py
@@ -42,6 +42,12 @@ def test_nanogpt_complete(executor, device, dtype, recwarn):
     tdtype = ttorch.to_torch_dtype(dtype)
     make = partial(make_tensor, dtype=torch.int64, device=device)
 
+    # Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
+    # can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
+    # This is expected due to the reduced precision of TF32 matrix multiplications.
+    if torch.device(device).type == "cuda":
+        torch.backends.cuda.matmul.fp32_precision = 'ieee'
+
     # Creates a nanoGPT model with a smaller size than any of the default options for testing
     # NOTE Sets dropout to zero for reproducibility
     config = nanogpt_model.GPTConfig(dropout=0, block_size=512, n_layer=6, n_head=6, n_embd=768)

From 66f1d0b9c9cc6b3ad95076c1e4deca725f956e5c Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 1 Oct 2025 15:28:18 +0000
Subject: [PATCH 2/7] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 thunder/tests/distributed/test_tensor_parallel.py | 2 +-
 thunder/tests/test_grad.py                        | 2 +-
 thunder/tests/test_jit_general.py                 | 4 ++--
 thunder/tests/test_networks.py                    | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/thunder/tests/distributed/test_tensor_parallel.py b/thunder/tests/distributed/test_tensor_parallel.py
index 2c26e318a3..f8f474ecf8 100644
--- a/thunder/tests/distributed/test_tensor_parallel.py
+++ b/thunder/tests/distributed/test_tensor_parallel.py
@@ -158,7 +158,7 @@ def forward(self, x):
         # Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
         # can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
         # This is expected due to the reduced precision of TF32 matrix multiplications.
-        torch.backends.cuda.matmul.fp32_precision = 'ieee'
+        torch.backends.cuda.matmul.fp32_precision = "ieee"
 
         x = torch.randint(0, num_embeddings - 1, (16, 16), device=device)
         x_ref = x.clone().detach()
diff --git a/thunder/tests/test_grad.py b/thunder/tests/test_grad.py
index 770e3f6169..4729473133 100644
--- a/thunder/tests/test_grad.py
+++ b/thunder/tests/test_grad.py
@@ -1504,7 +1504,7 @@ def test_populate_grads_nanogpt(executor, device, dtype):
     # can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
     # This is expected due to the reduced precision of TF32 matrix multiplications.
     if torch.device(device).type == "cuda":
-        torch.backends.cuda.matmul.fp32_precision = 'ieee'
+        torch.backends.cuda.matmul.fp32_precision = "ieee"
 
     # NOTE Currently setting dropout to zero for reproducibility
     config = NanoGPTConfig(dropout=0, n_layer=2, n_head=1, n_embd=64)
diff --git a/thunder/tests/test_jit_general.py b/thunder/tests/test_jit_general.py
index a22b89dd75..242ce39f73 100644
--- a/thunder/tests/test_jit_general.py
+++ b/thunder/tests/test_jit_general.py
@@ -676,7 +676,7 @@ def test_litgpt_variants(name, device):
     # can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
     # This is expected due to the reduced precision of TF32 matrix multiplications.
     if device == "cuda":
-        torch.backends.cuda.matmul.fp32_precision = 'ieee'
+        torch.backends.cuda.matmul.fp32_precision = "ieee"
 
     if device == "cuda" and not torch.cuda.is_available():
         pytest.skip("CUDA not available")
@@ -744,7 +744,7 @@ def test_litgpt_variants_kvcache(name, device):
     # can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
     # This is expected due to the reduced precision of TF32 matrix multiplications.
     if device == "cuda":
-        torch.backends.cuda.matmul.fp32_precision = 'ieee'
+        torch.backends.cuda.matmul.fp32_precision = "ieee"
 
     device = torch.device(device)
     x = torch.randint(0, 200, (1, 2), device=device)
diff --git a/thunder/tests/test_networks.py b/thunder/tests/test_networks.py
index 32b520ec49..7bdf42f71e 100644
--- a/thunder/tests/test_networks.py
+++ b/thunder/tests/test_networks.py
@@ -46,7 +46,7 @@ def test_nanogpt_complete(executor, device, dtype, recwarn):
     # can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
     # This is expected due to the reduced precision of TF32 matrix multiplications.
     if torch.device(device).type == "cuda":
-        torch.backends.cuda.matmul.fp32_precision = 'ieee'
+        torch.backends.cuda.matmul.fp32_precision = "ieee"
 
     # Creates a nanoGPT model with a smaller size than any of the default options for testing
     # NOTE Sets dropout to zero for reproducibility

From d115e4a5cf9ebd04bded1c24cfb08b036be5496f Mon Sep 17 00:00:00 2001
From: Kaixi Matteo Chen
Date: Wed, 1 Oct 2025 15:43:35 +0000
Subject: [PATCH 3/7] Reverted to old TF32 API

---
 thunder/tests/distributed/test_tensor_parallel.py | 2 +-
 thunder/tests/test_grad.py                        | 2 +-
 thunder/tests/test_jit_general.py                 | 4 ++--
 thunder/tests/test_networks.py                    | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/thunder/tests/distributed/test_tensor_parallel.py b/thunder/tests/distributed/test_tensor_parallel.py
index f8f474ecf8..c13aaa2a84 100644
--- a/thunder/tests/distributed/test_tensor_parallel.py
+++ b/thunder/tests/distributed/test_tensor_parallel.py
@@ -158,7 +158,7 @@ def forward(self, x):
         # Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
         # can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
         # This is expected due to the reduced precision of TF32 matrix multiplications.
-        torch.backends.cuda.matmul.fp32_precision = "ieee"
+        torch.backends.cuda.matmul.allow_tf32 = False
 
         x = torch.randint(0, num_embeddings - 1, (16, 16), device=device)
         x_ref = x.clone().detach()
diff --git a/thunder/tests/test_grad.py b/thunder/tests/test_grad.py
index 4729473133..5cef6e1e51 100644
--- a/thunder/tests/test_grad.py
+++ b/thunder/tests/test_grad.py
@@ -1504,7 +1504,7 @@ def test_populate_grads_nanogpt(executor, device, dtype):
     # can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
     # This is expected due to the reduced precision of TF32 matrix multiplications.
     if torch.device(device).type == "cuda":
-        torch.backends.cuda.matmul.fp32_precision = "ieee"
+        torch.backends.cuda.matmul.allow_tf32 = False
 
     # NOTE Currently setting dropout to zero for reproducibility
     config = NanoGPTConfig(dropout=0, n_layer=2, n_head=1, n_embd=64)
diff --git a/thunder/tests/test_jit_general.py b/thunder/tests/test_jit_general.py
index 242ce39f73..bf403e544c 100644
--- a/thunder/tests/test_jit_general.py
+++ b/thunder/tests/test_jit_general.py
@@ -676,7 +676,7 @@ def test_litgpt_variants(name, device):
     # can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
     # This is expected due to the reduced precision of TF32 matrix multiplications.
     if device == "cuda":
-        torch.backends.cuda.matmul.fp32_precision = "ieee"
+        torch.backends.cuda.matmul.allow_tf32 = False
 
     if device == "cuda" and not torch.cuda.is_available():
         pytest.skip("CUDA not available")
@@ -744,7 +744,7 @@ def test_litgpt_variants_kvcache(name, device):
     # can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
     # This is expected due to the reduced precision of TF32 matrix multiplications.
     if device == "cuda":
-        torch.backends.cuda.matmul.fp32_precision = "ieee"
+        torch.backends.cuda.matmul.allow_tf32 = False
 
     device = torch.device(device)
     x = torch.randint(0, 200, (1, 2), device=device)
diff --git a/thunder/tests/test_networks.py b/thunder/tests/test_networks.py
index 7bdf42f71e..c454f8c6e3 100644
--- a/thunder/tests/test_networks.py
+++ b/thunder/tests/test_networks.py
@@ -46,7 +46,7 @@ def test_nanogpt_complete(executor, device, dtype, recwarn):
     # can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
     # This is expected due to the reduced precision of TF32 matrix multiplications.
     if torch.device(device).type == "cuda":
-        torch.backends.cuda.matmul.fp32_precision = "ieee"
+        torch.backends.cuda.matmul.allow_tf32 = False
 
     # Creates a nanoGPT model with a smaller size than any of the default options for testing
     # NOTE Sets dropout to zero for reproducibility

From 878185223bd68a8321452b65fc22d24180335d78 Mon Sep 17 00:00:00 2001
From: Kaixi Matteo Chen
Date: Thu, 2 Oct 2025 09:27:26 +0000
Subject: [PATCH 4/7] Switched to pytest fixture

---
 .../tests/distributed/test_tensor_parallel.py | 10 ++++----
 thunder/tests/test_grad.py                    | 13 ++++------
 thunder/tests/test_jit_general.py             | 24 +++++++------------
 thunder/tests/test_networks.py                |  9 ++-----
 thunder/tests/test_update_aliases.py          |  9 +------
 thunder/tests/utils.py                        |  8 +++++++
 6 files changed, 30 insertions(+), 43 deletions(-)

diff --git a/thunder/tests/distributed/test_tensor_parallel.py b/thunder/tests/distributed/test_tensor_parallel.py
index c13aaa2a84..acf71a7ff3 100644
--- a/thunder/tests/distributed/test_tensor_parallel.py
+++ b/thunder/tests/distributed/test_tensor_parallel.py
@@ -12,6 +12,7 @@
 import thunder.executors
 from thunder.tests.distributed.helper import ToyModel, DistributedParallelTestCase
 from thunder.tests.distributed.modules import ParallelMLP
+from thunder.tests.utils import turn_off_tf32_and_set_seed
 
 from torch.testing._internal import common_utils
 
@@ -131,6 +132,10 @@ def forward(self, x):
             actual=tp_jitted_model.get_parameter("embed.weight").grad,
         )
 
+    # Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
+    # can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
+    # This is expected due to the reduced precision of TF32 matrix multiplications.
+    @pytest.mark.usefixtures("turn_off_tf32_and_set_seed")
     @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="")
     @common_utils.parametrize("bias", (True, False))
     def test_both_column_and_row(self, bias):
@@ -155,11 +160,6 @@ def forward(self, x):
 
         device = torch.device("cuda", self.rank)
 
-        # Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
-        # can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
-        # This is expected due to the reduced precision of TF32 matrix multiplications.
-        torch.backends.cuda.matmul.allow_tf32 = False
-
         x = torch.randint(0, num_embeddings - 1, (16, 16), device=device)
         x_ref = x.clone().detach()
diff --git a/thunder/tests/test_grad.py b/thunder/tests/test_grad.py
index 5cef6e1e51..24effb6615 100644
--- a/thunder/tests/test_grad.py
+++ b/thunder/tests/test_grad.py
@@ -30,7 +30,7 @@
 )
 from thunder.tests.make_tensor import make_tensor, make_tensor_like
 from thunder.tests.opinfos import get_opinfo, opinfos, tensor_creation_ops
-from thunder.tests.utils import is_output_differentiable, filter_differentiable_outputs
+from thunder.tests.utils import is_output_differentiable, filter_differentiable_outputs, turn_off_tf32_and_set_seed
 
 # TODO: Move this to thunder.tests.opinfos
 op_skip = {
@@ -1487,8 +1487,11 @@ def test_populate_grads_block(executor, device, dtype):
     assert_close(torch_grads, thunder_grads, atol=1e-2, rtol=1e-2)
 
 
+# Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
+# can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
+# This is expected due to the reduced precision of TF32 matrix multiplications.
 @instantiate(dtypes=(thunder.float32,))
-def test_populate_grads_nanogpt(executor, device, dtype):
+def test_populate_grads_nanogpt(executor, device, dtype, turn_off_tf32_and_set_seed):
     import sys
 
     if sys.platform == "win32":
@@ -1500,12 +1503,6 @@ def test_populate_grads_nanogpt(executor, device, dtype):
 
     from thunder.benchmarks import NanoGPTBenchmark, NanoGPTConfig
 
-    # Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
-    # can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
-    # This is expected due to the reduced precision of TF32 matrix multiplications.
-    if torch.device(device).type == "cuda":
-        torch.backends.cuda.matmul.allow_tf32 = False
-
     # NOTE Currently setting dropout to zero for reproducibility
     config = NanoGPTConfig(dropout=0, n_layer=2, n_head=1, n_embd=64)
diff --git a/thunder/tests/test_jit_general.py b/thunder/tests/test_jit_general.py
index bf403e544c..df22cbfd44 100644
--- a/thunder/tests/test_jit_general.py
+++ b/thunder/tests/test_jit_general.py
@@ -14,6 +14,7 @@
 import thunder
 from thunder.tests.framework import requiresCUDA, IS_WINDOWS
+from thunder.tests.utils import turn_off_tf32_and_set_seed
 from thunder.core.options import CACHE_OPTIONS
 import thunder.core.prims as prims
 from thunder import pytorch_executor, nvfuser_executor
@@ -649,6 +650,9 @@ def test_nanogpt():
     assert_close(result, module(*args, **kwargs))
 
 
+# Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
+# can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
+# This is expected due to the reduced precision of TF32 matrix multiplications.
 @skipif_not_pytorch_2_1
 @pytest.mark.parametrize(
     "name",
@@ -668,16 +672,10 @@ def test_nanogpt():
     "device",
     ("cpu", "cuda", "meta"),
 )
-def test_litgpt_variants(name, device):
+def test_litgpt_variants(name, device, turn_off_tf32_and_set_seed):
     from thunder.tests.litgpt_model import Config
     from litgpt.model import GPT
 
-    # Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
-    # can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
-    # This is expected due to the reduced precision of TF32 matrix multiplications.
-    if device == "cuda":
-        torch.backends.cuda.matmul.allow_tf32 = False
-
     if device == "cuda" and not torch.cuda.is_available():
         pytest.skip("CUDA not available")
 
@@ -709,7 +707,9 @@ def test_litgpt_variants(name, device):
         assert param1.grad is not None
         torch.testing.assert_close(param1.grad, param2.grad, rtol=1e-2, atol=1e-2)
 
-
+# Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
+# can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
+# This is expected due to the reduced precision of TF32 matrix multiplications.
 @skipif_not_pytorch_2_1
 @pytest.mark.parametrize(
     "name",
@@ -731,21 +731,15 @@ def test_litgpt_variants(name, device):
     "device",
     ("cpu", "cuda"),
 )
-def test_litgpt_variants_kvcache(name, device):
+def test_litgpt_variants_kvcache(name, device, turn_off_tf32_and_set_seed):
     from thunder.tests.litgpt_model import Config
     from litgpt.model import GPT
     import torch._dynamo  # this monkeypatches torch.manual_seed
 
     if IS_WINDOWS:
         pytest.skip("slow on windows")
 
-    # Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
-    # can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
-    # This is expected due to the reduced precision of TF32 matrix multiplications.
-    if device == "cuda":
-        torch.backends.cuda.matmul.allow_tf32 = False
-
     device = torch.device(device)
     x = torch.randint(0, 200, (1, 2), device=device)
     config = Config.from_name(name)
diff --git a/thunder/tests/test_networks.py b/thunder/tests/test_networks.py
index c454f8c6e3..d96b04d260 100644
--- a/thunder/tests/test_networks.py
+++ b/thunder/tests/test_networks.py
@@ -10,6 +10,7 @@
 import torch
 import torch.nn as nn
 from torch.testing import assert_close, make_tensor
+from thunder.tests.utils import turn_off_tf32_and_set_seed
 
 import thunder
 import thunder.torch as ttorch
@@ -38,16 +39,10 @@
 # see https://docs.pytest.org/en/stable/how-to/capture-warnings.html#recwarn for the recwarn fixture
 @instantiate(dtypes=(thunder.float32,), executors=all_test_executors_and_dynamo)
-def test_nanogpt_complete(executor, device, dtype, recwarn):
+def test_nanogpt_complete(executor, device, dtype, recwarn, turn_off_tf32_and_set_seed):
     tdtype = ttorch.to_torch_dtype(dtype)
     make = partial(make_tensor, dtype=torch.int64, device=device)
 
-    # Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
-    # can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
-    # This is expected due to the reduced precision of TF32 matrix multiplications.
-    if torch.device(device).type == "cuda":
-        torch.backends.cuda.matmul.allow_tf32 = False
-
     # Creates a nanoGPT model with a smaller size than any of the default options for testing
     # NOTE Sets dropout to zero for reproducibility
     config = nanogpt_model.GPTConfig(dropout=0, block_size=512, n_layer=6, n_head=6, n_embd=768)
diff --git a/thunder/tests/test_update_aliases.py b/thunder/tests/test_update_aliases.py
index 6eaa2fdfba..3b61717911 100644
--- a/thunder/tests/test_update_aliases.py
+++ b/thunder/tests/test_update_aliases.py
@@ -12,6 +12,7 @@
 from thunder.core.symbol import Symbol
 import thunder.core.devices as devices
 from thunder.tests.opinfos import opinfos, OpInfo, make_number, SampleInput
+from thunder.tests.utils import turn_off_tf32_and_set_seed
 from thunder.tests.make_tensor import make_tensor, make_tensor_like
 from thunder.tests.framework import (
     instantiate,
@@ -82,14 +83,6 @@ def inplace_masked_fill_sample_generator(op, device, dtype, requires_grad, **kwa
     _inplace_opinfos.append(inplace_opinfo)
 
 
-@pytest.fixture
-def turn_off_tf32_and_set_seed(monkeypatch):
-    monkeypatch.setenv("NVIDIA_TF32_OVERRIDE", "0")
-    torch.manual_seed(42)
-    yield
-    torch.seed()
-
-
 @instantiate(
     dtypes=(thunder.float32, thunder.float64),
     devicetypes=(devices.DeviceType.CUDA,),
diff --git a/thunder/tests/utils.py b/thunder/tests/utils.py
index f68d516314..9bca755ae6 100644
--- a/thunder/tests/utils.py
+++ b/thunder/tests/utils.py
@@ -53,3 +53,11 @@ def wrapped_fn(*args, **kwargs):
             fn(*args, **kwargs)
 
     return wrapped_fn
+
+
+@pytest.fixture
+def turn_off_tf32_and_set_seed(monkeypatch):
+    monkeypatch.setenv("NVIDIA_TF32_OVERRIDE", "0")
+    torch.manual_seed(42)
+    yield
+    torch.seed()
\ No newline at end of file

From 87d44e4bd59b822eeb88bccb10a028620539202f Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 2 Oct 2025 09:27:51 +0000
Subject: [PATCH 5/7] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 thunder/tests/distributed/test_tensor_parallel.py | 1 -
 thunder/tests/test_grad.py                        | 2 +-
 thunder/tests/test_jit_general.py                 | 2 +-
 thunder/tests/test_networks.py                    | 1 -
 thunder/tests/test_update_aliases.py              | 1 -
 thunder/tests/utils.py                            | 2 +-
 6 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/thunder/tests/distributed/test_tensor_parallel.py b/thunder/tests/distributed/test_tensor_parallel.py
index acf71a7ff3..8217f80bea 100644
--- a/thunder/tests/distributed/test_tensor_parallel.py
+++ b/thunder/tests/distributed/test_tensor_parallel.py
@@ -12,7 +12,6 @@
 import thunder.executors
 from thunder.tests.distributed.helper import ToyModel, DistributedParallelTestCase
 from thunder.tests.distributed.modules import ParallelMLP
-from thunder.tests.utils import turn_off_tf32_and_set_seed
 
 from torch.testing._internal import common_utils
 
diff --git a/thunder/tests/test_grad.py b/thunder/tests/test_grad.py
index 24effb6615..1a95ad0f8b 100644
--- a/thunder/tests/test_grad.py
+++ b/thunder/tests/test_grad.py
@@ -30,7 +30,7 @@
 )
 from thunder.tests.make_tensor import make_tensor, make_tensor_like
 from thunder.tests.opinfos import get_opinfo, opinfos, tensor_creation_ops
-from thunder.tests.utils import is_output_differentiable, filter_differentiable_outputs, turn_off_tf32_and_set_seed
+from thunder.tests.utils import is_output_differentiable, filter_differentiable_outputs
 
 # TODO: Move this to thunder.tests.opinfos
 op_skip = {
diff --git a/thunder/tests/test_jit_general.py b/thunder/tests/test_jit_general.py
index df22cbfd44..67ded25a8b 100644
--- a/thunder/tests/test_jit_general.py
+++ b/thunder/tests/test_jit_general.py
@@ -14,7 +14,6 @@
 import thunder
 from thunder.tests.framework import requiresCUDA, IS_WINDOWS
-from thunder.tests.utils import turn_off_tf32_and_set_seed
 from thunder.core.options import CACHE_OPTIONS
 import thunder.core.prims as prims
 from thunder import pytorch_executor, nvfuser_executor
@@ -707,6 +706,7 @@ def test_litgpt_variants(name, device, turn_off_tf32_and_set_seed):
     assert param1.grad is not None
     torch.testing.assert_close(param1.grad, param2.grad, rtol=1e-2, atol=1e-2)
 
+
 # Note: When running with TF32 enabled on CUDA, the maximum absolute difference between outputs
 # can be on the order of 1e-3, which exceeds the default tolerances for torch.testing.assert_close.
 # This is expected due to the reduced precision of TF32 matrix multiplications.
diff --git a/thunder/tests/test_networks.py b/thunder/tests/test_networks.py
index d96b04d260..365750e547 100644
--- a/thunder/tests/test_networks.py
+++ b/thunder/tests/test_networks.py
@@ -10,7 +10,6 @@
 import torch
 import torch.nn as nn
 from torch.testing import assert_close, make_tensor
-from thunder.tests.utils import turn_off_tf32_and_set_seed
 
 import thunder
 import thunder.torch as ttorch
diff --git a/thunder/tests/test_update_aliases.py b/thunder/tests/test_update_aliases.py
index 3b61717911..d87bf2db4d 100644
--- a/thunder/tests/test_update_aliases.py
+++ b/thunder/tests/test_update_aliases.py
@@ -12,7 +12,6 @@
 from thunder.core.symbol import Symbol
 import thunder.core.devices as devices
 from thunder.tests.opinfos import opinfos, OpInfo, make_number, SampleInput
-from thunder.tests.utils import turn_off_tf32_and_set_seed
 from thunder.tests.make_tensor import make_tensor, make_tensor_like
 from thunder.tests.framework import (
     instantiate,
diff --git a/thunder/tests/utils.py b/thunder/tests/utils.py
index 9bca755ae6..12a9760beb 100644
--- a/thunder/tests/utils.py
+++ b/thunder/tests/utils.py
@@ -60,4 +60,4 @@ def turn_off_tf32_and_set_seed(monkeypatch):
     monkeypatch.setenv("NVIDIA_TF32_OVERRIDE", "0")
     torch.manual_seed(42)
     yield
-    torch.seed()
\ No newline at end of file
+    torch.seed()

From 87e2324c793f59fe55aab232a7c0fcdd61e937df Mon Sep 17 00:00:00 2001
From: Kaixi Matteo Chen
Date: Thu, 2 Oct 2025 09:52:55 +0000
Subject: [PATCH 6/7] Moved fixture location

---
 thunder/tests/conftest.py | 7 +++++++
 thunder/tests/utils.py    | 8 --------
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/thunder/tests/conftest.py b/thunder/tests/conftest.py
index af29e3fdec..8db84c9adf 100644
--- a/thunder/tests/conftest.py
+++ b/thunder/tests/conftest.py
@@ -77,3 +77,10 @@ def pytest_collection_modifyitems(items):
 
 def pytest_addoption(parser):
     parser.addoption("--gpu-mem-limit", type=float)
+
+@pytest.fixture
+def turn_off_tf32_and_set_seed(monkeypatch):
+    monkeypatch.setenv("NVIDIA_TF32_OVERRIDE", "0")
+    torch.manual_seed(42)
+    yield
+    torch.seed()
diff --git a/thunder/tests/utils.py b/thunder/tests/utils.py
index 12a9760beb..f68d516314 100644
--- a/thunder/tests/utils.py
+++ b/thunder/tests/utils.py
@@ -53,11 +53,3 @@ def wrapped_fn(*args, **kwargs):
             fn(*args, **kwargs)
 
     return wrapped_fn
-
-
-@pytest.fixture
-def turn_off_tf32_and_set_seed(monkeypatch):
-    monkeypatch.setenv("NVIDIA_TF32_OVERRIDE", "0")
-    torch.manual_seed(42)
-    yield
-    torch.seed()

From 1532dd3f0451a6cf31d747726342664f413652fc Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 2 Oct 2025 09:54:29 +0000
Subject: [PATCH 7/7] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 thunder/tests/conftest.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/thunder/tests/conftest.py b/thunder/tests/conftest.py
index 8db84c9adf..36d3aec0b1 100644
--- a/thunder/tests/conftest.py
+++ b/thunder/tests/conftest.py
@@ -78,6 +78,7 @@ def pytest_collection_modifyitems(items):
 def pytest_addoption(parser):
     parser.addoption("--gpu-mem-limit", type=float)
 
+
 @pytest.fixture
 def turn_off_tf32_and_set_seed(monkeypatch):
     monkeypatch.setenv("NVIDIA_TF32_OVERRIDE", "0")
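
Usage sketch (illustrative only, not part of the patches above): with the fixture in
thunder/tests/conftest.py as of patch 6, a test opts in either by naming the fixture as a
parameter or via pytest.mark.usefixtures, as test_both_column_and_row does in patch 4. The
test below is hypothetical and assumes a CUDA device; the tolerances are chosen for
illustration, not taken from the series.

    import torch

    def test_fp32_matmul_accuracy(turn_off_tf32_and_set_seed):
        # The fixture has already exported NVIDIA_TF32_OVERRIDE=0 and called
        # torch.manual_seed(42), so the inputs below are reproducible run to run.
        a = torch.randn(128, 128, device="cuda")
        b = torch.randn(128, 128, device="cuda")
        # With the TF32 override off, an fp32 matmul tracks a float64 reference
        # closely; with TF32 forced on, the max abs difference can reach ~1e-3,
        # which is the failure mode the comments in the patches describe.
        reference = (a.double() @ b.double()).float()
        torch.testing.assert_close(a @ b, reference, rtol=1e-5, atol=1e-5)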