Commit 2fb6ee3

Simpler solution and fixes
Signed-off-by: Przemek Tredak <[email protected]>
1 parent 0fc2a62 commit 2fb6ee3

9 files changed: +124 -92 lines changed


tests/pytorch/test_numerics.py

Lines changed: 70 additions & 54 deletions
@@ -5,6 +5,7 @@
 import math
 import os
 from typing import Dict, List, Tuple, Optional
+import warnings
 import pytest
 import random

@@ -1296,14 +1297,15 @@ def test_linear_accuracy_delay_wgrad_compute(dtype, bs, model, bias, fuse_wgrad_
     ).eval()

     # Share params
-    with torch.no_grad():
-        te_linear_ref.weight = Parameter(te_linear.weight.clone())
-        if bias:
-            te_linear_ref.bias = Parameter(te_linear.bias.clone())
-        if fuse_wgrad_accumulation:
-            weight = getattr(te_linear, f"weight")
-            weight.main_grad = torch.rand_like(weight, dtype=torch.float32)
-            te_linear_ref.weight.main_grad = weight.main_grad.clone()
+    with warnings.catch_warnings(action="ignore", category=RuntimeWarning):
+        with torch.no_grad():
+            te_linear_ref.weight = Parameter(te_linear.weight.clone())
+            if bias:
+                te_linear_ref.bias = Parameter(te_linear.bias.clone())
+            if fuse_wgrad_accumulation:
+                weight = getattr(te_linear, f"weight")
+                weight.main_grad = torch.rand_like(weight, dtype=torch.float32)
+                te_linear_ref.weight.main_grad = weight.main_grad.clone()

     te_outputs = _test_granular_accuracy(te_linear, bs, dtype, config, delay_wgrad_compute=True)
     te_outputs_ref = _test_granular_accuracy(
@@ -1359,12 +1361,13 @@ def test_linear_accuracy_save_original_input(dtype, model, recipe):
     ).eval()

     # Share params
-    with torch.no_grad():
-        te_linear_ref.weight = Parameter(te_linear.weight.clone())
-        if fuse_wgrad_accumulation:
-            weight = getattr(te_linear, f"weight")
-            weight.main_grad = torch.rand_like(weight, dtype=torch.float32)
-            te_linear_ref.weight.main_grad = weight.main_grad.clone()
+    with warnings.catch_warnings(action="ignore", category=RuntimeWarning):
+        with torch.no_grad():
+            te_linear_ref.weight = Parameter(te_linear.weight.clone())
+            if fuse_wgrad_accumulation:
+                weight = getattr(te_linear, f"weight")
+                weight.main_grad = torch.rand_like(weight, dtype=torch.float32)
+                te_linear_ref.weight.main_grad = weight.main_grad.clone()

     te_outputs = _test_granular_accuracy(te_linear, bs, dtype, config, recipe=recipe)
     te_outputs_ref = _test_granular_accuracy(te_linear_ref, bs, dtype, config, recipe=recipe)
@@ -1601,17 +1604,18 @@ def test_layernorm_linear_accuracy_delay_wgrad_compute(
     ).eval()

     # Share params
-    with torch.no_grad():
-        ln_linear_ref.layer_norm_weight = Parameter(ln_linear.layer_norm_weight.clone())
-        if normalization != "RMSNorm":
-            ln_linear_ref.layer_norm_bias = Parameter(ln_linear.layer_norm_bias.clone())
-        ln_linear_ref.weight = Parameter(ln_linear.weight.clone())
-        if bias:
-            ln_linear_ref.bias = Parameter(ln_linear.bias.clone())
-        if fuse_wgrad_accumulation:
-            weight = getattr(ln_linear, f"weight")
-            weight.main_grad = torch.rand_like(weight, dtype=torch.float32)
-            ln_linear_ref.weight.main_grad = weight.main_grad.clone()
+    with warnings.catch_warnings(action="ignore", category=RuntimeWarning):
+        with torch.no_grad():
+            ln_linear_ref.layer_norm_weight = Parameter(ln_linear.layer_norm_weight.clone())
+            if normalization != "RMSNorm":
+                ln_linear_ref.layer_norm_bias = Parameter(ln_linear.layer_norm_bias.clone())
+            ln_linear_ref.weight = Parameter(ln_linear.weight.clone())
+            if bias:
+                ln_linear_ref.bias = Parameter(ln_linear.bias.clone())
+            if fuse_wgrad_accumulation:
+                weight = getattr(ln_linear, f"weight")
+                weight.main_grad = torch.rand_like(weight, dtype=torch.float32)
+                ln_linear_ref.weight.main_grad = weight.main_grad.clone()

     te_outputs = _test_granular_accuracy(ln_linear, bs, dtype, config, delay_wgrad_compute=True)
     te_outputs_ref = _test_granular_accuracy(
@@ -1739,19 +1743,24 @@ def test_layernorm_mlp_accuracy_delay_wgrad_compute(
     ).eval()

     # Share params
-    with torch.no_grad():
-        ln_mlp_ref.layer_norm_weight = Parameter(ln_mlp.layer_norm_weight.clone())
-        ln_mlp_ref.layer_norm_bias = Parameter(ln_mlp.layer_norm_bias.clone())
-        ln_mlp_ref.fc1_weight = Parameter(ln_mlp.fc1_weight.clone())
-        ln_mlp_ref.fc2_weight = Parameter(ln_mlp.fc2_weight.clone())
-        if bias:
-            ln_mlp_ref.fc1_bias = Parameter(ln_mlp.fc1_bias.clone())
-            ln_mlp_ref.fc2_bias = Parameter(ln_mlp.fc2_bias.clone())
-        if fuse_wgrad_accumulation:
-            ln_mlp.fc1_weight.main_grad = torch.rand_like(ln_mlp.fc1_weight, dtype=torch.float32)
-            ln_mlp_ref.fc1_weight.main_grad = ln_mlp.fc1_weight.main_grad.clone()
-            ln_mlp.fc2_weight.main_grad = torch.rand_like(ln_mlp.fc2_weight, dtype=torch.float32)
-            ln_mlp_ref.fc2_weight.main_grad = ln_mlp.fc2_weight.main_grad.clone()
+    with warnings.catch_warnings(action="ignore", category=RuntimeWarning):
+        with torch.no_grad():
+            ln_mlp_ref.layer_norm_weight = Parameter(ln_mlp.layer_norm_weight.clone())
+            ln_mlp_ref.layer_norm_bias = Parameter(ln_mlp.layer_norm_bias.clone())
+            ln_mlp_ref.fc1_weight = Parameter(ln_mlp.fc1_weight.clone())
+            ln_mlp_ref.fc2_weight = Parameter(ln_mlp.fc2_weight.clone())
+            if bias:
+                ln_mlp_ref.fc1_bias = Parameter(ln_mlp.fc1_bias.clone())
+                ln_mlp_ref.fc2_bias = Parameter(ln_mlp.fc2_bias.clone())
+            if fuse_wgrad_accumulation:
+                ln_mlp.fc1_weight.main_grad = torch.rand_like(
+                    ln_mlp.fc1_weight, dtype=torch.float32
+                )
+                ln_mlp_ref.fc1_weight.main_grad = ln_mlp.fc1_weight.main_grad.clone()
+                ln_mlp.fc2_weight.main_grad = torch.rand_like(
+                    ln_mlp.fc2_weight, dtype=torch.float32
+                )
+                ln_mlp_ref.fc2_weight.main_grad = ln_mlp.fc2_weight.main_grad.clone()

     te_outputs = _test_granular_accuracy(ln_mlp, bs, dtype, config, delay_wgrad_compute=True)
     te_outputs_ref = _test_granular_accuracy(
@@ -1796,14 +1805,15 @@ def test_layernorm_mlp_accuracy_checkpoint(
     ).eval()

     # Share params
-    with torch.no_grad():
-        ln_mlp_ref.layer_norm_weight = Parameter(ln_mlp.layer_norm_weight.clone())
-        ln_mlp_ref.layer_norm_bias = Parameter(ln_mlp.layer_norm_bias.clone())
-        ln_mlp_ref.fc1_weight = Parameter(ln_mlp.fc1_weight.clone())
-        ln_mlp_ref.fc2_weight = Parameter(ln_mlp.fc2_weight.clone())
-        if bias:
-            ln_mlp_ref.fc1_bias = Parameter(ln_mlp.fc1_bias.clone())
-            ln_mlp_ref.fc2_bias = Parameter(ln_mlp.fc2_bias.clone())
+    with warnings.catch_warnings(action="ignore", category=RuntimeWarning):
+        with torch.no_grad():
+            ln_mlp_ref.layer_norm_weight = Parameter(ln_mlp.layer_norm_weight.clone())
+            ln_mlp_ref.layer_norm_bias = Parameter(ln_mlp.layer_norm_bias.clone())
+            ln_mlp_ref.fc1_weight = Parameter(ln_mlp.fc1_weight.clone())
+            ln_mlp_ref.fc2_weight = Parameter(ln_mlp.fc2_weight.clone())
+            if bias:
+                ln_mlp_ref.fc1_bias = Parameter(ln_mlp.fc1_bias.clone())
+                ln_mlp_ref.fc2_bias = Parameter(ln_mlp.fc2_bias.clone())

     te_outputs = _test_granular_accuracy(ln_mlp, bs, dtype, config, delay_wgrad_compute=False)
     te_outputs_ref = _test_granular_accuracy(
@@ -1952,9 +1962,13 @@ def test_grouped_linear_accuracy(
     # Share params
     with torch.no_grad():
         for i in range(num_gemms):
-            sequential_linear[i].weight = Parameter(getattr(grouped_linear, f"weight{i}").clone())
+            sequential_linear[i].module_setattr(
+                "weight", Parameter(getattr(grouped_linear, f"weight{i}").clone())
+            )
             if bias:
-                sequential_linear[i].bias = Parameter(getattr(grouped_linear, f"bias{i}").clone())
+                sequential_linear[i].module_setattr(
+                    "bias", Parameter(getattr(grouped_linear, f"bias{i}").clone())
+                )
             if fuse_wgrad_accumulation:
                 weight_i = getattr(grouped_linear, f"weight{i}")
                 weight_i.main_grad = torch.rand_like(weight_i, dtype=torch.float32)
@@ -2096,9 +2110,13 @@ def test_grouped_linear_accuracy_save_original_input(
     # Share params
     with torch.no_grad():
         for i in range(num_gemms):
-            sequential_linear[i].weight = Parameter(getattr(grouped_linear, f"weight{i}").clone())
+            sequential_linear[i].module_setattr(
+                "weight", Parameter(getattr(grouped_linear, f"weight{i}").clone())
+            )
             if bias:
-                sequential_linear[i].bias = Parameter(getattr(grouped_linear, f"bias{i}").clone())
+                sequential_linear[i].module_setattr(
+                    "bias", Parameter(getattr(grouped_linear, f"bias{i}").clone())
+                )
             if fuse_wgrad_accumulation:
                 weight_i = getattr(grouped_linear, f"weight{i}")
                 weight_i.main_grad = torch.rand_like(weight_i, dtype=torch.float32)
@@ -2298,8 +2316,7 @@ def test_padding_grouped_linear_accuracy(
     with torch.no_grad():
         inner_grouped_linear = grouped_linear.linear_fn
         for i in range(num_gemms):
-            setattr(
-                ref_grouped_linear,
+            ref_grouped_linear.module_setattr(
                 f"weight{i}",
                 Parameter(getattr(inner_grouped_linear, f"weight{i}").clone()),
             )
@@ -2375,8 +2392,7 @@ def test_padding_grouped_linear_accuracy_save_original_input(
     with torch.no_grad():
         inner_grouped_linear = grouped_linear.linear_fn
         for i in range(num_gemms):
-            setattr(
-                ref_grouped_linear,
+            ref_grouped_linear.module_setattr(
                 f"weight{i}",
                 Parameter(getattr(inner_grouped_linear, f"weight{i}").clone()),
            )
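
The recurring change in this test file is that every block reassigning parameters on an already-constructed TE module is wrapped in warnings.catch_warnings(action="ignore", category=RuntimeWarning), presumably to silence the RuntimeWarning that the base module now emits when attributes are overwritten directly (see the _default_setattr/_initialized change in dot_product_attention.py below). A minimal, self-contained sketch of the suppression pattern; the emit_runtime_warning helper is hypothetical, and the keyword form of catch_warnings requires Python 3.11 or newer:

import warnings

def emit_runtime_warning() -> None:
    # Hypothetical stand-in for a module raising a RuntimeWarning
    # when one of its attributes is overwritten directly.
    warnings.warn("attribute overwritten on module", RuntimeWarning)

# Inside the block, RuntimeWarning is ignored; other warning categories still propagate.
with warnings.catch_warnings(action="ignore", category=RuntimeWarning):
    emit_runtime_warning()  # silenced

emit_runtime_warning()  # outside the block the warning is emitted as usual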

transformer_engine/pytorch/attention/dot_product_attention/dot_product_attention.py

Lines changed: 6 additions & 6 deletions
@@ -482,7 +482,7 @@ def remove_extra_states_check(self, incompatible_keys): # pylint: disable=unuse

         self.register_load_state_dict_post_hook(remove_extra_states_check)

-        self._default_setattr = self._warning_setattr
+        self._initialized = True

     def _load_from_state_dict(
         self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
@@ -678,9 +678,9 @@ def init_fp8_metadata(self, num_gemms: int = 1) -> None:
        # assume attention uses the same fp8_group as GEMMs
        fp8_group = FP8GlobalStateManager.get_fp8_group()

-       self.fp8_parameters = FP8GlobalStateManager.with_fp8_parameters()
-       self.fp8 = FP8GlobalStateManager.is_fp8_enabled()
-       self.fp8_calibration = FP8GlobalStateManager.is_fp8_calibration()
+       self.fast_setattr("fp8_parameters", FP8GlobalStateManager.with_fp8_parameters())
+       self.fast_setattr("fp8", FP8GlobalStateManager.is_fp8_enabled())
+       self.fast_setattr("fp8_calibration", FP8GlobalStateManager.is_fp8_calibration())
        fp8_enabled = self.fp8 or self.fp8_calibration
        self.fp8_meta["fp8_checkpoint"] = self.fp8 or self.fp8_calibration
        if self.fp8_parameters or fp8_enabled:
@@ -705,7 +705,7 @@ def init_fp8_metadata(self, num_gemms: int = 1) -> None:
            )
        else:
            # If fp8 isn't enabled, turn off and return.
-           self.fp8_initialized = False
+           self.fast_setattr("fp8_initialized", False)
            return

        if self.fp8_parameters and not self.fp8_initialized:
@@ -723,7 +723,7 @@ def init_fp8_metadata(self, num_gemms: int = 1) -> None:

        # Allocate scales and amaxes
        self.init_fp8_meta_tensors(fp8_recipes)
-       self.fp8_initialized = True
+       self.fast_setattr("fp8_initialized", True)

        self.fp8_meta["recipe"] = fp8_recipe_dpa
        if fp8_recipe != fp8_recipe_dpa:
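
This diff swaps direct attribute assignments (self.fp8 = ...) for self.fast_setattr(...). The implementation of fast_setattr and module_setattr is not shown in this commit, so the sketch below is only an assumption about what such helpers typically look like: fast_setattr bypasses torch.nn.Module.__setattr__ (which checks the parameter, buffer and submodule registries on every assignment) via object.__setattr__, while module_setattr keeps the full registration path for values that must be tracked, such as Parameters.

import torch

class FastAttrModule(torch.nn.Module):
    """Illustrative only -- not the Transformer Engine implementation."""

    def fast_setattr(self, name: str, value) -> None:
        # Skip nn.Module.__setattr__ bookkeeping; suitable for plain
        # Python attributes such as flags (e.g. fp8_initialized).
        object.__setattr__(self, name, value)

    def module_setattr(self, name: str, value) -> None:
        # Full nn.Module path, so Parameters/buffers/submodules are
        # registered (and show up in state_dict, .parameters(), etc.).
        super().__setattr__(name, value)

m = FastAttrModule()
m.fast_setattr("fp8_initialized", False)                       # plain attribute, no registry scan
m.module_setattr("weight", torch.nn.Parameter(torch.ones(4)))  # registered parameter
print(list(m.state_dict().keys()))  # ['weight']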

transformer_engine/pytorch/distributed.py

Lines changed: 4 additions & 4 deletions
@@ -729,8 +729,8 @@ def checkpoint(
    if isinstance(function, TransformerEngineBaseModule):
        # If this TE module is FSDP-wrapped, clear its FSDP group information because there's no need
        # to scatter/gather activations that we will recompute anyway.
-       setattr(function, "fsdp_wrapped", False)
-       setattr(function, "fsdp_group", None)
+       function.fast_setattr("fsdp_wrapped", False)
+       function.fast_setattr("fsdp_group", None)

    # Otherwise discard unused te.utils.checkpoint.checkpoint() arguments
    # and execute TE's own checkpointing
@@ -2046,7 +2046,7 @@ def prepare_te_modules_for_fsdp(fsdp_root: torch.nn.Module) -> None:
        )
    root_state = _get_module_fsdp_state(fsdp_root)
    assert root_state is not None, "Root module does not have a valid _FSDPState."
-   setattr(fsdp_root.module, "fsdp_group", root_state.process_group)
+   fsdp_root.module.fast_setattr("fsdp_group", root_state.process_group)

    # Iterate through all FSDP-wrapped submodules and inject FSDP information into TE modules
    fsdp_states, fsdp_modules = _get_fsdp_states_with_modules(fsdp_root)
@@ -2057,7 +2057,7 @@ def prepare_te_modules_for_fsdp(fsdp_root: torch.nn.Module) -> None:
                "TE modules with primary weights in FP8 cannot be FSDP-wrapped. "
                "Please initialize your model without the te.quantized_model_init(...) context."
            )
        fsdp_module.module.fast_setattr("fsdp_group", state.process_group)


class FullyShardedDataParallel(FSDP):

transformer_engine/pytorch/graph.py

Lines changed: 5 additions & 3 deletions
@@ -935,16 +935,18 @@ def new_fwd(*user_args, **user_kwargs):

            forward = make_graphed_forward(func, func.training, graphed, func.forward, te_modules)
            if _order is None:
-               func.forward = forward
+               with warnings.catch_warnings(action="ignore", category=RuntimeWarning):
+                   func.forward = forward
                ret.append(func)
            else:
                ret.append(forward)
        else:
            ret.append(graphed)

        backward_dw_func, reset_func = make_graphed_attribute_functions(i)
-       setattr(ret[-1], "backward_dw", backward_dw_func)
-       setattr(ret[-1], "reset", reset_func)
+       with warnings.catch_warnings(action="ignore", category=RuntimeWarning):
+           setattr(ret[-1], "backward_dw", backward_dw_func)
+           setattr(ret[-1], "reset", reset_func)

    if just_one_callable:
        return ret[0]

0 commit comments