Commit 867687a

Authored by Sudharshan Govindan (sudhu2k) and co-authors

Release v2.2 cherry-picks and bugfixes for Megatron-LM (#362)
* Ensure weight transpose is valid for FP8 training (#1596) (#276)
* Update usage of weightmat before saving for backward
* Added keep_fp8_weight_transpose_cache checks while updating transpose in fwd pass (#298)
* Added keep_fp8_weight_transpose_cache checks while updating transpose
* Added unit test for the fix
* Added comment for the unit test
* Fixed comment
* Reverted test to single iteration; added assert statements to check for the transpose cache; modified docstring
* Fixed test_numerics spacing
* Added HIP guards
* Addressed PR comments and moved assertion statements under the FP8 check
* Reverted assertion to fix the dev ticket
* Removed spacing
* Bug fix for get_fp8_metas
* Added keep_fp8_transpose_cache fix for base.py
* Added _fp8_metas check for None
* Added comment

Co-authored-by: Sudharshan Govindan <sugovind@amd.com>

Parent: bb087d0 · Commit: 867687a

File tree

6 files changed (+73, -32 lines)

tests/pytorch/test_numerics.py

Lines changed: 14 additions & 9 deletions
@@ -1274,13 +1274,21 @@ def test_linear_accuracy(dtype, bs, model, return_bias, bias):
 @pytest.mark.parametrize("bs", batch_sizes)
 @pytest.mark.parametrize("model", ["small"])
 @pytest.mark.parametrize("fp8_model_params", all_boolean)
-def test_fp8_linear_without_transpose_cache_accuracy(dtype, bs, model, fp8_model_params):
+@pytest.mark.parametrize("module_str", ["linear", "layernorm_mlp", "layernorm_linear"])
+def test_fp8_linear_without_transpose_cache_accuracy(dtype, bs, model, fp8_model_params, module_str):
     reset_rng_states()
     FP8GlobalStateManager.reset()
 
+    if module_str == "linear":
+        module = Linear
+    elif module_str == "layernorm_mlp":
+        module = LayerNormMLP
+    elif module_str == "layernorm_linear":
+        module = LayerNormLinear
+
     config = model_configs[model]
     with fp8_model_init(enabled=fp8_model_params):
-        linear = Linear(
+        layer = module(
             config.hidden_size,
             4 * config.hidden_size,
             bias=True,
@@ -1289,20 +1297,17 @@ def test_fp8_linear_without_transpose_cache_accuracy(dtype, bs, model, fp8_model
             keep_fp8_weight_transpose_cache=False
         ).eval()
 
-    ref_linear = Linear(
+    reset_rng_states()
+    ref_layer = module(
         config.hidden_size,
         4 * config.hidden_size,
        bias=True,
        params_dtype=dtype,
        device="cuda",
    ).eval()
 
-    # Share params
-    with torch.no_grad():
-        ref_linear.weight = Parameter(linear.weight.clone())
-        ref_linear.bias = Parameter(linear.bias.clone())
-    outputs = _test_granular_accuracy_with_fp8(linear, bs, dtype, config)
-    ref_outputs = _test_granular_accuracy_with_fp8(ref_linear, bs, dtype, config)
+    outputs = _test_granular_accuracy_with_fp8(layer, bs, dtype, config)
+    ref_outputs = _test_granular_accuracy_with_fp8(ref_layer, bs, dtype, config)
 
     # Check output.
     for te_output, torch_output in zip(outputs, ref_outputs):
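
For local verification outside the test harness, the same construction pattern can be exercised directly. The sketch below is illustrative only: it assumes a CUDA device with FP8 support and the public transformer_engine.pytorch entry points (Linear, fp8_model_init, fp8_autocast); the shapes and dtype are arbitrary and not taken from the test configs.

import torch
import transformer_engine.pytorch as te

hidden = 256  # arbitrary size for illustration

# Layer that recomputes the FP8 weight transpose instead of caching it.
with te.fp8_model_init(enabled=True):
    layer = te.Linear(
        hidden,
        4 * hidden,
        bias=True,
        params_dtype=torch.bfloat16,
        device="cuda",
        keep_fp8_weight_transpose_cache=False,
    )

x = torch.randn(32, hidden, dtype=torch.bfloat16, device="cuda")
with te.fp8_autocast(enabled=True):
    out = layer(x)
out.sum().backward()  # numerics should match a layer built with the default cache setting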

transformer_engine/pytorch/module/base.py

Lines changed: 1 addition & 1 deletion
@@ -1009,7 +1009,7 @@ def get_weight_workspace(
         if update_workspace and quantizer is not None:
             tensor.update_usage(
                 rowwise_usage=quantizer.rowwise_usage,
-                columnwise_usage=quantizer.columnwise_usage,
+                columnwise_usage=quantizer.columnwise_usage and create_transpose_cache,
             )
         return tensor
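
The intent of this one-line change is that the refreshed weight workspace only carries column-wise (transpose) data when the transpose cache is actually being kept. A standalone sketch of that gating, using a stand-in class rather than the real QuantizedTensor (all names here are hypothetical):

class FakeQuantizedTensor:
    """Stand-in for a quantized weight workspace (illustrative only)."""

    def __init__(self):
        self.rowwise_data = True
        self.columnwise_data = True

    def update_usage(self, rowwise_usage=True, columnwise_usage=True):
        # Dropping column-wise usage is what frees the transpose buffer.
        self.rowwise_data = rowwise_usage
        self.columnwise_data = columnwise_usage


def refresh_workspace(tensor, quantizer_columnwise_usage, create_transpose_cache):
    # Mirror of the patched logic: keep the transpose only when the cache is wanted.
    tensor.update_usage(
        rowwise_usage=True,
        columnwise_usage=quantizer_columnwise_usage and create_transpose_cache,
    )
    return tensor


ws = refresh_workspace(FakeQuantizedTensor(), True, create_transpose_cache=False)
assert ws.columnwise_data is False  # transpose buffer not retained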

transformer_engine/pytorch/module/layernorm_linear.py

Lines changed: 18 additions & 7 deletions
@@ -339,9 +339,10 @@ def forward(
         if isinstance(ln_out, MXFP8TensorBase) or not ctx.ln_out_needs_gather:
             ln_out.update_usage(rowwise_usage=False)
 
-        # Weight with column-wise usage is needed for dgrad GEMM.
-        if isinstance(weightmat, QuantizedTensor):
-            weightmat.update_usage(columnwise_usage=True)
+        # Weight with column-wise usage is needed for dgrad GEMM while keeping fp8 weight transpose cache.
+        if inp.requires_grad and keep_fp8_weight_transpose_cache:
+            if isinstance(weightmat, QuantizedTensor):
+                weightmat.update_usage(columnwise_usage=True)
 
         if cpu_offloading:
             if fp8 and weightmat is not None:
@@ -975,10 +976,20 @@ class LayerNormLinear(TransformerEngineBaseModule):
                           it controls the type used to allocate the initial parameters. Useful when
                           the model is trained with lower precision and the original FP32 parameters
                           would not fit in GPU memory.
-    keep_fp8_weight_transpose_cache: bool, default = 'True'
-                          if set to `False`, it will not cache fp8 weight buffer instead of
-                          recomputing fp8 weight transpose. Recommend set to `False` when
-                          enable FSDP parallel.
+    keep_fp8_weight_transpose_cache: bool, default = True
+                          Controls whether to cache the FP8 weight transpose buffer during training.
+
+                          - If set to `True` (default), the FP8 weight transpose buffer is cached to avoid recomputation,
+                            which can improve performance but significantly increases memory usage.
+                          - If set to `False`, the buffer is not cached and the FP8 weight transpose is recomputed as needed.
+                            This reduces memory consumption, especially during checkpoint loading and runtime.
+
+                          **Recommendation**: Set this to `False` when using Fully Sharded Data Parallel (FSDP) training.
+                          Caching FP8 weight transposes can double memory usage for modules such as `Linear`,
+                          `LayerNormLinear`, and `LayerNormMLP`, which may lead to excessive memory pressure and
+                          reduced efficiency of PyTorch's caching allocator.
+
+                          Use this setting to balance memory usage and performance based on your training configuration.
 
    """
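
As a usage reference for the option documented above, here is a minimal sketch of opting out of the transpose cache on this module. It assumes a CUDA device with FP8 support and the standard transformer_engine.pytorch import path; the sizes are illustrative.

import torch
import transformer_engine.pytorch as te

hidden = 1024  # illustrative
layer = te.LayerNormLinear(
    hidden,
    3 * hidden,
    params_dtype=torch.bfloat16,
    device="cuda",
    keep_fp8_weight_transpose_cache=False,  # recompute the transpose, save memory
)

x = torch.randn(8, hidden, dtype=torch.bfloat16, device="cuda", requires_grad=True)
with te.fp8_autocast(enabled=True):
    y = layer(x)
y.sum().backward()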

transformer_engine/pytorch/module/layernorm_mlp.py

Lines changed: 16 additions & 6 deletions
@@ -426,8 +426,8 @@ def forward(
             extra_output=rs_out,
         )
 
-        # Weight with column-wise usage is needed for dgrad GEMM.
-        if is_grad_enabled:
+        # Weight with column-wise usage is needed for dgrad GEMM while keeping fp8 weight transpose cache.
+        if is_grad_enabled and inp.requires_grad and keep_fp8_weight_transpose_cache:
             if isinstance(fc1_weight_final, QuantizedTensor):
                 fc1_weight_final.update_usage(columnwise_usage=True)
             if isinstance(fc2_weight_final, QuantizedTensor):
@@ -1219,10 +1219,20 @@ class LayerNormMLP(TransformerEngineBaseModule):
                           batch size per training step. Needed for JIT Warmup, a technique where jit
                           fused functions are warmed up before training to ensure same kernels are
                           used for forward propogation and activation recompute phase.
-    keep_fp8_weight_transpose_cache: bool, default = 'True'
-                          if set to `False`, it will not cache fp8 weight buffer instead of
-                          recomputing fp8 weight transpose. Recommend set to `False` when
-                          enable FSDP parallel.
+    keep_fp8_weight_transpose_cache: bool, default = True
+                          Controls whether to cache the FP8 weight transpose buffer during training.
+
+                          - If set to `True` (default), the FP8 weight transpose buffer is cached to avoid recomputation,
+                            which can improve performance but significantly increases memory usage.
+                          - If set to `False`, the buffer is not cached and the FP8 weight transpose is recomputed as needed.
+                            This reduces memory consumption, especially during checkpoint loading and runtime.
+
+                          **Recommendation**: Set this to `False` when using Fully Sharded Data Parallel (FSDP) training.
+                          Caching FP8 weight transposes can double memory usage for modules such as `Linear`,
+                          `LayerNormLinear`, and `LayerNormMLP`, which may lead to excessive memory pressure and
+                          reduced efficiency of PyTorch's caching allocator.
+
+                          Use this setting to balance memory usage and performance based on your training configuration.
 
    """

transformer_engine/pytorch/module/linear.py

Lines changed: 18 additions & 6 deletions
@@ -1,3 +1,5 @@
+# This file was modified for portability to AMDGPU
+# Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 # Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # See LICENSE for license information.
@@ -282,8 +284,8 @@ def forward(
             inputmat.update_usage(rowwise_usage=False, columnwise_usage=True)
             saved_inputmat = inputmat
 
-        # Weight with column-wise usage is needed for dgrad GEMM.
-        if inp.requires_grad:
+        # Weight with column-wise usage is needed for dgrad GEMM while keeping fp8 weight transpose cache.
+        if inp.requires_grad and keep_fp8_weight_transpose_cache:
             if isinstance(weightmat, QuantizedTensor):
                 weightmat.update_usage(columnwise_usage=True)
 
@@ -828,10 +830,20 @@ class Linear(TransformerEngineBaseModule):
                           it controls the type used to allocate the initial parameters. Useful when
                           the model is trained with lower precision and the original FP32 parameters
                           would not fit in GPU memory.
-    keep_fp8_weight_transpose_cache: bool, default = 'True'
-                          if set to `False`, it will not cache fp8 weight buffer instead of
-                          recomputing fp8 weight transpose. Recommend set to `False` when
-                          enable FSDP parallel.
+    keep_fp8_weight_transpose_cache: bool, default = True
+                          Controls whether to cache the FP8 weight transpose buffer during training.
+
+                          - If set to `True` (default), the FP8 weight transpose buffer is cached to avoid recomputation,
+                            which can improve performance but significantly increases memory usage.
+                          - If set to `False`, the buffer is not cached and the FP8 weight transpose is recomputed as needed.
+                            This reduces memory consumption, especially during checkpoint loading and runtime.
+
+                          **Recommendation**: Set this to `False` when using Fully Sharded Data Parallel (FSDP) training.
+                          Caching FP8 weight transposes can double memory usage for modules such as `Linear`,
+                          `LayerNormLinear`, and `LayerNormMLP`, which may lead to excessive memory pressure and
+                          reduced efficiency of PyTorch's caching allocator.
+
+                          Use this setting to balance memory usage and performance based on your training configuration.
 
    """
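
To gauge the memory trade-off described in the docstring on a particular setup, one rough check is to run a single FP8 forward/backward with and without the cache and compare peak allocations. This is only a sketch: it assumes a CUDA device with FP8 support, and the absolute numbers depend on GPU, shapes, and allocator state.

import torch
import transformer_engine.pytorch as te


def peak_mem_bytes(keep_cache: bool) -> int:
    """One FP8 forward/backward; return peak allocated bytes (illustrative)."""
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    layer = te.Linear(
        4096,
        4096,
        params_dtype=torch.bfloat16,
        device="cuda",
        keep_fp8_weight_transpose_cache=keep_cache,
    )
    x = torch.randn(16, 4096, dtype=torch.bfloat16, device="cuda")
    with te.fp8_autocast(enabled=True):
        out = layer(x)
    out.sum().backward()
    torch.cuda.synchronize()
    return torch.cuda.max_memory_allocated()


print("with cache   :", peak_mem_bytes(True))
print("without cache:", peak_mem_bytes(False))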

transformer_engine/pytorch/ops/op.py

Lines changed: 6 additions & 3 deletions
@@ -547,7 +547,10 @@ def to_cpu(src: torch.Tensor) -> torch.Tensor:
             # Get state for a given FP8 tensor
             if self.num_quantizers(mode) == 0:
                 continue
-            fp8_meta = self.get_fp8_meta(mode)
+            # Skip if op has no quantizer state
+            if self._fp8_metas is None or self._fp8_metas.get(mode, None) is None:
+                continue
+            fp8_meta = self._fp8_metas.get(mode, None)
             state[mode] = {}
 
             # Store tensors
@@ -603,7 +606,7 @@ def copy_tensor(src: torch.Tensor, dst: torch.Tensor) -> None:
                 continue
             if self.num_quantizers(mode) == 0:
                 continue
-            fp8_meta = self.get_fp8_meta(mode)
+            fp8_meta = self._fp8_metas.get(mode, None)
             if fp8_meta is None:
                 continue
 
@@ -617,7 +620,7 @@ def copy_tensor(src: torch.Tensor, dst: torch.Tensor) -> None:
                 del fp8_meta["global_fp8_buffer_pos_fwd_recompute"]
 
             # Load tensors
-            fp8_meta = self.get_fp8_meta(mode)
+            fp8_meta = self._fp8_metas.get(mode, None)
             if "scaling_fwd" in fp8_meta:
                 fp8_meta_fwd = fp8_meta["scaling_fwd"]
                 copy_tensor(state[mode]["scale_fwd"], fp8_meta_fwd.scale)
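
The bugfix pattern here is defensive access to per-op FP8 metadata that may never have been allocated (for example, an op that has quantizers configured but has not yet run under FP8). A minimal standalone sketch of the same guard, with hypothetical names standing in for the op's _fp8_metas attribute:

def capture_fp8_state(fp8_metas, modes=("forward", "backward")):
    """Collect FP8 metadata per mode, skipping modes with no allocated state.

    `fp8_metas` mirrors the op's `_fp8_metas` attribute: either None or a
    dict mapping mode name -> metadata dict (hypothetical stand-in).
    """
    state = {}
    for mode in modes:
        # Skip if the op has no quantizer state for this mode.
        if fp8_metas is None or fp8_metas.get(mode, None) is None:
            continue
        state[mode] = dict(fp8_metas[mode])
    return state


# No FP8 state allocated yet -> nothing captured, and no AttributeError/KeyError.
assert capture_fp8_state(None) == {}
assert capture_fp8_state({"forward": None}) == {}
assert capture_fp8_state({"forward": {"scale_fwd": 1.0}}) == {"forward": {"scale_fwd": 1.0}}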
