Commit a6cf0d8

Authored by sudhu2k and a co-author
13679 TE2.4 keep_fp8_transpose_cache refactor (#328)
* Initial commit
* Removed rocm_utils
* Added comment and bug fixes
* Grouped IS_HIP_EXTENSION with the property assignment
* Reverted transpose.cpp, removed keep_fp8_transpose_cache flag from grouped_linear, removed manual clearing of tensors in modules
* Aligning grouped_linear module with upstream
* Reverted tests to use _test_granular_accuracy_with_fp8 multiple times as needed
* Added comments back
* Moved comment to the test

---------

Co-authored-by: sudhu2k <[email protected]>
1 parent 9a2257b commit a6cf0d8
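
The net effect of the refactor: instead of passing a create_transpose_cache argument into get_weight_workspace() and calling the ROCm-only create/clear helpers from rocm_utils, the modules now express the choice through the quantizer's columnwise usage and release the transposed FP8 weight after the dgrad GEMM via update_usage(). The sketch below illustrates that pattern in isolation; it is not TE source, and the helper function names are invented for illustration.

# Illustrative sketch only (not Transformer Engine source). Assumes a quantizer
# exposing set_usage() and a quantized weight exposing update_usage(), as in the diff.
from torch.utils.cpp_extension import IS_HIP_EXTENSION


def configure_weight_quantizer(quantizer, keep_fp8_weight_transpose_cache):
    # Forward path: only request the columnwise (transposed) FP8 copy when it
    # will be cached; non-HIP builds always keep the transpose cache.
    columnwise = keep_fp8_weight_transpose_cache if IS_HIP_EXTENSION else True
    quantizer.set_usage(rowwise=True, columnwise=columnwise)


def release_weight_transpose(weight, fp8, keep_fp8_weight_transpose_cache):
    # Backward path: once the dgrad GEMM has consumed the transposed weight,
    # drop it again instead of keeping it cached between iterations.
    if fp8 and not keep_fp8_weight_transpose_cache:
        weight.update_usage(columnwise_usage=False)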

7 files changed (+51, -97 lines)

tests/pytorch/test_numerics.py
Lines changed: 12 additions & 3 deletions

@@ -1330,11 +1330,20 @@ def test_fp8_linear_without_transpose_cache_accuracy(dtype, bs, model, fp8_model
         keep_fp8_weight_transpose_cache=True  # defaults to True
     ).eval()
 
-    outputs = _test_granular_accuracy_with_fp8(layer, bs, dtype, config)
-    ref_outputs = _test_granular_accuracy_with_fp8(ref_layer, bs, dtype, config)
+    # The keep_fp8_transpose_cache flag will be evaluated over two iterations.
+    # Given that the transpose operation's cache is invalidated during the backward pass,
+    # the objective of this test is to observe the subsequent forward pass behavior.
+    num_iterations = 2
+    all_outputs = []
+    all_ref_outputs = []
+    for _ in range(num_iterations):
+        outputs = _test_granular_accuracy_with_fp8(layer, bs, dtype, config)
+        ref_outputs = _test_granular_accuracy_with_fp8(ref_layer, bs, dtype, config)
+        all_outputs.append(outputs)
+        all_ref_outputs.append(ref_outputs)
 
     # Check output.
-    for te_output_no_cache, te_output_cache in zip(outputs, ref_outputs):
+    for te_output_no_cache, te_output_cache in zip(all_outputs, all_ref_outputs):
         assert_allclose(te_output_no_cache, te_output_cache, atol=0, rtol=0)
 
 @pytest.mark.parametrize("dtype", param_types)
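
For readers skimming the diff, a condensed sketch of the test's two-iteration pattern follows; make_layer and run_fwd_bwd are hypothetical stand-ins for the test fixtures and for _test_granular_accuracy_with_fp8, and the exact-match check mirrors the assert above.

# Hypothetical condensed form of the test pattern above (not the actual test code).
import torch


def compare_cache_vs_no_cache(make_layer, run_fwd_bwd, num_iterations=2):
    layer = make_layer(keep_fp8_weight_transpose_cache=False)      # cache disabled
    ref_layer = make_layer(keep_fp8_weight_transpose_cache=True)   # default behavior
    for _ in range(num_iterations):
        # Iteration 1 covers the first forward/backward; iteration 2 checks the
        # forward pass that runs after the transpose cache was invalidated.
        out = run_fwd_bwd(layer)
        ref_out = run_fwd_bwd(ref_layer)
        assert torch.equal(out, ref_out)  # exact match, i.e. atol=0, rtol=0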

transformer_engine/pytorch/module/base.py
Lines changed: 1 addition & 17 deletions

@@ -1150,7 +1150,7 @@ def reset_parameters(self, defer_init: Optional[bool] = False) -> None:
                     quantizer is not None
                 ) # to use primary fp8 weight one needs to use FP8 autocast with specific recipe.
                 quantizer.internal = False
-                if not self.keep_fp8_weight_transpose_cache:
+                if IS_HIP_EXTENSION and not self.keep_fp8_weight_transpose_cache:
                     quantizer.columnwise_usage=False
                 param = quantizer(param)
 
@@ -1201,7 +1201,6 @@ def get_weight_workspace(
         skip_update_flag: Optional[torch.Tensor] = None,
         fsdp_group: Optional[dist_group_type] = None,
         workspace_dtype: Optional[torch.dtype] = None,
-        create_transpose_cache: bool = True,
     ) -> QuantizedTensor:
         """Get FP8 workspace buffer and maybe update its values
 
@@ -1224,8 +1223,6 @@ def get_weight_workspace(
             over `update_workspace` if provided.
         fsdp_group: bool, default = None
                     FSDP process group that the weights are distributed over.
-        create_transpose_cache: bool, default = True
-                    Create transpose buffer from `tensor`.
         workspace_dtype: torch.dtype, default = None
                     If weight workspace contains high-precision tensor - for example
                     for debug quantization, this is dtype of the tensor.
@@ -1269,19 +1266,6 @@ def get_weight_workspace(
         ):
             _fsdp_gather_tensors(fsdp_group, [tensor.data.shape], out)
 
-        if not is_non_tn_fp8_gemm_supported() and not create_transpose_cache:
-            current_quantizer = None
-            if out is None:
-                current_quantizer = quantizer
-            else:
-                if hasattr(out, "quantize_"):
-                    current_quantizer = out._get_quantizer()
-                else:
-                    current_quantizer = quantizer
-
-            # NOTE: Not create transpose buffer internally.
-            current_quantizer.columnwise_usage = False
-
         # Construct workspace if needed
         if out is None:
             if tensor is None or quantizer is None:
transformer_engine/pytorch/module/grouped_linear.py
Lines changed: 0 additions & 3 deletions

@@ -501,7 +501,6 @@ def __init__(
         ub_overlap_ag: bool = False,
         ub_name: Optional[str] = None,
         delay_wgrad_compute: bool = False,
-        keep_fp8_weight_transpose_cache: bool = True,
     ) -> None:
         super().__init__()
 
@@ -516,8 +515,6 @@ def __init__(
         self.ub_overlap_rs = ub_overlap_rs
         self.ub_overlap_ag = ub_overlap_ag
         self.ub_name = ub_name
-        self.keep_fp8_weight_transpose_cache = keep_fp8_weight_transpose_cache
-
         assert (
             not ub_overlap_rs and not ub_overlap_ag
         ), "GroupedLinear doesn't support Userbuffer overlap."

transformer_engine/pytorch/module/layernorm_linear.py
Lines changed: 7 additions & 8 deletions

@@ -81,8 +81,6 @@
 from ..triton_kernels.layernorm import te_layernorm_bwd_triton
 from ..triton_kernels.rmsnorm import te_rmsnorm_bwd_triton
 
-from ..rocm_utils import create_fp8_weight_transpose_cache, clear_fp8_weight_transpose_cache
-
 
 __all__ = ["LayerNormLinear"]
 
@@ -291,7 +289,7 @@ def forward(
 
         # Configure quantizer
         if weight_quantizer is not None:
-            weight_quantizer.set_usage(rowwise=True, columnwise=True)
+            weight_quantizer.set_usage(rowwise=True, columnwise=keep_fp8_weight_transpose_cache)
 
         # Get quantized weight
         update_workspace = is_first_microbatch is None or is_first_microbatch
@@ -303,7 +301,6 @@ def forward(
             skip_update_flag=skip_fp8_weight_update,
             fsdp_group=fsdp_group,
             workspace_dtype=activation_dtype,
-            create_transpose_cache=keep_fp8_weight_transpose_cache,
         )
         weightmat.update_usage(rowwise_usage=True)
 
@@ -350,6 +347,8 @@ def forward(
         # Forward GEMM
         # Note: y = x * w^T
         # ------------------------------------------------------
+        if IS_HIP_EXTENSION and fp8 and not keep_fp8_weight_transpose_cache:
+            assert weightmat._transpose is None or weightmat._transpose.numel() == 0, "Expected _transpose to be None or an empty tensor when transpose cache is disabled."
         nvtx_range_push(f"{nvtx_label}.gemm")
         gemm_out, *_, reduce_scatter_out = general_gemm(
             weightmat,
@@ -701,8 +700,6 @@ def backward(
         if ctx.grad_input_quantizer is not None:
             ctx.grad_input_quantizer.set_usage(rowwise=True, columnwise=False)
 
-        if ctx.fp8 and not ctx.keep_fp8_weight_transpose_cache:
-            create_fp8_weight_transpose_cache(weight)
 
         # Output buffers for Userbuffers reduce-scatter
         gemm_out = None
@@ -735,7 +732,7 @@ def backward(
         nvtx_range_pop(f"{nvtx_label}.dgrad_gemm")
 
         if ctx.fp8 and not ctx.keep_fp8_weight_transpose_cache:
-            clear_fp8_weight_transpose_cache(weight)
+            weight.update_usage(columnwise_usage=False)
 
         # Prepare grad input tensor
         # Note: Perform tensor-parallel communication
@@ -1195,7 +1192,7 @@ def __init__(
         self.name = name
         if TEDebugState.debug_enabled:
            self._turn_off_unsupported_features_in_debug() # turn off userbuffers
-        self.keep_fp8_weight_transpose_cache = keep_fp8_weight_transpose_cache
+        self.keep_fp8_weight_transpose_cache = keep_fp8_weight_transpose_cache if IS_HIP_EXTENSION else True
 
         if tp_group is None:
             self.tp_size = tp_size
@@ -1638,6 +1635,8 @@ def _get_quantizers(self, fp8_output, fp8_grad):
         input_quantizer.internal = True
         weight_quantizer = self.quantizers["scaling_fwd"][tex.FP8FwdTensors.GEMM1_WEIGHT]
         weight_quantizer.internal = True
+        if IS_HIP_EXTENSION:
+            weight_quantizer.set_usage(columnwise = self.keep_fp8_weight_transpose_cache)
         if fp8_output:
             output_quantizer = self.quantizers["scaling_fwd"][tex.FP8FwdTensors.GEMM1_OUTPUT]
         if torch.is_grad_enabled():

transformer_engine/pytorch/module/layernorm_mlp.py
Lines changed: 20 additions & 18 deletions

@@ -87,8 +87,6 @@
 from ..triton_kernels.layernorm import te_layernorm_bwd_triton
 from ..triton_kernels.rmsnorm import te_rmsnorm_bwd_triton
 
-from ..rocm_utils import create_fp8_weight_transpose_cache, clear_fp8_weight_transpose_cache
-
 __all__ = ["LayerNormMLP"]
 
 
@@ -347,8 +345,8 @@ def forward(
             # which handles weight caching etc.
             # FP8 cast to workspace buffer
             update_workspace = is_first_microbatch is None or is_first_microbatch
-            fc1_weight_quantizer.set_usage(rowwise=True, columnwise=True)
-            fc2_weight_quantizer.set_usage(rowwise=True, columnwise=True)
+            fc1_weight_quantizer.set_usage(rowwise=True, columnwise=keep_fp8_weight_transpose_cache)
+            fc2_weight_quantizer.set_usage(rowwise=True, columnwise=keep_fp8_weight_transpose_cache)
             fc1_weight_final = module.get_weight_workspace(
                 tensor=fc1_weight,
                 quantizer=fc1_weight_quantizer,
@@ -357,7 +355,6 @@ def forward(
                 skip_update_flag=skip_fp8_weight_update,
                 fsdp_group=fsdp_group,
                 workspace_dtype=activation_dtype,
-                create_transpose_cache=keep_fp8_weight_transpose_cache,
             )
             fc2_weight_final = module.get_weight_workspace(
                 tensor=fc2_weight,
@@ -367,7 +364,6 @@ def forward(
                 skip_update_flag=skip_fp8_weight_update,
                 fsdp_group=fsdp_group,
                 workspace_dtype=activation_dtype,
-                create_transpose_cache=keep_fp8_weight_transpose_cache,
             )
             fc1_weight_final.update_usage(rowwise_usage=True)
             fc2_weight_final.update_usage(rowwise_usage=True)
@@ -412,6 +408,10 @@ def forward(
             gemm_gelu_fusion = False
         if debug:
             gemm_gelu_fusion = False
+
+        if IS_HIP_EXTENSION and fp8 and not keep_fp8_weight_transpose_cache:
+            assert fc1_weight_final._transpose is None or fc1_weight_final._transpose.numel() == 0, "Expected _transpose to be None or an empty tensor when transpose cache is disabled."
+
         fc1_outputs = general_gemm(
             fc1_weight_final,
             ln_out_total,
@@ -482,6 +482,9 @@ def forward(
         # ------------------------------------------------------
         # FC2 GEMM
        # ------------------------------------------------------
+        if IS_HIP_EXTENSION and fp8 and not keep_fp8_weight_transpose_cache:
+            assert fc2_weight_final._transpose is None or fc2_weight_final._transpose.numel() == 0, "Expected _transpose to be None or an empty tensor when transpose cache is disabled."
+
         gemm_out, *_, reduce_scatter_out = general_gemm(
             fc2_weight_final,
             act_out,
@@ -817,12 +820,9 @@ def backward(
         if isinstance(grad_output, QuantizedTensorBase):
             grad_output.update_usage(rowwise_usage=True)
         if ctx.fc2_weight_quantizer is not None and isinstance(
-            ctx.fc2_weight, QuantizedTensorBase
+            fc2_weight, QuantizedTensorBase
         ):
-            ctx.fc2_weight.update_usage(columnwise_usage=True)
-
-        if ctx.fp8 and not ctx.keep_fp8_weight_transpose_cache:
-            create_fp8_weight_transpose_cache(fc2_weight)
+            fc2_weight.update_usage(columnwise_usage=True)
 
         # Perform GEMM
         gemm_output, *_ = general_gemm(
@@ -853,7 +853,7 @@ def backward(
         fc2_dgrad = gemm_output
 
         if ctx.fp8 and not ctx.keep_fp8_weight_transpose_cache:
-            clear_fp8_weight_transpose_cache(fc2_weight)
+            fc2_weight.update_usage(columnwise_usage=False)
 
         # --------------------------------------------------
         # Finished FC2 DGRAD...
@@ -1041,18 +1041,16 @@ def fc2_wgrad_gemm(
                 ub_obj_fc1_wgrad = get_ub("fc1_wgrad")
                 ub_type_fc1_wgrad = tex.CommOverlapType.RS
 
-        if ctx.fp8 and not ctx.keep_fp8_weight_transpose_cache:
-            create_fp8_weight_transpose_cache(fc1_weight)
 
         # --------------------------------------------------
         # FC1 DGRAD
         # --------------------------------------------------
 
         # Make sure required data is available
         if ctx.fc1_weight_quantizer is not None and isinstance(
-            ctx.fc1_weight_quantizer, QuantizedTensorBase
+            fc1_weight, QuantizedTensorBase
         ):
-            ctx.fc1_weight.update_usage(columnwise_usage=True)
+            fc1_weight.update_usage(columnwise_usage=True)
 
         # Output buffers for Userbuffers reduce-scatter
         gemm_out = None
@@ -1082,7 +1080,7 @@ def fc2_wgrad_gemm(
         )
 
         if ctx.fp8 and not ctx.keep_fp8_weight_transpose_cache:
-            clear_fp8_weight_transpose_cache(fc1_weight)
+            fc1_weight.update_usage(columnwise_usage=False)
 
         # Prepare grad input tensor
         # Note: Perform tensor-parallel communication
@@ -1552,7 +1550,7 @@ def __init__(
         self.set_parallel_mode = set_parallel_mode
         self.zero_centered_gamma = zero_centered_gamma
         self.symmetric_ar_type = symmetric_ar_type
-        self.keep_fp8_weight_transpose_cache = keep_fp8_weight_transpose_cache
+        self.keep_fp8_weight_transpose_cache = keep_fp8_weight_transpose_cache if IS_HIP_EXTENSION else True
 
         # GEMM-GELU fusion is currently only supported with split GEMM-AG overlap
         self.gemm_gelu_fusion = (
@@ -1918,6 +1916,8 @@ def _get_quantizers(self, fp8_output):
         fc1_input_quantizer.internal = True
         fc1_weight_quantizer = self.quantizers["scaling_fwd"][tex.FP8FwdTensors.GEMM1_WEIGHT]
         fc1_weight_quantizer.internal = True
+        if IS_HIP_EXTENSION:
+            fc1_weight_quantizer.set_usage(columnwise = self.keep_fp8_weight_transpose_cache)
         fc2_input_quantizer = self.quantizers["scaling_fwd"][tex.FP8FwdTensors.GEMM2_INPUT]
         fc2_input_quantizer.set_usage(
             rowwise=True,
@@ -1926,6 +1926,8 @@ def _get_quantizers(self, fp8_output):
         fc1_input_quantizer.internal = True
         fc2_weight_quantizer = self.quantizers["scaling_fwd"][tex.FP8FwdTensors.GEMM2_WEIGHT]
         fc2_weight_quantizer.internal = True
+        if IS_HIP_EXTENSION:
+            fc2_weight_quantizer.set_usage(columnwise = self.keep_fp8_weight_transpose_cache)
         if fp8_output:
             fc2_output_quantizer = self.quantizers["scaling_fwd"][
                 tex.FP8FwdTensors.GEMM2_OUTPUT

transformer_engine/pytorch/module/linear.py
Lines changed: 11 additions & 9 deletions

@@ -68,11 +68,11 @@
 from ..tensor.float8_tensor import Float8CurrentScalingQuantizer, Float8Quantizer
 from ..tensor.mxfp8_tensor import MXFP8Quantizer
 from ..tensor._internal.mxfp8_tensor_base import MXFP8TensorBase
-from ..rocm_utils import create_fp8_weight_transpose_cache, clear_fp8_weight_transpose_cache
 from ..tensor.float8_blockwise_tensor import Float8BlockQuantizer
 from ..cpu_offload import is_cpu_offload_enabled, mark_activation_offload
 from ...debug.pytorch.debug_state import TEDebugState
 from ...debug.pytorch.utils import any_feature_enabled
+from torch.utils.cpp_extension import IS_HIP_EXTENSION
 
 __all__ = ["Linear"]
 
@@ -228,8 +228,8 @@ def forward(
         if fp8 or debug:
             # Configure quantizer
             if weight_quantizer is not None:
-                columnwise_usage = is_grad_enabled and inp.requires_grad
-                if not columnwise_usage:
+                columnwise_usage = is_grad_enabled and inp.requires_grad and keep_fp8_weight_transpose_cache
+                if not columnwise_usage and keep_fp8_weight_transpose_cache:
                     columnwise_usage = (
                         is_fp8_activation_recompute_enabled()
                         and not in_fp8_activation_recompute_phase()
@@ -246,7 +246,6 @@ def forward(
                 skip_update_flag=skip_fp8_weight_update,
                 fsdp_group=fsdp_group,
                 workspace_dtype=activation_dtype,
-                create_transpose_cache=keep_fp8_weight_transpose_cache,
             )
             weightmat.update_usage(rowwise_usage=True)
 
@@ -293,6 +292,9 @@ def forward(
         # Forward GEMM
         # Note: y = x * w^T
         # ------------------------------------------------------
+        if IS_HIP_EXTENSION and fp8 and not keep_fp8_weight_transpose_cache:
+            assert weightmat._transpose is None or weightmat._transpose.numel() == 0, "Expected _transpose to be None or an empty tensor when transpose cache is disabled."
+
         nvtx_range_push(f"{nvtx_label}.gemm")
         gemm_out, *_, reduce_scatter_out = general_gemm(
             weightmat,
@@ -618,9 +620,6 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None],
         if ctx.grad_input_quantizer is not None:
             ctx.grad_input_quantizer.set_usage(rowwise=True, columnwise=False)
 
-        if ctx.fp8 and not ctx.keep_fp8_weight_transpose_cache:
-            create_fp8_weight_transpose_cache(weight_fp8)
-
         # Output buffers for Userbuffers reduce-scatter
         gemm_out = None
         reduce_scatter_out = None
@@ -652,7 +651,7 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None],
         nvtx_range_pop(f"{nvtx_label}.dgrad_gemm")
 
         if ctx.fp8 and not ctx.keep_fp8_weight_transpose_cache:
-            clear_fp8_weight_transpose_cache(weight_fp8)
+            weight_fp8.update_usage(columnwise_usage=False)
 
         # Prepare grad input tensor
         # Note: Perform tensor-parallel communication
@@ -1044,7 +1043,7 @@ def __init__(
             self._turn_off_unsupported_features_in_debug() # turn off userbuffers
 
         self.wgrad_store = WeightGradStore(delay_wgrad_compute, ub_bulk_wgrad)
-        self.keep_fp8_weight_transpose_cache = keep_fp8_weight_transpose_cache
+        self.keep_fp8_weight_transpose_cache = keep_fp8_weight_transpose_cache if IS_HIP_EXTENSION else True
 
         if device == "meta":
             assert parameters_split is None, "Cannot split module parameters on 'meta' device."
@@ -1431,6 +1430,9 @@ def _get_quantizers(self, fp8_output, fp8_grad):
         input_quantizer.internal = True
         weight_quantizer = self.quantizers["scaling_fwd"][tex.FP8FwdTensors.GEMM1_WEIGHT]
         weight_quantizer.internal = True
+        if IS_HIP_EXTENSION:
+            weight_quantizer.set_usage(columnwise = self.keep_fp8_weight_transpose_cache)
+
         if fp8_output:
             output_quantizer = self.quantizers["scaling_fwd"][tex.FP8FwdTensors.GEMM1_OUTPUT]
         if torch.is_grad_enabled():
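
Finally, a hedged end-user sketch of how the flag is exercised. On a ROCm build, constructing a module with keep_fp8_weight_transpose_cache=False skips caching the FP8 weight transpose between iterations; on non-HIP builds the diff forces the attribute back to True. Module, autocast, and recipe names follow the public TE PyTorch API; shapes and recipe settings are arbitrary assumptions.

# Usage sketch (assumes a ROCm build of Transformer Engine with this change applied).
import torch
import transformer_engine.pytorch as te
from transformer_engine.common.recipe import DelayedScaling

layer = te.Linear(
    1024,
    1024,
    params_dtype=torch.bfloat16,
    keep_fp8_weight_transpose_cache=False,  # honored only when IS_HIP_EXTENSION is True
).cuda()

inp = torch.randn(32, 1024, dtype=torch.bfloat16, device="cuda", requires_grad=True)
with te.fp8_autocast(enabled=True, fp8_recipe=DelayedScaling()):
    out = layer(inp)
out.sum().backward()  # the transposed FP8 weight is rebuilt for dgrad, then released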
