 #### cast_transpose
 ##########################################

+@triton.autotune(
+    configs=[
+        triton.Config({'BLOCK_M': 64, 'BLOCK_N': 64, 'GROUP_M': 1}, num_warps=4),
+        triton.Config({'BLOCK_M': 64, 'BLOCK_N': 64, 'GROUP_M': 8}, num_warps=4),
+        triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'GROUP_M': 8}, num_warps=8),
+    ],
+    key=['M', 'N'],
+)
+@triton.jit
+def _amax_reduce_triton(
+    A,
+    stride_am, stride_an,
+    M, N,
+    amax_ptr,  # float32[1], initialize to -inf on host
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    GROUP_M: tl.constexpr,
+):
+    pid = tl.program_id(0)
+
+    grid_m = (M + BLOCK_M - 1) // BLOCK_M
+    grid_n = (N + BLOCK_N - 1) // BLOCK_N
+
+    width = GROUP_M * grid_n
+    group_id = pid // width
+    group_size = tl.minimum(grid_m - group_id * GROUP_M, GROUP_M)
+    pid_m = group_id * GROUP_M + (pid % group_size)
+    pid_n = (pid % width) // group_size
+
+    rm = pid_m.to(tl.int64) * BLOCK_M + tl.arange(0, BLOCK_M)
+    rn = pid_n.to(tl.int64) * BLOCK_N + tl.arange(0, BLOCK_N)
+
+    A_ptrs = A + rm[:, None] * stride_am + rn[None, :] * stride_an
+    mask = (rm < M)[:, None] & (rn < N)[None, :]
+
+    a = tl.load(A_ptrs, mask=mask, other=0).to(tl.float32)
+    tile_amax = tl.max(tl.abs(a))
+    # accumulate tile-wise max into global amax
+    tl.atomic_max(amax_ptr, tile_amax, sem='relaxed')
+
+
+@triton.jit
+def _compute_scale_from_amax_triton(
+    amax_ptr,
+    scale_ptr,
+    inv_ptr,
+    max_fp8,
+    epsilon,
+    value_for_inf,
+    FORCE_POW_2_SCALES: tl.constexpr,
+):
+    # This implementation mimics transformer_engine::compute_scale_from_amax()
+
+    a = tl.load(amax_ptr).to(tl.float32)
+
+    # amax < epsilon -> epsilon (NaNs pass through)
+    a = tl.where(a < epsilon, epsilon, a)
+
+    # bad amax (NaN, inf, 0.0) -> scale = 1.0
+    bad = (a != a) | (tl.abs(a) == float('inf')) | (a == 0.0)
+
+    if bad:
+        s = tl.full((), 1.0, tl.float32)
+    else:
+        s = max_fp8 / a
+        # scale overflowed to inf (amax too small) -> scale = value_for_inf
+        s = tl.where(tl.abs(s) == float('inf'), value_for_inf, s)
+        if FORCE_POW_2_SCALES:
+            s = tl.math.exp2(tl.floor(tl.log2(s)))
+
+    tl.store(scale_ptr, s)
+    tl.store(inv_ptr, 1.0 / s)
+
+
 @triton.autotune(
     configs=[
         triton.Config({'BLOCK_M': 64, 'BLOCK_N': 64, 'GROUP_M': 1}, num_warps=4),
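
For reference, the amax-to-scale logic in `_compute_scale_from_amax_triton` above boils down to the following host-side paraphrase. This is illustration only, not part of the patch; the function name, the default arguments, and 448.0 as an example `max_fp8` (the FP8 E4M3 maximum) are assumptions for the sketch.

```python
import math

def compute_scale_from_amax_ref(amax, max_fp8=448.0, epsilon=0.0,
                                value_for_inf=3.4e38, force_pow_2_scales=False):
    # clamp tiny amax up to epsilon (NaN falls through to the next check)
    amax = max(amax, epsilon)
    # degenerate amax (NaN, inf, 0.0) -> scale = 1.0
    if math.isnan(amax) or math.isinf(amax) or amax == 0.0:
        return 1.0
    scale = max_fp8 / amax
    # scale overflowed to inf (amax extremely small) -> value_for_inf
    if math.isinf(scale):
        scale = value_for_inf
    # optionally round the scale down to a power of two
    if force_pow_2_scales:
        scale = 2.0 ** math.floor(math.log2(scale))
    return scale

print(compute_scale_from_amax_ref(0.5))                           # 896.0
print(compute_scale_from_amax_ref(0.5, force_pow_2_scales=True))  # 512.0
```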
@@ -69,6 +143,52 @@ def _cast_transpose_triton(A, noop_ptr, C, T, stride_am, stride_an, stride_bn, s
     scale_inv_out = tl.fdiv(1.0, scale)
     tl.store(scale_inv_ptr, scale_inv_out)

+
+@triton.autotune(
+    configs=[
+        triton.Config({'BLOCK_M': 64, 'BLOCK_N': 64, 'GROUP_M': 1}, num_warps=4),
+        triton.Config({'BLOCK_M': 64, 'BLOCK_N': 64, 'GROUP_M': 8}, num_warps=4),
+        triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'GROUP_M': 8}, num_warps=8),
+    ],
+    key=['M', 'N']
+)
+@triton.jit
+def _cast_transpose_triton_current_scaling(A, C, T, stride_am, stride_an, stride_bn, stride_bm, M, N, scale_ptr, max_fp8: tl.constexpr, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, GROUP_M: tl.constexpr):
+    # Similar (but slightly optimized) version of the delayed scaling kernel
+    # implemented in _cast_transpose_triton().
+    pid = tl.program_id(0)
+    scale = tl.load(scale_ptr)
+
+    grid_m = (M + BLOCK_M - 1) // BLOCK_M
+    grid_n = (N + BLOCK_N - 1) // BLOCK_N
+
+    width = GROUP_M * grid_n
+    group_id = pid // width
+    group_size = min(grid_m - group_id * GROUP_M, GROUP_M)
+    pid_m = group_id * GROUP_M + (pid % group_size)
+    pid_n = (pid % width) // group_size
+
+    rm = pid_m.to(tl.int64) * BLOCK_M + tl.arange(0, BLOCK_M)
+    rn = pid_n.to(tl.int64) * BLOCK_N + tl.arange(0, BLOCK_N)
+    A = A + rm[:, None] * stride_am + rn[None, :] * stride_an
+    mask = (rm < M)[:, None] & (rn < N)[None, :]
+    a = tl.load(A, mask=mask)
+    a = a.to(tl.float32)
+
+    scaled_a = a * scale
+    scaled_a = tl.clamp(scaled_a, -max_fp8, max_fp8)
+    fp8_a = scaled_a.to(C.type.element_ty)
+    C = C + rm[:, None] * stride_am + rn[None, :] * stride_an
+    tl.store(C, fp8_a, mask=mask)
+
+    # rematerialize to save registers
+    rm = pid_m.to(tl.int64) * BLOCK_M + tl.arange(0, BLOCK_M)
+    rn = pid_n.to(tl.int64) * BLOCK_N + tl.arange(0, BLOCK_N)
+    T = T + rm[:, None] * stride_bm + rn[None, :] * stride_bn
+    mask = (rm < M)[:, None] & (rn < N)[None, :]
+    tl.store(T, fp8_a, mask=mask)
+
+
 FP32_EXPONENT_BIAS = tl.constexpr(127)
 FP32_MANTISSA_BITS = tl.constexpr(23)
 @triton.jit
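
A minimal launch sketch for the current-scaling kernel added above, assuming the kernels from this file are in scope. The uint8-backed FP8 storage, the `tl.float8e4nv` output type, the 448.0 E4M3 maximum, and the fixed scale of 2.0 are illustrative assumptions; in the patch the kernel is driven by `te_cast_transpose_noop_triton` below, which derives the scale from the reduced amax.

```python
import torch
import triton
import triton.language as tl

x = torch.randn(512, 256, device="cuda", dtype=torch.float16)
M, N = x.shape
cast_out = torch.empty(M, N, device="cuda", dtype=torch.uint8)    # FP8 byte storage
trans_out = torch.empty(N, M, device="cuda", dtype=torch.uint8)
scale = torch.tensor([2.0], device="cuda", dtype=torch.float32)   # illustrative scale

grid = lambda META: (triton.cdiv(M, META['BLOCK_M']) * triton.cdiv(N, META['BLOCK_N']),)
_cast_transpose_triton_current_scaling[grid](
    x,
    triton.reinterpret(cast_out, tl.float8e4nv),    # assumed E4M3 output type
    triton.reinterpret(trans_out, tl.float8e4nv),
    x.stride(0), x.stride(1),                       # stride_am, stride_an
    trans_out.stride(0), trans_out.stride(1),       # stride_bn, stride_bm
    M, N, scale, 448.0,                             # 448.0 = E4M3 max (assumption)
)
```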
@@ -232,7 +352,7 @@ def _dequantize_mxfp8_triton(

 # Reshapes input of any given shape to 2D for processing,
 # then uses the Triton kernel to perform casting and transposition efficiently.
-def te_cast_transpose_noop_triton(input, noop_flag, input_scale, cast_out, trans_out, amax_out, scale_inv_out, otype):
+def te_cast_transpose_noop_triton(input, noop_flag, input_scale, cast_out, trans_out, amax_out, scale_inv_out, otype, current_scaling, eps, force_pow_2_scales):

     row_length = input.shape[-1] if len(input.shape) > 0 else 1
     num_rows = input.numel() // row_length
@@ -254,7 +374,35 @@ def te_cast_transpose_noop_triton(input, noop_flag, input_scale, cast_out, trans
         use_noop = False

     grid = lambda META: (triton.cdiv(num_rows, META['BLOCK_M']) * triton.cdiv(row_length, META['BLOCK_N']),)
-    _cast_transpose_triton[grid](input_2d_view, noop_flag, triton.reinterpret(cast_out_2d_view, tl_dtype), triton.reinterpret(trans_out_2d_view, tl_dtype), input_stride_M, input_stride_N, trans_out_stride_M, trans_out_stride_N, num_rows, row_length, input_scale, amax_out, scale_inv_out, get_fp8_max(otype), use_noop)
+
+    if current_scaling:
+        # Current scaling:
+        # 1) global amax reduction
+        # 2) compute current scale
+        # 3) cast+transpose with that current scale (otherwise same as delayed)
+
+        # global amax
+        amax_out.fill_(-float("inf"))
+        _amax_reduce_triton[grid](
+            input_2d_view,
+            input_stride_M, input_stride_N,
+            num_rows, row_length,
+            amax_out,
+        )
+
+        # Compute scale
+        fp8_max = get_fp8_max(otype)
+
+        _compute_scale_from_amax_triton[(1,)](
+            amax_out, input_scale, scale_inv_out,
+            fp8_max, eps, torch.finfo(torch.float32).max,
+            FORCE_POW_2_SCALES=force_pow_2_scales,
+        )
+
+        _cast_transpose_triton_current_scaling[grid](input_2d_view, triton.reinterpret(cast_out_2d_view, tl_dtype), triton.reinterpret(trans_out_2d_view, tl_dtype), input_stride_M, input_stride_N, trans_out_stride_M, trans_out_stride_N, num_rows, row_length, input_scale, get_fp8_max(otype))
+    else:
+        # Delayed scaling
+        _cast_transpose_triton[grid](input_2d_view, noop_flag, triton.reinterpret(cast_out_2d_view, tl_dtype), triton.reinterpret(trans_out_2d_view, tl_dtype), input_stride_M, input_stride_N, trans_out_stride_M, trans_out_stride_N, num_rows, row_length, input_scale, amax_out, scale_inv_out, get_fp8_max(otype), use_noop)

 def te_cast_transpose_mxfp8_triton(input, out, noop_flag=None):
     row_length = input.shape[-1] if len(input.shape) > 0 else 1