Current scaling: two-stage HIP amax kernel #369
Merged
Commits (37)
- c15d93b  Current scaling: two-stage amax kernel (matthiasdiener)
- 51fab36  Merge branch 'dev' into speedup-amax-kernel (matthiasdiener)
- ae35e4c  bugfix graph capture (matthiasdiener)
- 77a68a7  Merge branch 'dev' into speedup-amax-kernel (matthiasdiener)
- c0d8e73  outline workspace allocation (matthiasdiener)
- 6c3507d  Merge branch 'dev' into speedup-amax-kernel (matthiasdiener)
- 3c9de07  Proper allocation of workspace (matthiasdiener)
- 91249cc  Merge branch 'dev' into speedup-amax-kernel (matthiasdiener)
- be0e0c8  add a test to compare the accuracy of both amax implementations (matthiasdiener)
- bce34da  add possibility to force using previous (atomic) kernel (matthiasdiener)
- 8c388cc  Merge branch 'dev' into speedup-amax-kernel (matthiasdiener)
- 6388604  add copyrights (matthiasdiener)
- 9e6586f  don't add extra template to kernel (matthiasdiener)
- 18292bf  make amax_kernel_threads usable in pytorch (matthiasdiener)
- a389455  update remaining calls to nvte_compute_amax (matthiasdiener)
- d87ab8a  Merge branch 'dev' into speedup-amax-kernel (matthiasdiener)
- fd5dead  additional copyrights (matthiasdiener)
- 16d3bf9  avoid workspace allocations if NVTE_USE_ATOMIC_AMAX is set (matthiasdiener)
- 50b34aa  Merge branch 'dev' into speedup-amax-kernel (matthiasdiener)
- ef532b1  remove use_block_amax parameter, more cleanups (matthiasdiener)
- f933ef3  Factor workspace allocation into function (matthiasdiener)
- 7d4054e  expand test slightly (matthiasdiener)
- 63cff98  Revert "expand test slightly"
- c7d44a7  guard by HIP macro, address review comments (matthiasdiener)
- f92b926  bugfix workspace.data.dptr (matthiasdiener)
- eba552e  various cleanups (matthiasdiener)
- 0d6a177  Merge branch 'dev' into speedup-amax-kernel (matthiasdiener)
- 8eda427  simplify types in allocate_amax_workspace (matthiasdiener)
- 6990928  Merge branch 'dev' into speedup-amax-kernel (matthiasdiener)
- 9ee618f  fix indentation (matthiasdiener)
- 77b1bc3  Merge branch 'dev' into speedup-amax-kernel (matthiasdiener)
- 1357d4b  Use private implementation of DIVUP (matthiasdiener)
- 01b61b5  define amax_kernel_threads on non-AMD (matthiasdiener)
- ed16f8f  Revert "Use private implementation of DIVUP" (matthiasdiener)
- 95dcbdf  Factor out workspace size calculation (matthiasdiener)
- b07edf6  change name (matthiasdiener)
- 233eb0a  add copyright (matthiasdiener)
```diff
@@ -26,11 +26,29 @@ using bf16__ = __nv_bfloat16;
 using bf16__ = __hip_bfloat16;
 #endif //__HIP_PLATFORM_AMD__
 
 constexpr int amax_kernel_threads = 512;
 
+template <int BLOCK_THREADS>
+__global__ void amax_final_reduce(const float* __restrict__ block_amax,
+                                  float* __restrict__ global_amax,
+                                  int num_blocks) {
+  float val = 0.f;
+
+  for (int i = threadIdx.x; i < num_blocks; i += BLOCK_THREADS) {
+    val = fmaxf(val, block_amax[i]);
+  }
+
+  const int warp_id = threadIdx.x / THREADS_PER_WARP;
+  const float block_max =
+      reduce_max<BLOCK_THREADS / THREADS_PER_WARP>(val, warp_id);
+
+  if (threadIdx.x == 0) {
+    *global_amax = block_max;
+  }
+}
+
 template <int nvec, bool aligned, typename InputType>
 __launch_bounds__(amax_kernel_threads) __global__
-    void amax_kernel(const InputType *input, float *amax, const size_t N,
+    void amax_kernel(const InputType *input, float *amax, float* __restrict__ block_amax, const size_t N,
                      const size_t num_aligned_elements) {
   VectorizedLoader<InputType, nvec, aligned> loader(input, N);
   InputType max{0.f};
```
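`amax_final_reduce` is the new second stage: a single block strides over the per-block partial maxima and collapses them into the global amax, replacing global atomic traffic with one small extra kernel launch. End to end, both paths must agree with a plain reduction; a minimal CPU reference, assuming the absolute value is taken inside the elided vectorized loop of `amax_kernel`:

```cpp
// Hypothetical CPU reference (not part of the PR): the atomic path and the
// two-stage path must both agree with this direct reduction over |x|.
#include <cmath>
#include <cstddef>

float amax_reference(const float *input, std::size_t N) {
  float amax = 0.f;
  for (std::size_t i = 0; i < N; ++i) {
    amax = std::fmax(amax, std::fabs(input[i]));  // running max of |x|
  }
  return amax;
}
```

This is the property the PR's new accuracy test (commit be0e0c8) presumably checks for both implementations.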
```diff
@@ -65,12 +83,19 @@ __launch_bounds__(amax_kernel_threads) __global__
   // Reduce amax over block
   max = reduce_max<amax_kernel_threads / THREADS_PER_WARP>(max, warp_id);
   if (threadIdx.x == 0) {
-    atomicMaxFloat(amax, max);
+    if (block_amax != nullptr) {
+      // 2-stage: write per-block result
+      block_amax[blockIdx.x] = max;
+    } else {
+      // Atomic path: directly update global amax
+      atomicMaxFloat(amax, max);
+    }
   }
 }
 
 template <int nvec, typename InputType>
-void launch_amax_kernel(const InputType *input, float *amax, const size_t N, cudaStream_t stream) {
+void launch_amax_kernel(const InputType *input, float *amax, const size_t N, float *block_amax,
+                        size_t block_capacity, cudaStream_t stream) {
   // Zero out amax so we can update with atomic max
   (void)cudaMemsetAsync(amax, 0, sizeof(float), stream);
```
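When `block_amax` is null, the kernel keeps the old single-pass behavior, where thread 0 of every block contends on one global address through `atomicMaxFloat`. That helper is defined outside the shown hunks; a common construction for such a float atomic max, shown here only as a hedged sketch, is an integer `atomicMax` on the float's bit pattern, which is order-preserving for the non-negative values amax takes:

```cpp
// Sketch only: TE's actual atomicMaxFloat is not part of this diff.
// For non-negative IEEE-754 floats, comparing the raw bit patterns as
// integers gives the same ordering as comparing the floats themselves.
__device__ inline float atomic_max_float_sketch(float *addr, float value) {
  int old = atomicMax(reinterpret_cast<int *>(addr), __float_as_int(value));
  return __int_as_float(old);
}
```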
```diff
@@ -89,24 +114,38 @@ void launch_amax_kernel(const InputType *input, float *amax, const size_t N, cudaStream_t stream) {
   constexpr size_t max_blocks = 65535;
   num_blocks = std::min(num_blocks, max_blocks);
 
+  const bool UseBlockAmax =
+      (block_amax != nullptr) &&
+      (block_capacity >= num_blocks) &&
+      !nvte_use_atomic_amax();
+
   // Launch kernel
   switch (align) {
     case Alignment::SAME_ALIGNED:
       amax_kernel<nvec, true, InputType>
-          <<<num_blocks, threads, 0, stream>>>(input, amax, N, num_aligned_elements);
+          <<<num_blocks, threads, 0, stream>>>(input, amax, block_amax, N, num_aligned_elements);
       break;
     case Alignment::SAME_UNALIGNED:
       amax_kernel<nvec, false, InputType>
-          <<<num_blocks, threads, 0, stream>>>(input, amax, N, num_aligned_elements);
+          <<<num_blocks, threads, 0, stream>>>(input, amax, block_amax, N, num_aligned_elements);
       break;
     case Alignment::DIFFERENT: {
       // This case is a logic error, since there is only one pointer (input)
       // in the alignment check. Still safe to process without vectorization.
-      amax_kernel<1, true, InputType><<<num_blocks, threads, 0, stream>>>(input, amax, N, N);
+      amax_kernel<1, true, InputType><<<num_blocks, threads, 0, stream>>>(input, amax, block_amax, N, N);
       break;
     }
   }
 
+  if (UseBlockAmax) {
+    constexpr int FINAL_REDUCE_THREADS = 256;
+    dim3 fr_block(FINAL_REDUCE_THREADS);
+    dim3 fr_grid(1);
+
+    amax_final_reduce<FINAL_REDUCE_THREADS>
+        <<<fr_grid, fr_block, 0, stream>>>(block_amax, amax, static_cast<int>(num_blocks));
+  }
+
   // Check results
   NVTE_CHECK_CUDA(cudaGetLastError());
 }
```
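`UseBlockAmax` gates the two-stage path on three conditions: a workspace was supplied, it can hold one float per launched block, and the atomic fallback was not forced via `NVTE_USE_ATOMIC_AMAX`. The required capacity follows directly from the launch math above; a hedged sketch of the sizing rule (the helper name here is assumed, though the commit log refers to a factored-out `allocate_amax_workspace`):

```cpp
// Hypothetical sizing helper mirroring the launch math above: one float
// per first-stage block, with the grid capped at max_blocks = 65535.
#include <algorithm>
#include <cstddef>

std::size_t amax_workspace_floats(std::size_t N, std::size_t nvec,
                                  std::size_t threads = 512) {
  const std::size_t vec_elems = (N + nvec - 1) / nvec;   // vectorized loads
  const std::size_t num_blocks = (vec_elems + threads - 1) / threads;
  return std::min<std::size_t>(num_blocks, 65535);       // grid-size cap
}
```

Because the grid is capped, 65535 floats (about 256 KiB) is always sufficient, which presumably lets callers allocate the workspace once and reuse it, including under graph capture (see commit ae35e4c).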
```diff
@@ -115,6 +154,10 @@ void launch_amax_kernel(const InputType *input, float *amax, const size_t N, cudaStream_t stream) {
 }  // namespace transformer_engine
 
+void nvte_compute_amax(const NVTETensor input_, const NVTETensor output_, cudaStream_t stream) {
+  nvte_compute_amax_with_workspace(input_, output_, /*workspace=*/nullptr, stream);
+}
+
-void nvte_compute_amax(const NVTETensor input_, const NVTETensor output_, cudaStream_t stream) {
+void nvte_compute_amax_with_workspace(const NVTETensor input_, const NVTETensor output_, const NVTETensor workspace_, cudaStream_t stream) {
   NVTE_API_CALL(nvte_compute_amax);
   using namespace transformer_engine;
```
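The public entry point is preserved: `nvte_compute_amax` now forwards a null workspace, which selects the atomic path, so existing callers are unaffected. A hedged usage sketch of the two entry points; `make_fp32_workspace` is a hypothetical stand-in for however the caller wraps a device buffer in an `NVTETensor` (in this PR the real allocation lives on the PyTorch side):

```cpp
// Usage sketch; make_fp32_workspace() is hypothetical, the two entry
// points are the API touched by this PR.
void compute_amax_example(NVTETensor input, NVTETensor output,
                          cudaStream_t stream) {
  // Old API, still valid: nullptr workspace -> single-pass atomic kernel.
  nvte_compute_amax(input, output, stream);

  // New API: an FP32 workspace with >= num_blocks elements enables the
  // two-stage (per-block write + final reduce) path.
  NVTETensor workspace = make_fp32_workspace(/*numel=*/65535);  // hypothetical
  nvte_compute_amax_with_workspace(input, output, workspace, stream);
}
```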
```diff
@@ -150,11 +193,27 @@ void nvte_compute_amax(const NVTETensor input_, const NVTETensor output_, cudaStream_t stream) {
                to_string(output.amax.dtype), ")");
   CheckOutputTensor(output, "output_compute_amax", true);
 
+  // Optional workspace
+  float* block_amax = nullptr;
+  size_t block_capacity = 0;
+
+  if (workspace_ != nullptr) {
+    auto &workspace = *reinterpret_cast<Tensor *>(workspace_);
+    NVTE_CHECK(workspace.data.dptr != nullptr,
+               "Workspace tensor for amax computation has no data");
+    NVTE_CHECK(workspace.data.dtype == DType::kFloat32,
+               "Workspace tensor for amax computation must be FP32, got dtype=",
+               to_string(workspace.data.dtype));
+    block_amax = reinterpret_cast<float*>(workspace.data.dptr);
+    block_capacity = workspace.data.numel();
+  }
+
   // Compute amax
   TRANSFORMER_ENGINE_TYPE_SWITCH_INPUT(
       input.data.dtype, IType, constexpr int nvec = 32 / sizeof(IType);
       launch_amax_kernel<nvec>(reinterpret_cast<const IType *>(input.data.dptr),
-                               reinterpret_cast<float *>(output.amax.dptr), input.data.numel(),
+                               reinterpret_cast<float *>(output.amax.dptr), input.data.numel(), block_amax,
+                               block_capacity,
                                stream););  // NOLINT(*)
 }
```
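The `nvte_use_atomic_amax()` gate referenced in the launcher is also defined outside the shown hunks. Based on the `NVTE_USE_ATOMIC_AMAX` variable named in commit 16d3bf9, a plausible reading, offered only as an assumption, is a cached environment-variable check:

```cpp
// Assumed shape of the opt-out gate; the real definition is not in this
// diff. Setting NVTE_USE_ATOMIC_AMAX=1 would force the previous kernel.
#include <cstdlib>

bool nvte_use_atomic_amax_sketch() {
  static const bool use_atomic = [] {
    const char *env = std::getenv("NVTE_USE_ATOMIC_AMAX");
    return env != nullptr && env[0] == '1';
  }();
  return use_atomic;
}
```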