@@ -381,4 +381,163 @@ __global__ void __launch_bounds__(MXFP8_THREADS_PER_CHUNK)
   }
 }
 
+// Forward declaration of functions defined in `cast_kernels.cuh`
+template <typename IType>
+void reduce_dbias(const float *workspace_ptr, Tensor *dbias, const size_t rows, const size_t cols,
+                  cudaStream_t stream);
+
+template <typename ParamOP, float (*OP)(float, const ParamOP &)>
+void CastVectorizedUnaryKernelLauncher(const Tensor &input, const Tensor *noop, Tensor *output,
+                                       cudaStream_t stream);
+
+template <typename ParamOP, float (*OP)(float, const ParamOP &)>
+void CastVectorizedUnaryGradKernelLauncher(const Tensor &grad, const Tensor *input, Tensor *output,
+                                           cudaStream_t stream);
+
+constexpr size_t TILE_DIM = 32;
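+
+// Partial column-wise reduction used by the dbias path: each block loads a
+// TILE_DIM x TILE_DIM tile into shared memory, sums it along the row dimension,
+// and writes one partial row per block-row; reduce_dbias finishes the reduction.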
+template <typename DTypeReduce>
+__global__ void partial_reduce_kernel(const DTypeReduce *input, float *partial_output, int rows,
+                                      int cols) {
+  __shared__ float tile[TILE_DIM][TILE_DIM];
+
+  int tile_start_col = blockIdx.x * TILE_DIM;
+  int tile_start_row = blockIdx.y * TILE_DIM;
+  int thread_col_in_tile = threadIdx.x;
+  int thread_row_in_tile = threadIdx.y;
+
+  int global_col = tile_start_col + thread_col_in_tile;
+  int global_row = tile_start_row + thread_row_in_tile;
+
+  if (global_row < rows && global_col < cols) {
+    tile[thread_row_in_tile][thread_col_in_tile] =
+        static_cast<float>(input[global_row * cols + global_col]);
+  } else {
+    tile[thread_row_in_tile][thread_col_in_tile] = 0.0f;
+  }
+  __syncthreads();
+
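+  // Tree reduction down the rows of the tile: each step halves the number of active
+  // rows until row 0 holds the column sums for this tile.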
+  for (int stride = TILE_DIM / 2; stride > 0; stride /= 2) {
+    if (thread_row_in_tile < stride) {
+      tile[thread_row_in_tile][thread_col_in_tile] +=
+          tile[thread_row_in_tile + stride][thread_col_in_tile];
+    }
+    __syncthreads();
+  }
+
+  if (thread_row_in_tile == 0 && global_col < cols) {
+    partial_output[blockIdx.y * cols + global_col] = tile[0][thread_col_in_tile];
+  }
+}
+
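+// Two-stage dbias reduction for the ROCm path: partial_reduce_kernel collapses each
+// group of TILE_DIM rows into a single partial row, then the generic reduce_dbias
+// kernel reduces the partial rows and casts to the dbias output type.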
+template <typename DTypeReduce, typename DBiasTypeOut>
+void reduce_dbias_rocm(const DTypeReduce *workspace_ptr, Tensor *dbias, const size_t rows,
+                       const size_t cols, cudaStream_t stream, Tensor *partial_sum_workspace) {
+  dim3 block_dim_partial(TILE_DIM, TILE_DIM);
+  dim3 grid_dim_partial(DIVUP(cols, TILE_DIM), DIVUP(rows, TILE_DIM));
+
+  const size_t partial_rows = grid_dim_partial.y;
+  float *partial_workspace = reinterpret_cast<float *>(partial_sum_workspace->data.dptr);
+
+  partial_reduce_kernel<DTypeReduce><<<grid_dim_partial, block_dim_partial, 0, stream>>>(
+      workspace_ptr, partial_workspace, rows, cols);
+
+  reduce_dbias<DBiasTypeOut>(partial_workspace, dbias, partial_rows, cols, stream);
+}
+
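+// ROCm quantization entry point: for delayed tensor scaling it runs the vectorized
+// cast (and optional dACT) kernels plus, when IS_DBIAS is set, the two-stage dbias
+// reduction above; MXFP8 1D scaling is forwarded to mxfp8_quantize.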
+template <bool IS_DBIAS, bool IS_DACT, bool IS_ACT, typename ParamOP,
+          float (*OP)(float, const ParamOP &)>
+void fp8_quantize_rocm(const Tensor &input, const Tensor *act_input, const Tensor *noop,
+                       Tensor *output, Tensor *dbias, Tensor *workspace, cudaStream_t stream) {
+  switch (output->scaling_mode) {
+    case NVTE_DELAYED_TENSOR_SCALING: {
+      const size_t rows = input.flat_first_dim();
+      const size_t cols = input.flat_last_dim();
+
+      if constexpr (IS_DBIAS) {
+        NVTE_CHECK(dbias, "DBias tensor must be provided when IS_DBIAS is true.");
+        NVTE_CHECK(workspace, "Workspace must be provided when IS_DBIAS is true.");
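+        // A null workspace pointer marks the sizing query: report the required
+        // shape/dtype and return without launching anything.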
+        if (workspace->data.dptr == nullptr) {
+          if constexpr (IS_DACT) {
+            const size_t partial_rows = DIVUP(rows, TILE_DIM);
+            size_t total_elements = (rows * cols) + (partial_rows * cols);
+            workspace->data.shape = {total_elements};
+            workspace->data.dtype = DType::kFloat32;
+          } else {
+            workspace->data.shape = {rows, cols};
+            workspace->data.dtype = DType::kFloat32;
+          }
+          return;
+        }
+
+        const void *ptr_to_reduce = nullptr;
+        DType dtype_to_reduce;
+
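+        // Drop any quantization metadata on the workspace; it is used here only as a
+        // plain float buffer.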
+        workspace->amax = {};
+        workspace->scale = {};
+        workspace->scale_inv = {};
+
+        Tensor workspace_buffer;
+        Tensor partial_sum_buffer;
+
+        if constexpr (IS_DACT) {
+          // The values to reduce are the result of the dAct function.
+          NVTE_CHECK(act_input, "Gradient tensor must be provided for DBias + DACT.");
+
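+          // Split the float32 workspace: the first rows * cols elements receive the
+          // dACT result, the trailing partial_rows * cols elements hold the per-block
+          // partial sums consumed by reduce_dbias_rocm.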
+          const size_t partial_rows = DIVUP(rows, TILE_DIM);
+          const size_t full_size_bytes = rows * cols * sizeof(float);
+          workspace_buffer = *workspace;
+          workspace_buffer.data.shape = {rows, cols};
+          partial_sum_buffer.data.dptr =
+              reinterpret_cast<char *>(workspace->data.dptr) + full_size_bytes;
+          partial_sum_buffer.data.shape = {partial_rows, cols};
+          partial_sum_buffer.data.dtype = DType::kFloat32;
+          workspace = &partial_sum_buffer;
+
+          CastVectorizedUnaryGradKernelLauncher<ParamOP, OP>(input, act_input, &workspace_buffer,
+                                                             stream);
+          if (output && output->data.dptr) {
+            CastVectorizedUnaryKernelLauncher<transformer_engine::Empty, nullptr>(
+                workspace_buffer, noop, output, stream);
+          }
+          ptr_to_reduce = workspace_buffer.data.dptr;
+          dtype_to_reduce = workspace_buffer.data.dtype;
+        } else {
+          if (output && output->data.dptr) {
+            CastVectorizedUnaryKernelLauncher<ParamOP, OP>(input, noop, output, stream);
+          }
+          // The values to reduce are just the input values.
+          ptr_to_reduce = input.data.dptr;
+          dtype_to_reduce = input.data.dtype;
+        }
+
+        NVTE_CHECK(dbias->data.shape == std::vector<size_t>{cols}, "Wrong shape of DBias tensor.");
+
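+        // Dispatch on both the dbias output dtype and the dtype of the buffer being
+        // reduced, then run the two-stage reduction.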
+        TRANSFORMER_ENGINE_TYPE_SWITCH_INPUT(
+            dbias->data.dtype, DBiasTypeOut,
+            TRANSFORMER_ENGINE_TYPE_SWITCH_INPUT(
+                dtype_to_reduce, DTypeReduce,
+                reduce_dbias_rocm<DTypeReduce, DBiasTypeOut>(
+                    reinterpret_cast<const DTypeReduce *>(ptr_to_reduce), dbias, rows, cols,
+                    stream, workspace);
+            );
+        );
+      } else {
+        if (output && output->data.dptr) {
+          if constexpr (IS_DACT) {
+            NVTE_CHECK(act_input, "Gradient tensor must be provided for DACT output.");
+            CastVectorizedUnaryGradKernelLauncher<ParamOP, OP>(input, act_input, output, stream);
+          } else {
+            CastVectorizedUnaryKernelLauncher<ParamOP, OP>(input, noop, output, stream);
+          }
+        }
+      }
+      break;
+    }
+    case NVTE_MXFP8_1D_SCALING: {
+      mxfp8_quantize<IS_DBIAS, IS_DACT, IS_ACT, ParamOP, OP>(input, act_input, noop, output, dbias,
+                                                             workspace, stream);
+      break;
+    }
+    default:
+      NVTE_ERROR("Not implemented scaling mode: " + to_string(output->scaling_mode) + ".");
+  }
+}
+
 }  // namespace transformer_engine