@@ -43,7 +43,7 @@ __device__ inline float sigmoidf(const float x) { return __frcp_rn(1.0f + __expf
 
 template <bool IS_DGATED, typename ParamOP, float (*ActOP)(float, const ParamOP &),
           float (*DActOP)(float, const ParamOP &), typename IType, typename OType,
-          size_t SCALE_DIM_Y, size_t SCALE_DIM_X>
+          size_t SCALE_DIM_Y, size_t SCALE_DIM_X, bool IS_ALIGNED>
 __global__ void __launch_bounds__(THREADS_PER_CHUNK)
     cast_mxfp8_gated_kernel(const IType *grad_ptr,
                             const IType *input_act,
@@ -76,7 +76,7 @@ __global__ void __launch_bounds__(THREADS_PER_CHUNK)
   const int tid_Y = threadIdx.x / THREADS_PER_CHUNK_X;
   const int tid_X = threadIdx.x % THREADS_PER_CHUNK_X;
 
-  constexpr size_t VECTOR_WIDTH = 16 / sizeof(OType);
+  constexpr size_t VECTOR_WIDTH = (IS_ALIGNED ? 2 : 1) * 8 / sizeof(OType);
 
   const int thread_offset_Y = tid_Y;
   const int thread_offset_X = tid_X;
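
With this change the aligned path keeps the original 16-byte vectors while the unaligned path drops to 8-byte ones. A quick compile-time sanity check of the resulting element counts (a standalone sketch; the 1- and 2-byte element sizes stand in for FP8 and BF16 output types):

#include <cstddef>
#include <cstdint>

// Mirrors the kernel's VECTOR_WIDTH computation: 16-byte transactions when
// IS_ALIGNED, 8-byte transactions otherwise.
template <typename OType, bool IS_ALIGNED>
constexpr std::size_t vector_width() {
  return (IS_ALIGNED ? 2 : 1) * 8 / sizeof(OType);
}

static_assert(vector_width<std::uint8_t, true>() == 16);   // FP8, aligned
static_assert(vector_width<std::uint8_t, false>() == 8);   // FP8, unaligned
static_assert(vector_width<std::uint16_t, true>() == 8);   // BF16, aligned
static_assert(vector_width<std::uint16_t, false>() == 4);  // BF16, unaligned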
@@ -136,16 +136,16 @@ __global__ void __launch_bounds__(THREADS_PER_CHUNK)
 
     // Initiate bulk tensor copy
     if constexpr (IS_DGATED) {
-      copy_2d_to_shared<IType, VECTOR_WIDTH, false>(&in_grad_sh[0], grad_ptr, chunk_it_offset_x, chunk_it_offset_y,
+      copy_2d_to_shared<IType, VECTOR_WIDTH, IS_ALIGNED>(&in_grad_sh[0], grad_ptr, chunk_it_offset_x, chunk_it_offset_y,
                                                     cols, SHMEM_DIM_Y, SHMEM_DIM_X, rows, cols);
     }
 
     // Act
-    copy_2d_to_shared<IType, VECTOR_WIDTH, false>(&in_act_sh[0], input_act, chunk_it_offset_x, chunk_it_offset_y,
+    copy_2d_to_shared<IType, VECTOR_WIDTH, IS_ALIGNED>(&in_act_sh[0], input_act, chunk_it_offset_x, chunk_it_offset_y,
                                                   2 * cols, SHMEM_DIM_Y, SHMEM_DIM_X, rows, cols);
 
     // Gate
-    copy_2d_to_shared<IType, VECTOR_WIDTH, false>(&in_gate_sh[0], input_gate, chunk_it_offset_x, chunk_it_offset_y,
+    copy_2d_to_shared<IType, VECTOR_WIDTH, IS_ALIGNED>(&in_gate_sh[0], input_gate, chunk_it_offset_x, chunk_it_offset_y,
                                                   2 * cols, SHMEM_DIM_Y, SHMEM_DIM_X, rows, cols);
 
     __syncthreads();
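
The hard-coded `false` in the copy helpers becomes the kernel-wide `IS_ALIGNED` flag. The helper's internals are not part of this diff; below is only a hypothetical sketch of the kind of compile-time branch such a flag typically selects (name and body invented for illustration, only the template parameters mirror the diff):

#include <cstddef>

// Hypothetical illustration, not the actual copy_2d_to_shared: with
// IS_ALIGNED known at compile time, if constexpr keeps exactly one of the
// two paths, so the aligned instantiation carries no runtime branch.
template <typename T, std::size_t VECTOR_WIDTH, bool IS_ALIGNED>
__device__ void copy_row_sketch(T *dst, const T *src, std::size_t n) {
  if constexpr (IS_ALIGNED) {
    // One wide transaction per VECTOR_WIDTH elements; valid only because
    // the caller guaranteed 16-byte alignment and n % VECTOR_WIDTH == 0.
    struct alignas(sizeof(T) * VECTOR_WIDTH) Vec { T elts[VECTOR_WIDTH]; };
    Vec *vdst = reinterpret_cast<Vec *>(dst);
    const Vec *vsrc = reinterpret_cast<const Vec *>(src);
    for (std::size_t i = threadIdx.x; i < n / VECTOR_WIDTH; i += blockDim.x) {
      vdst[i] = vsrc[i];
    }
  } else {
    // Unaligned fallback: element-wise copies, no alignment assumptions.
    for (std::size_t i = threadIdx.x; i < n; i += blockDim.x) {
      dst[i] = src[i];
    }
  }
}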
@@ -347,19 +347,19 @@ __global__ void __launch_bounds__(THREADS_PER_CHUNK)
     __syncthreads();
 
     if constexpr (USE_ROWWISE_SCALING) {
-      bulk_tensor_2d_shared_to_global<OType, VECTOR_WIDTH, false>(&out_act_rowwise_sh[0], output_act_rowwise, chunk_it_offset_x,
+      bulk_tensor_2d_shared_to_global<OType, VECTOR_WIDTH, IS_ALIGNED>(&out_act_rowwise_sh[0], output_act_rowwise, chunk_it_offset_x,
                                                                   chunk_it_offset_y, output_cols, SHMEM_DIM_Y, SHMEM_DIM_X, rows, cols);
       if constexpr (IS_DGATED) {
-        bulk_tensor_2d_shared_to_global<OType, VECTOR_WIDTH, false>(&out_gate_rowwise_sh[0], output_gate_rowwise, chunk_it_offset_x,
+        bulk_tensor_2d_shared_to_global<OType, VECTOR_WIDTH, IS_ALIGNED>(&out_gate_rowwise_sh[0], output_gate_rowwise, chunk_it_offset_x,
                                                                     chunk_it_offset_y, output_cols, SHMEM_DIM_Y, SHMEM_DIM_X, rows, cols);
       }
     }
 
     if constexpr (USE_COLWISE_SCALING) {
-      bulk_tensor_2d_shared_to_global<OType, VECTOR_WIDTH, false>(&out_act_colwise_sh[0], output_act_colwise, chunk_it_offset_x,
+      bulk_tensor_2d_shared_to_global<OType, VECTOR_WIDTH, IS_ALIGNED>(&out_act_colwise_sh[0], output_act_colwise, chunk_it_offset_x,
                                                                   chunk_it_offset_y, output_cols, SHMEM_DIM_Y, SHMEM_DIM_X, rows, cols);
       if constexpr (IS_DGATED) {
-        bulk_tensor_2d_shared_to_global<OType, VECTOR_WIDTH, false>(&out_gate_colwise_sh[0], output_gate_colwise, chunk_it_offset_x,
+        bulk_tensor_2d_shared_to_global<OType, VECTOR_WIDTH, IS_ALIGNED>(&out_gate_colwise_sh[0], output_gate_colwise, chunk_it_offset_x,
                                                                     chunk_it_offset_y, output_cols, SHMEM_DIM_Y, SHMEM_DIM_X, rows, cols);
       }
     }
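
Since `IS_ALIGNED` is now a template parameter, the alignment decision has to happen once at launch time. A hypothetical host-side dispatch (the helper, the argument bundle, and the 16-byte threshold are assumptions for illustration; the diff itself only shows the kernel side):

#include <cstddef>
#include <cstdint>

struct GatedKernelArgs {  // hypothetical argument bundle
  const void *grad_ptr, *input_act, *input_gate;
  std::size_t rows, cols, elem_size;
};

// True when a buffer's base pointer and row pitch both permit 16-byte
// vectorized accesses.
inline bool aligned_16B(const void *ptr, std::size_t row_bytes) {
  return reinterpret_cast<std::uintptr_t>(ptr) % 16 == 0 &&
         row_bytes % 16 == 0;
}

template <bool IS_ALIGNED>
void launch_gated_kernel(const GatedKernelArgs &args);  // wraps the <<<...>>> launch

void dispatch(const GatedKernelArgs &a) {
  // Act and gate are interleaved, so their row pitch is 2 * cols elements
  // (matching the 2 * cols stride passed to copy_2d_to_shared above).
  const bool ok = aligned_16B(a.grad_ptr, a.cols * a.elem_size) &&
                  aligned_16B(a.input_act, 2 * a.cols * a.elem_size) &&
                  aligned_16B(a.input_gate, 2 * a.cols * a.elem_size);
  if (ok) {
    launch_gated_kernel<true>(a);   // wide 16-byte vectors
  } else {
    launch_gated_kernel<false>(a);  // 8-byte fallback
  }
}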