
Commit fb027d0

fix

Signed-off-by: Pawel Gadzinski <[email protected]>
Parent: a702426

5 files changed: +33 −60 lines
tests/cpp/operator/test_grouped_gemm.cu
Lines changed: 14 additions & 28 deletions

@@ -279,8 +279,6 @@ Tensor make_fp8_operand(const std::string& name, const std::vector<size_t>& shap
 
 Tensor make_bf16_operand(const std::string& name, const std::vector<size_t>& shape) {
   Tensor t(name, shape, DType::kBFloat16);
-  // Fill with ones for easier debugging
-  //fillUniform(&t);
   const size_t numel = shape[0] * shape[1];
   std::vector<__nv_bfloat16> ones(numel, __float2bfloat16(1.0f));
   NVTE_CHECK_CUDA(cudaMemcpy(t.rowwise_dptr(), ones.data(),

@@ -293,8 +291,7 @@ struct TestParams {
   bool transa;
   bool transb;
   ShapeCase shape_case;
-  bool use_null_c = false;             // When true, pass nullptr for C (valid when beta=0)
-  bool use_split_accumulator = false;  // Whether to use split accumulator for FP8 GEMM
+  bool use_null_c = false;  // When true, pass nullptr for C (valid when beta=0)
 };
 
 // Returns a vector of (M, N, K) tuples for each GEMM in the group.

@@ -397,7 +394,7 @@ void run_grouped_gemm_case(const TestParams& params) {
                     false,  // grad
                     workspace_ptrs.data(),
                     false,  // accumulate
-                    params.use_split_accumulator,
+                    false,  // use_split_accumulator
                     0,  // sm_count
                     0);

@@ -450,10 +447,6 @@ void run_grouped_gemm_case(const TestParams& params) {
   Tensor setup_ws("setup_ws", std::vector<size_t>{setup_ws_bytes}, DType::kByte);
   Tensor cublas_ws("cublas_ws", std::vector<size_t>{cublas_ws_bytes}, DType::kByte);
 
-  // Create config with use_split_accumulator setting
-  transformer_engine::GroupedMatmulConfigWrapper config;
-  config.set_use_split_accumulator(params.use_split_accumulator);
-
   nvte_grouped_gemm(params.transa,
                     params.transb,
                     alpha_tensor.data(),

@@ -464,7 +457,7 @@ void run_grouped_gemm_case(const TestParams& params) {
                     grouped_D.get_handle(),
                     setup_ws.data(),
                     cublas_ws.data(),
-                    config,
+                    nullptr,  // config (use defaults)
                     0);
 
   for (size_t i = 0; i < num_gemms; ++i) {

@@ -502,29 +495,22 @@ std::string MakeGroupedGemmTestName(const testing::TestParamInfo<GroupedGemmTest
   const std::string layout = std::string("ta") + (info.param.transa ? "T" : "N") +
                              "tb" + (info.param.transb ? "T" : "N");
   const std::string null_c = info.param.use_null_c ? "_NullC" : "";
-  const std::string split_acc = info.param.use_split_accumulator ? "_SplitAcc" : "";
   return std::string(kInputNames[static_cast<int>(info.param.input_case)]) + "_" +
-         kShapeNames[static_cast<int>(info.param.shape_case)] + "_" + layout + null_c + split_acc;
+         kShapeNames[static_cast<int>(info.param.shape_case)] + "_" + layout + null_c;
 }
 
-// TestParams: {input_case, transa, transb, shape_case, use_null_c, use_split_accumulator}
+// TestParams: {input_case, transa, transb, shape_case, use_null_c}
 const std::vector<TestParams> kTestParams = {
-    // Basic tests (no split accumulator)
-    {InputCase::kFP8Current, true, false, ShapeCase::kAllDifferent, false, false},
-    {InputCase::kFP8Current, false, true, ShapeCase::kAllDifferent, false, false},
-    {InputCase::kFP8Current, false, false, ShapeCase::kAllSame, false, false},
-    {InputCase::kBF16, true, false, ShapeCase::kSameFirst, false, false},
-    {InputCase::kBF16, false, true, ShapeCase::kSameLast, false, false},
-    {InputCase::kBF16, false, false, ShapeCase::kAllSame, false, false},
-    {InputCase::kBF16, true, true, ShapeCase::kAllDifferent, false, false},
+    // Basic tests
+    {InputCase::kFP8Current, true, false, ShapeCase::kAllDifferent, false},
+    {InputCase::kFP8Current, false, true, ShapeCase::kAllDifferent, false},
+    {InputCase::kFP8Current, false, false, ShapeCase::kAllSame, false},
+    {InputCase::kBF16, true, false, ShapeCase::kSameFirst, false},
+    {InputCase::kBF16, false, true, ShapeCase::kSameLast, false},
+    {InputCase::kBF16, false, false, ShapeCase::kAllSame, false},
+    {InputCase::kBF16, true, true, ShapeCase::kAllDifferent, false},
     // Test NULL C (valid when beta=0)
-    {InputCase::kBF16, false, false, ShapeCase::kAllSame, true, false},
-
-    // Split accumulator tests
-    {InputCase::kFP8Current, true, false, ShapeCase::kAllDifferent, false, true},
-    {InputCase::kFP8Current, false, true, ShapeCase::kAllDifferent, false, true},
-    {InputCase::kFP8Current, false, false, ShapeCase::kAllSame, false, true},
-    {InputCase::kFP8Current, true, false, ShapeCase::kSameFirst, false, true},
+    {InputCase::kBF16, false, false, ShapeCase::kAllSame, true},
 };
 
 INSTANTIATE_TEST_SUITE_P(OperatorTest,
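
For a concrete picture of the trimmed naming scheme: assuming kInputNames[static_cast<int>(InputCase::kFP8Current)] is "FP8Current" and kShapeNames[static_cast<int>(ShapeCase::kAllDifferent)] is "AllDifferent" (neither table is visible in this diff), MakeGroupedGemmTestName now yields names like:

    // Hypothetical names for two of the remaining kTestParams entries:
    //   {InputCase::kFP8Current, true,  false, ShapeCase::kAllDifferent, false} -> "FP8Current_AllDifferent_taTtbN"
    //   {InputCase::kBF16,       false, false, ShapeCase::kAllSame,      true } -> "BF16_AllSame_taNtbN_NullC"
    // The "_SplitAcc" suffix can no longer appear.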

transformer_engine/common/gemm/config.cpp
Lines changed: 0 additions & 6 deletions

@@ -154,9 +154,6 @@ void nvte_get_grouped_matmul_config_attribute(NVTEGroupedMatmulConfig config,
     case kNVTEGroupedMatmulConfigAvgK:
       std::memcpy(buf, &config_.avg_k, attr_size);
       break;
-    case kNVTEGroupedMatmulConfigUseSplitAccumulator:
-      std::memcpy(buf, &config_.use_split_accumulator, attr_size);
-      break;
     case kNVTEGroupedMatmulConfigSMCount:
       std::memcpy(buf, &config_.sm_count, attr_size);
       break;

@@ -195,9 +192,6 @@ void nvte_set_grouped_matmul_config_attribute(NVTEGroupedMatmulConfig config,
       std::memcpy(&config_.avg_k, buf, attr_size);
       config_.avg_k_set = true;
       break;
-    case kNVTEGroupedMatmulConfigUseSplitAccumulator:
-      std::memcpy(&config_.use_split_accumulator, buf, attr_size);
-      break;
     case kNVTEGroupedMatmulConfigSMCount:
      std::memcpy(&config_.sm_count, buf, attr_size);
      break;
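
For reference, a minimal caller-side sketch of the surviving attribute round-trip. The old test passed a GroupedMatmulConfigWrapper directly where an NVTEGroupedMatmulConfig is expected, which suggests an implicit conversion; that conversion and the exact C prototypes (inferred from the memcpy bodies above) are assumptions:

    // Sketch only: set sm_count through the wrapper, read it back through the C API.
    transformer_engine::GroupedMatmulConfigWrapper config;
    config.set_sm_count(132);  // hypothetical value
    int check = 0;
    nvte_get_grouped_matmul_config_attribute(config, kNVTEGroupedMatmulConfigSMCount,
                                             &check, sizeof(int));  // expect check == 132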

transformer_engine/common/gemm/config.h
Lines changed: 0 additions & 4 deletions

@@ -40,9 +40,6 @@ struct GroupedMatmulConfig {
   int64_t avg_n = 0;
   int64_t avg_k = 0;
 
-  // Whether to use split accumulator for FP8 GEMM (more accurate but slower)
-  bool use_split_accumulator = true;
-
   // Number of streaming multiprocessors to use in GEMM kernel
   int sm_count = 0;
 
@@ -55,7 +52,6 @@ struct GroupedMatmulConfig {
       sizeof(int64_t),  // avg_m
       sizeof(int64_t),  // avg_n
       sizeof(int64_t),  // avg_k
-      sizeof(bool),     // use_split_accumulator
       sizeof(int)       // sm_count
   };
 };
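
Because this per-attribute size table is indexed by NVTEGroupedMatmulConfigAttribute, the sizeof(bool) entry has to disappear together with the enum value. A consistency-check sketch, assuming the table is a static constexpr array member named attr_sizes (only its initializer is visible in this diff):

    #include <iterator>  // std::size

    static_assert(std::size(GroupedMatmulConfig::attr_sizes) ==
                      kNVTEGroupedMatmulConfigNumAttributes,
                  "attr_sizes must stay in sync with NVTEGroupedMatmulConfigAttribute");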

transformer_engine/common/gemm/cublaslt_grouped_gemm.cu
Lines changed: 18 additions & 13 deletions

@@ -310,15 +310,17 @@ inline void init_matrix_layouts(cublasLtMatrixLayoutOpaque_t &descA,
 
   // For column-major layout: leading dimension is the number of rows in storage.
   // If columnwise data was chosen, storage is already transposed.
-  int *rowa = A_sel.use_columnwise ? ws.M : (A_sel.trans ? ws.K : ws.M);
-  int *cola = A_sel.use_columnwise ? ws.K : (A_sel.trans ? ws.M : ws.K);
-  int *lda = rowa;
-  int *rowb = B_sel.use_columnwise ? ws.N : (B_sel.trans ? ws.N : ws.K);
-  int *colb = B_sel.use_columnwise ? ws.K : (B_sel.trans ? ws.K : ws.N);
-  int *ldb = rowb;
-
-  NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descA, A_type, num_tensors, rowa, cola, lda));
-  NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descB, B_type, num_tensors, rowb, colb, ldb));
+  // Storage dimensions for A: rows_A x cols_A with leading dimension lda_storage
+  int *rows_A = A_sel.use_columnwise ? ws.M : (A_sel.trans ? ws.K : ws.M);
+  int *cols_A = A_sel.use_columnwise ? ws.K : (A_sel.trans ? ws.M : ws.K);
+  int *lda_storage = rows_A;
+  // Storage dimensions for B: rows_B x cols_B with leading dimension ldb_storage
+  int *rows_B = B_sel.use_columnwise ? ws.N : (B_sel.trans ? ws.N : ws.K);
+  int *cols_B = B_sel.use_columnwise ? ws.K : (B_sel.trans ? ws.K : ws.N);
+  int *ldb_storage = rows_B;
+
+  NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descA, A_type, num_tensors, rows_A, cols_A, lda_storage));
+  NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descB, B_type, num_tensors, rows_B, cols_B, ldb_storage));
   NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descC, D_type, num_tensors, ws.M, ws.N, ws.M));
   NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descD, D_type, num_tensors, ws.M, ws.N, ws.M));
 }

@@ -442,14 +444,15 @@ __global__ void setup_grouped_gemm_kernel(
       D_meta.offsets ? D_meta.offsets[idx] : (idx * D_meta.uniform_first * D_meta.uniform_last);
 
   // Compute data pointers
+  // Note: const_cast is safe here - cuBLAS requires void** but won't modify A/B/C data
   A_ptrs[idx] = const_cast<char *>(a_base) + a_offset * a_elem_size;
   B_ptrs[idx] = const_cast<char *>(b_base) + b_offset * b_elem_size;
   C_ptrs[idx] = const_cast<char *>(c_base) + c_offset * c_elem_size;
   D_ptrs[idx] = d_base + d_offset * d_elem_size;
 
-  // Compute M, N, K dimensions
-  // Test stores A as {K,M} when !transa, {M,K} when transa
-  // Test stores B as {N,K} when !transb, {K,N} when transb
+  // Compute M, N, K dimensions from tensor shapes
+  // Input A is stored as {K,M} when !transa, {M,K} when transa
+  // Input B is stored as {N,K} when !transb, {K,N} when transb
   M[idx] = static_cast<int>(transa ? a_first : a_last);
   K[idx] = static_cast<int>(transa ? a_last : a_first);
   N[idx] = static_cast<int>(transb ? b_last : b_first);

@@ -570,9 +573,11 @@ void nvte_grouped_gemm(int transa, int transb, const NVTETensor alpha, const NVT
 
   // Set fast accumulation mode for FP8
   // Fast accumulation: 0 = split accumulator (more accurate), 1 = fast accumulator
+  // Note: cuBLASLt grouped GEMM API does not support configurable split accumulator,
+  // we always use fast accumulator for performance.
   const bool is_fp8 = is_fp8_dtype(A_sel.dtype) || is_fp8_dtype(B_sel.dtype);
   if (is_fp8) {
-    int8_t fastAccuMode = config_.use_split_accumulator ? 0 : 1;
+    int8_t fastAccuMode = 1;  // Always use fast accumulator
     NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
         &matmulDesc, CUBLASLT_MATMUL_DESC_FAST_ACCUM, &fastAccuMode, sizeof(fastAccuMode)));
   }
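
A worked example of the renamed layout selection, with illustrative numbers not taken from the diff. For GEMM i with M=128, N=64, K=256:

    // !A_sel.use_columnwise, !A_sel.trans: A stored column-major as M x K
    //   rows_A[i] = 128 (M), cols_A[i] = 256 (K), lda_storage[i] = 128
    // !A_sel.use_columnwise, A_sel.trans: A stored column-major as K x M
    //   rows_A[i] = 256 (K), cols_A[i] = 128 (M), lda_storage[i] = 256
    // A_sel.use_columnwise: storage is already transposed, so rows_A = ws.M and
    //   cols_A = ws.K regardless of A_sel.trans.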

transformer_engine/common/include/transformer_engine/gemm.h
Lines changed: 1 addition & 9 deletions

@@ -82,10 +82,8 @@ enum NVTEGroupedMatmulConfigAttribute {
    * computed automatically from A's logical shape.
    */
   kNVTEGroupedMatmulConfigAvgK = 2,
-  /*! Whether to use split accumulator for FP8 GEMM. */
-  kNVTEGroupedMatmulConfigUseSplitAccumulator = 3,
   /*! Number of streaming multiprocessors to use in GEMM kernel. */
-  kNVTEGroupedMatmulConfigSMCount = 4,
+  kNVTEGroupedMatmulConfigSMCount = 3,
   kNVTEGroupedMatmulConfigNumAttributes
 };

@@ -487,12 +485,6 @@ class GroupedMatmulConfigWrapper {
                                              sizeof(int64_t));
   }
 
-  /*! \brief Set whether to use split accumulator for FP8 GEMM. */
-  void set_use_split_accumulator(bool use_split_accumulator) {
-    nvte_set_grouped_matmul_config_attribute(config_, kNVTEGroupedMatmulConfigUseSplitAccumulator,
-                                             &use_split_accumulator, sizeof(bool));
-  }
-
   /*! \brief Set number of streaming multiprocessors to use. */
   void set_sm_count(int sm_count) {
     nvte_set_grouped_matmul_config_attribute(config_, kNVTEGroupedMatmulConfigSMCount,
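
With set_use_split_accumulator() removed, a caller migrating across this change keeps only the remaining setters. A minimal sketch (set_sm_count is shown above; the sm_count value is hypothetical):

    transformer_engine::GroupedMatmulConfigWrapper config;
    config.set_sm_count(132);  // hypothetical value
    // Pass `config` to nvte_grouped_gemm, or pass nullptr to use defaults,
    // as the updated test in this commit now does.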
