
Commit 4a757de

Fix attention bugs (swap thread and iter when Q LDS is bypassed and bf16 tests) (#1797)
* Fix some attention bugs:
  - do not swap thread and iter subdims for Q if we are bypassing LDS
  - use f32 accumulation for attention in the CPU code
  - fix a bug in maskKVCacheTosa for bf16
1 parent 2f4cb84 commit 4a757de

4 files changed, +72 -44 lines changed


mlir/lib/Dialect/Rock/Transforms/GridwiseGemmToBlockwise.cpp

Lines changed: 6 additions & 0 deletions
@@ -868,6 +868,9 @@ struct GridwiseAttentionAccelRewritePattern
         return failure();
       }
     } else {
+      assert(!ldsLayoutCfg.doSwapThreadIterSubDims &&
+             "doSwapThreadIterSubDims must be false if the destination buffer "
+             "is private memory");
       accel::AccelEmitterParams accelEmitterParams = accelEmitter.getParams();
       int64_t dRepeats = (nonKDimName == "m" ? accelEmitterParams.mRepeats
                                              : accelEmitterParams.nRepeats);
@@ -1754,6 +1757,9 @@ struct GridwiseAttentionAccelRewritePattern
     }
     LDSLayoutConfigDim ldsLayoutCfgNG0 = getLDSLayoutConfigDim(
         elemTypeQ, gemm0kpack, maybeVectorDimInfoQ.value());
+    if (doBypassLDSForQ) {
+      ldsLayoutCfgNG0.doSwapThreadIterSubDims = false;
+    }
     FailureOr<VectorDimInfo> maybeVectorDimInfoK =
         getVectorDim(rewriter, loc, inK, elemTypeK, blockSize, gemm0KPerBlock,
                      gemm0MPerBlock, gemm0kpack);

mlir/test/e2e/PrAttentionBF16.toml

Lines changed: 12 additions & 12 deletions
@@ -51,33 +51,33 @@ config = "-seq_len_q 384 -seq_len_k 384 -head_dim_qk 64 -head_dim_v 64 -perf_con
 config = "-seq_len_q 384 -seq_len_k 384 -head_dim_qk 64 -head_dim_v 64 -perf_config attn:v1:32,32,64,8,16,16,8,1"

 # check scale
-#[[suite.test]]
-#config = "-seq_len_q 384 -seq_len_k 384 -head_dim_qk 64 -head_dim_v 64 --with-attn-scale"
+[[suite.test]]
+config = "-seq_len_q 384 -seq_len_k 384 -head_dim_qk 64 -head_dim_v 64 --with-attn-scale"

 # check bias
 [[suite.test]]
 config = "-seq_len_q 384 -seq_len_k 384 -head_dim_qk 64 -head_dim_v 64 --with-attn-bias"

 # check scale and bias together
-#[[suite.test]]
-#config = "-seq_len_q 384 -seq_len_k 384 -head_dim_qk 64 -head_dim_v 64 --with-attn-scale --with-attn-bias"
+[[suite.test]]
+config = "-seq_len_q 384 -seq_len_k 384 -head_dim_qk 64 -head_dim_v 64 --with-attn-scale --with-attn-bias"

 # cross attention
 [[suite.test]]
 config = "-seq_len_q 128 -seq_len_k 27 -head_dim_qk 64 -head_dim_v 32 --with-attn-scale --with-attn-bias"

 # issue 1661
-#[[suite.test]]
-#config = "-seq_len_q 1 -seq_len_k 384 -head_dim_qk 64 -head_dim_v 64 --with-attn-scale --with-attn-bias"
+[[suite.test]]
+config = "-seq_len_q 1 -seq_len_k 384 -head_dim_qk 64 -head_dim_v 64 --with-attn-scale --with-attn-bias"

 # GQA
-#[[suite.test]]
-#config = "-num_heads_q 4 -num_heads_kv 2 -seq_len_q 384 -seq_len_k 384 -head_dim_qk 64 -head_dim_v 64 --with-attn-scale --with-attn-bias"
+[[suite.test]]
+config = "-num_heads_q 4 -num_heads_kv 2 -seq_len_q 384 -seq_len_k 384 -head_dim_qk 64 -head_dim_v 64 --with-attn-scale --with-attn-bias"

 # GQA + KV Cache
-#[[suite.test]]
-#config = "-rand 1 -current_seq_len=17 -num_heads_q 4 -num_heads_kv 2 -seq_len_q 1 -seq_len_k 384 -head_dim_qk 64 -head_dim_v 64 --with-attn-scale --with-attn-bias"
+[[suite.test]]
+config = "-rand 1 -current_seq_len=17 -num_heads_q 4 -num_heads_kv 2 -seq_len_q 1 -seq_len_k 384 -head_dim_qk 64 -head_dim_v 64 --with-attn-scale --with-attn-bias"

 # GQA + KV Cache batch=3
-#[[suite.test]]
-#config = "-rand 1 -current_seq_len=17,1,32 -g 3 -num_heads_q 4 -num_heads_kv 2 -seq_len_q 1 -seq_len_k 384 -head_dim_qk 64 -head_dim_v 64 --with-attn-scale --with-attn-bias"
+[[suite.test]]
+config = "-rand 1 -current_seq_len=17,1,32 -g 3 -num_heads_q 4 -num_heads_kv 2 -seq_len_q 1 -seq_len_k 384 -head_dim_qk 64 -head_dim_v 64 --with-attn-scale --with-attn-bias"
Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+// RUN: rocmlir-gen --arch %arch --operation attention -t f16 -seq_len_q 8 -seq_len_k 8 -head_dim_qk 8 -head_dim_v 8 -perf_config attn:v1:32,32,64,8,16,16,8,1 --transQ=true --transK=true --transV=false --transO=false -rand 1 -rand_type int -pv -relDiff_threshold 0.02 -RMS_threshold 0.015 | rocmlir-driver -c | mlir-runner -O2 --shared-libs=%linalg_test_lib_dir/libmlir_rocm_runtime%shlibext,%conv_validation_wrapper_library_dir/libconv-validation-wrappers%shlibext,%linalg_test_lib_dir/libmlir_runner_utils%shlibext,%linalg_test_lib_dir/libmlir_float16_utils%shlibext --entry-point-result=void | FileCheck %s
+// RUN: rocmlir-gen --arch %arch --operation attention -t f16 -seq_len_q 8 -seq_len_k 8 -head_dim_qk 8 -head_dim_v 8 -perf_config attn:v1:32,32,64,8,32,32,8,1 --transQ=true --transK=true --transV=false --transO=false -rand 1 -rand_type int -pv -relDiff_threshold 0.02 -RMS_threshold 0.015 | rocmlir-driver -c | mlir-runner -O2 --shared-libs=%linalg_test_lib_dir/libmlir_rocm_runtime%shlibext,%conv_validation_wrapper_library_dir/libconv-validation-wrappers%shlibext,%linalg_test_lib_dir/libmlir_runner_utils%shlibext,%linalg_test_lib_dir/libmlir_float16_utils%shlibext --entry-point-result=void | FileCheck %s
+
+// CHECK: [1 1 1]

mlir/tools/rocmlir-gen/rocmlir-gen.cpp

Lines changed: 50 additions & 32 deletions
@@ -2317,9 +2317,9 @@ getAttentionDimNames(SmallVectorImpl<SmallVector<StringRef>> &result,
   else
     result.emplace_back(SmallVector<StringRef>{gName, seqQName, headQKName});
   if (transposeK)
-    result.emplace_back(SmallVector<StringRef>{gName, headQKName, seqKName});
-  else
     result.emplace_back(SmallVector<StringRef>{gName, seqKName, headQKName});
+  else
+    result.emplace_back(SmallVector<StringRef>{gName, headQKName, seqKName});
   if (transposeV)
     result.emplace_back(SmallVector<StringRef>{gName, headVName, seqKName});
   else
@@ -2369,9 +2369,8 @@ Value addTensorArgToBlock(OpBuilder &builder, Location loc,
   return funcArgTensor;
 }

-template <typename T>
 static Value maskKVCacheTosa(OpBuilder builder, Location loc, Value inputTensor,
-                             Value currentSeqLenVal, T initValue) {
+                             Value currentSeqLenVal, float initValue) {
   // inputTensor is [B*NUM_HEADS, SEQ_LEN_Q, SEQ_LEN_KV], we want to reshape to
   // [B, NUM_HEADS, SEQ_LEN_Q, SEQ_LEN_KV]
   auto origType = cast<RankedTensorType>(inputTensor.getType());
@@ -2423,28 +2422,16 @@ static Value maskKVCacheTosa(OpBuilder builder, Location loc, Value inputTensor,
                              currentSeqLenBroadcast);

   // create a tensor with a single value and broadcast it
-  DenseElementsAttr initValueAttr;
-  if constexpr (std::is_same_v<T, int32_t>) {
-    assert(inpType.getElementType() == builder.getI32Type());
-    initValueAttr = DenseIntElementsAttr::get(
-        RankedTensorType::get(inpShape, inpType.getElementType()), initValue);
-  } else if constexpr (std::is_same_v<T, float>) {
-    assert(inpType.getElementType() == builder.getF32Type() ||
-           inpType.getElementType() == builder.getF16Type());
-    llvm::APFloat fpVal(initValue);
-    if (inpType.getElementType() == builder.getF16Type()) {
-      bool losesInfo = false;
-      auto status =
-          fpVal.convert(llvm::APFloat::IEEEhalf(),
-                        llvm::APFloat::rmNearestTiesToEven, &losesInfo);
-      assert(status == llvm::APFloat::opOK);
-    }
-    initValueAttr = DenseFPElementsAttr::get(
-        RankedTensorType::get(inpShape, inpType.getElementType()), fpVal);
-  } else {
-    static_assert(!std::is_same_v<T, T>,
-                  "Unsupported type for MLIR type mapping");
-  }
+  assert(isa<FloatType>(inpType.getElementType()));
+  std::pair<APFloat, llvm::detail::opStatus> floatRes =
+      rock::createAPFloat(inpType.getElementType(), initValue);
+  APFloat fpVal = floatRes.first;
+  auto status = floatRes.second;
+  assert(status == APFloat::opOK);
+
+  DenseElementsAttr initValueAttr = DenseFPElementsAttr::get(
+      RankedTensorType::get(inpShape, inpType.getElementType()), fpVal);
+
   Value initVal = builder.create<tosa::ConstOp>(loc, initValueAttr.getType(),
                                                 initValueAttr);

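This is the bf16 part of the fix: the removed path only knew how to narrow the fill value to IEEE half and asserted that the element type was f32 or f16, so a bf16 mask value never got a correct conversion. The new code delegates to rock::createAPFloat, which lives elsewhere in the repo; as a rough illustration of the conversion such a helper has to perform (the name, signature, and body below are ours, not the actual implementation), converting into the element type's own float semantics covers f16, bf16, and f32 uniformly:

#include <utility>
#include "mlir/IR/BuiltinTypes.h"
#include "llvm/ADT/APFloat.h"

// Illustrative sketch only: convert a host float into the APFloat semantics of
// an MLIR float element type (f16, bf16, f32, ...). rock::createAPFloat is the
// repo's real helper; this just shows the APFloat conversion involved.
static std::pair<llvm::APFloat, llvm::APFloat::opStatus>
convertToElementSemantics(mlir::FloatType elemType, float initValue) {
  llvm::APFloat fpVal(initValue); // starts out in IEEE single precision
  bool losesInfo = false;
  llvm::APFloat::opStatus status =
      fpVal.convert(elemType.getFloatSemantics(),
                    llvm::APFloat::rmNearestTiesToEven, &losesInfo);
  return {fpVal, status};
}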
@@ -2809,6 +2796,18 @@ static Value transposeMatrix(OpBuilder &builder, Location loc, Value src,
   return createOpAndInfer<tosa::TransposeOp>(builder, loc, elemType, src, perm);
 }

+static Type getAccType(Type inputType, OpBuilder builder) {
+  Type accType;
+  if (isa<FloatType>(inputType)) {
+    accType = builder.getF32Type();
+  } else if (isa<IntegerType>(inputType)) {
+    accType = builder.getI32Type();
+  } else {
+    llvm_unreachable("not expected type");
+  }
+  return accType;
+}
+
 static func::FuncOp createCpuAttentionKernelWithMlir(ModuleOp module,
                                                      const GenParams &params) {
   MLIRContext *ctx = module.getContext();
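A quick usage note on the new helper: it maps any float element type to an f32 accumulator and any integer element type to i32. The calls below are illustrative only, not part of the diff:

// Illustrative only: pick the 32-bit accumulator type for a given element type.
Type accForBF16 = getAccType(builder.getBF16Type(), builder);     // -> f32
Type accForInt8 = getAccType(builder.getIntegerType(8), builder); // -> i32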
@@ -2880,9 +2879,18 @@ static func::FuncOp createCpuAttentionKernelWithMlir(ModuleOp module,
   auto keysZp =
       tosa::createZeroPointTensor(builder, loc, keysTensor.getType(), 0)
           .value();
-  Value qkTensor = createOpAndInfer<tosa::MatMulOp>(
-      builder, loc, firstGemmOutElemType, queriesTensor, keysTensor, queriesZp,
-      keysZp);
+  // TODO: if/when tosa::matmul has acc_type implemented, we can use it here to
+  // be more similar to what the gpu code does
+  // accumulate in 32 bit
+  Type firstAccType = getAccType(firstGemmOutElemType, builder);
+  assert(firstAccType == getAccType(params.types[1], builder));
+  Value qkTensorBeforeConversion = createOpAndInfer<tosa::MatMulOp>(
+      builder, loc, firstAccType, queriesTensor, keysTensor, queriesZp, keysZp);
+  Value qkTensor = builder.createOrFold<tosa::CastOp>(
+      loc,
+      cast<ShapedType>(qkTensorBeforeConversion.getType())
+          .clone(firstGemmOutElemType),
+      qkTensorBeforeConversion);

   // get currentSeqLenTensor
   Value currentSeqLenTensor;
@@ -2995,9 +3003,19 @@ static func::FuncOp createCpuAttentionKernelWithMlir(ModuleOp module,
   auto valuesZp =
       tosa::createZeroPointTensor(builder, loc, valuesTensor.getType(), 0)
          .value();
-  Value resultTensor = createOpAndInfer<tosa::MatMulOp>(
-      builder, loc, resultOutElementType, softmaxTensor, valuesTensor,
-      softmaxZp, valuesZp);
+
+  // TODO: if/when tosa::matmul has acc_type implemented, we can use it here to
+  // be more similar to what the gpu code does
+  // accumulate in 32 bit
+  Type secondAccType = getAccType(resultOutElementType, builder);
+  Value resultTensorBeforeConversion = createOpAndInfer<tosa::MatMulOp>(
+      builder, loc, secondAccType, softmaxTensor, valuesTensor, softmaxZp,
+      valuesZp);
+  Value resultTensor = builder.createOrFold<tosa::CastOp>(
+      loc,
+      cast<ShapedType>(resultTensorBeforeConversion.getType())
+          .clone(resultOutElementType),
+      resultTensorBeforeConversion);

   if (transposeO) {
     resultTensor = transposeMatrix(builder, loc, resultTensor, {0, 2, 1});
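The motivation for the matmul-then-cast change above (the "use f32 accumulation for attention in the CPU code" bullet of the commit message) is that the GPU kernels accumulate these gemms in 32 bits, so a CPU reference that accumulates directly in the narrow element type drifts away from them, most visibly for bf16 with its 8-bit mantissa. A small standalone illustration of that drift follows; it is plain C++, not the repo's code, and bf16 rounding is emulated by truncating a float's low mantissa bits:

#include <cstdint>
#include <cstdio>
#include <cstring>

// Round a float to bf16 precision: keep the top 16 bits of the IEEE-754
// encoding, rounding to nearest (ties to even) on the dropped bits.
static float roundToBF16(float x) {
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  bits += 0x7FFF + ((bits >> 16) & 1);
  bits &= 0xFFFF0000u;
  float out;
  std::memcpy(&out, &bits, sizeof(out));
  return out;
}

int main() {
  // A 64-element dot product, i.e. the reduction size of a head_dim_qk=64 gemm.
  float accF32 = 0.0f;  // 32-bit accumulation, as the GPU kernels do
  float accBF16 = 0.0f; // accumulation rounded back to bf16 after every step
  for (int i = 0; i < 64; ++i) {
    float a = roundToBF16(0.11f * (i % 7 + 1));
    float b = roundToBF16(0.13f * (i % 5 + 1));
    accF32 += a * b;
    accBF16 = roundToBF16(accBF16 + a * b);
  }
  std::printf("f32 acc = %f, bf16 acc = %f\n", accF32, accBF16);
  return 0;
}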
