Commit fb19a56

Support causal flash attention (#2425)
This PR adds support for causal flash attention (FA):

- Keeps the encoding on row-vector tensor operations, as these must be left untouched when lowering to the SIMT program.
- Extends the pattern-matching helper that determines whether a tensor is transposed so that it looks through advance operations. (The second attention loop uses a transposed tensor pointer that is `tt.advance`'d between the loops.)

---------

Signed-off-by: Julian Oppermann <[email protected]>
1 parent b12d0dd commit fb19a56
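For context, the second bullet boils down to the condensed C++ sketch below, distilled from the MatchTargetSize.cpp hunk further down. The helper name `isTransposedTensorPtr` is hypothetical, and the surrounding pattern matching (walking from dot operands through loop arguments and glue ops to the load pointer) is elided; only the new `tt.advance` case of `getTransposeFlagFromValue` is shown.

```cpp
// Sketch only: assumes the usual MLIR / Triton dialect headers and the
// `tt` namespace alias used throughout MatchTargetSize.cpp.
#include "mlir/IR/Value.h"
#include "triton/Dialect/Triton/IR/Dialect.h"

using namespace mlir;
namespace tt = triton;

// Hypothetical, reduced version of getTransposeFlagFromValue.
static bool isTransposedTensorPtr(Value ptr) {
  // A block pointer is transposed when its second-to-last order entry is not
  // 1, e.g. tt.make_tensor_ptr ... {order = array<i32: 0, 1>}, as for the K
  // operand of the attention kernel.
  if (auto makePtr = ptr.getDefiningOp<tt::MakeTensorPtrOp>()) {
    ArrayRef<int32_t> order = makePtr.getOrder();
    return order[order.size() - 2] != 1;
  }

  // New in this PR: tt.advance only shifts the block offsets and preserves
  // the layout, so look through it. This covers the second attention loop,
  // whose pointer is tt.advance'd between the two loops.
  if (auto advance = ptr.getDefiningOp<tt::AdvanceOp>())
    return isTransposedTensorPtr(advance.getPtr());

  return false;
}
```

The new test case in match-target-size.mlir exercises exactly this situation: the K pointer is created with `order = array<i32: 0, 1>` and advanced between the two loops, and the loads/advances in both loops are expected to keep the transposed 16x16 sub-tile shape.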

File tree: 3 files changed, +134 −56 lines

benchmarks/triton_kernels_benchmark/flash_attention_fwd_benchmark.py

Lines changed: 11 additions & 16 deletions
@@ -269,23 +269,18 @@ def benchmark(Z, H, N_CTX, D_HEAD, CAUSAL, provider):
                                                               quantiles=quantiles)
 
     elif provider == 'triton':
-        # FIXME: remove below if condition when extend attention support for Causal = True done
-        # https://github.com/intel/intel-xpu-backend-for-triton/issues/1102
-        if os.environ.get('TRITON_INTEL_ADVANCED_PATH', '0') == '1' and CAUSAL:
-            min_ms, max_ms, mean, cv = (float('inf'), ) * 4
+        triton_fn = lambda: forward(q, k, v, CAUSAL, sm_scale)
+        if benchmark_suit.USE_IPEX_OPTION:
+            torch_fn = lambda: torch.nn.functional.scaled_dot_product_attention(
+                q, k, v, attn_mask=None, dropout_p=0.0, is_causal=CAUSAL, scale=sm_scale).to(torch.float32)
         else:
-            triton_fn = lambda: forward(q, k, v, CAUSAL, sm_scale)
-            if benchmark_suit.USE_IPEX_OPTION:
-                torch_fn = lambda: torch.nn.functional.scaled_dot_product_attention(
-                    q, k, v, attn_mask=None, dropout_p=0.0, is_causal=CAUSAL, scale=sm_scale).to(torch.float32)
-            else:
-                # FIXME: use torch sdpa for result check after https://github.com/intel/intel-xpu-backend-for-triton/issues/2042 fixed
-                torch_fn = lambda: torch.nn.functional.scaled_dot_product_attention(q.cpu(), k.cpu(), v.cpu(
-                ), attn_mask=None, dropout_p=0.0, is_causal=CAUSAL, scale=sm_scale).to(torch.float32)
-            atol = 1e-1 if N_CTX == 16384 else 1e-2
-            benchmark_suit.assert_close(triton_fn(), torch_fn(), atol=atol, rtol=1e-3, err_msg='triton to torch')
-            _, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(triton_fn, warmup=10, rep=10, quantiles=quantiles,
-                                                                  kernel_name='_attn_fwd')
+            # FIXME: use torch sdpa for result check after https://github.com/intel/intel-xpu-backend-for-triton/issues/2042 fixed
+            torch_fn = lambda: torch.nn.functional.scaled_dot_product_attention(q.cpu(), k.cpu(), v.cpu(
+            ), attn_mask=None, dropout_p=0.0, is_causal=CAUSAL, scale=sm_scale).to(torch.float32)
+        atol = 1e-1 if N_CTX == 16384 else 1e-2
+        benchmark_suit.assert_close(triton_fn(), torch_fn(), atol=atol, rtol=1e-3, err_msg='triton to torch')
+        _, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(triton_fn, warmup=10, rep=10, quantiles=quantiles,
+                                                              kernel_name='_attn_fwd')
 
     elif provider == 'xetla':
         module_name = f'flash_attn_causal_{CAUSAL}'.lower()

test/TritonIntelGPU/match-target-size.mlir

Lines changed: 75 additions & 8 deletions
@@ -537,14 +537,14 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 :
     // CHECK: %[[BC1:.*]] = triton_intel_gpu.broadcast %[[ED1]] : tensor<16x1xi32, #warp> -> tensor<16x16xi32>
     %4 = triton_intel_gpu.broadcast %2 : tensor<16x1xi32, #warp> -> tensor<16x64xi32, #warp>
 
-    // CHECK: %[[EX0:.*]] = triton_intel_gpu.extract %[[ED2]][0] : tensor<1x64xi32, #warp> -> tensor<1x16xi32>
-    // CHECK: %[[BC20:.*]] = triton_intel_gpu.broadcast %[[EX0]] : tensor<1x16xi32> -> tensor<16x16xi32>
-    // CHECK: %[[EX1:.*]] = triton_intel_gpu.extract %[[ED2]][1] : tensor<1x64xi32, #warp> -> tensor<1x16xi32>
-    // CHECK: %[[BC21:.*]] = triton_intel_gpu.broadcast %[[EX1]] : tensor<1x16xi32> -> tensor<16x16xi32>
-    // CHECK: %[[EX2:.*]] = triton_intel_gpu.extract %[[ED2]][2] : tensor<1x64xi32, #warp> -> tensor<1x16xi32>
-    // CHECK: %[[BC22:.*]] = triton_intel_gpu.broadcast %[[EX2]] : tensor<1x16xi32> -> tensor<16x16xi32>
-    // CHECK: %[[EX3:.*]] = triton_intel_gpu.extract %[[ED2]][3] : tensor<1x64xi32, #warp> -> tensor<1x16xi32>
-    // CHECK: %[[BC23:.*]] = triton_intel_gpu.broadcast %[[EX3]] : tensor<1x16xi32> -> tensor<16x16xi32>
+    // CHECK: %[[EX0:.*]] = triton_intel_gpu.extract %[[ED2]][0] : tensor<1x64xi32, #warp> -> tensor<1x16xi32, #warp>
+    // CHECK: %[[BC20:.*]] = triton_intel_gpu.broadcast %[[EX0]] : tensor<1x16xi32, #warp> -> tensor<16x16xi32>
+    // CHECK: %[[EX1:.*]] = triton_intel_gpu.extract %[[ED2]][1] : tensor<1x64xi32, #warp> -> tensor<1x16xi32, #warp>
+    // CHECK: %[[BC21:.*]] = triton_intel_gpu.broadcast %[[EX1]] : tensor<1x16xi32, #warp> -> tensor<16x16xi32>
+    // CHECK: %[[EX2:.*]] = triton_intel_gpu.extract %[[ED2]][2] : tensor<1x64xi32, #warp> -> tensor<1x16xi32, #warp>
+    // CHECK: %[[BC22:.*]] = triton_intel_gpu.broadcast %[[EX2]] : tensor<1x16xi32, #warp> -> tensor<16x16xi32>
+    // CHECK: %[[EX3:.*]] = triton_intel_gpu.extract %[[ED2]][3] : tensor<1x64xi32, #warp> -> tensor<1x16xi32, #warp>
+    // CHECK: %[[BC23:.*]] = triton_intel_gpu.broadcast %[[EX3]] : tensor<1x16xi32, #warp> -> tensor<16x16xi32>
     %5 = triton_intel_gpu.broadcast %3 : tensor<1x64xi32, #warp> -> tensor<16x64xi32, #warp>
 
     // CHECK: arith.addi %[[BC1]], %[[BC20]] : tensor<16x16xi32>
@@ -563,3 +563,70 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 :
     tt.return
   }
 }
+
+// -----
+
+// COM: This test checks that the tt.load/tt.advance ops in _both_ loops are detected as being transposed and hence having the 16x16 shape (would be 32x16 otherwise).
+
+#warp = #triton_intel_gpu.warp<{sizePerThread = [16, 64], threadsPerWarp = [1, 1], order = [1, 0]}>
+#warp1 = #triton_intel_gpu.warp<{sizePerThread = [16, 32], threadsPerWarp = [1, 1], order = [1, 0]}>
+module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 1 : i32, triton_intel_gpu.min_sg_size = 16 : i32, triton_intel_gpu.support_bf16_conversion, triton_intel_gpu.support_dpas, triton_intel_gpu.support_sg_2d_block} {
+  tt.func public @_attn_fwd(%arg0: !tt.ptr<f16>, %arg1: !tt.ptr<f16>, %arg2: !tt.ptr<f16>, %arg3: f32, %arg4: !tt.ptr<f32>, %arg5: !tt.ptr<f32>) attributes {noinline = false} {
+    %c16_i32 = arith.constant 16 : i32
+    %c131072_i64 = arith.constant 131072 : i64
+    %c65536_i64 = arith.constant 65536 : i64
+    %c128_i32 = arith.constant 128 : i32
+    %c1024_i64 = arith.constant 1024 : i64
+    %c64_i64 = arith.constant 64 : i64
+    %c1_i64 = arith.constant 1 : i64
+    %c0_i32 = arith.constant 0 : i32
+    %cst = arith.constant 1.44269502 : f32
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<16x64xf32, #warp>
+    %c64_i32 = arith.constant 64 : i32
+    %c1_i32 = arith.constant 1 : i32
+    %0 = gpu.subgroup_id : index
+    %1 = arith.index_cast %0 : index to i32
+    %2 = tt.get_program_id z : i32
+    %3 = tt.get_program_id x : i32
+    %4 = tt.get_program_id y : i32
+    %5 = arith.extsi %3 : i32 to i64
+    %6 = arith.muli %5, %c131072_i64 : i64
+    %7 = arith.extsi %4 : i32 to i64
+    %8 = arith.muli %7, %c65536_i64 : i64
+    %9 = arith.addi %6, %8 : i64
+    %10 = tt.addptr %arg0, %9 : !tt.ptr<f16>, i64
+    %11 = arith.muli %2, %c128_i32 : i32
+    %12 = arith.muli %1, %c16_i32 : i32
+    %13 = arith.addi %12, %11 : i32
+    %14 = tt.make_tensor_ptr %10, [%c1024_i64, %c64_i64], [%c64_i64, %c1_i64], [%13, %c0_i32] {order = array<i32: 1, 0>} : <tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #warp}>>>
+    %28 = tt.addptr %arg1, %9 : !tt.ptr<f16>, i64
+    %34 = tt.make_tensor_ptr %28, [%c64_i64, %c1024_i64], [%c1_i64, %c64_i64], [%c0_i32, %c0_i32] {order = array<i32: 0, 1>} : <tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #warp}>>>
+    %35 = tt.addptr %arg5, %9 : !tt.ptr<f32>, i64
+    %36 = tt.make_tensor_ptr %35, [%c1024_i64, %c64_i64], [%c64_i64, %c1_i64], [%13, %c0_i32] {order = array<i32: 1, 0>} : <tensor<16x64xf32, #warp>>
+    %44 = tt.load %14 : !tt.ptr<tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #warp}>>>
+    %47:2 = scf.for %arg6 = %c0_i32 to %11 step %c64_i32 iter_args(%arg7 = %cst_0, %arg11 = %34) -> (tensor<16x64xf32, #warp>, !tt.ptr<tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #warp}>>>) : i32 {
+      // CHECK-COUNT-16: tt.load {{%.*}} {DotIdx = 1 : i32} : !tt.ptr<tensor<16x16xf16>>
+      %60 = tt.load %arg11 : !tt.ptr<tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #warp}>>>
+      %61 = tt.dot %44, %60, %cst_0, inputPrecision = tf32 : tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #warp}>> * tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #warp}>> -> tensor<16x64xf32, #warp>
+      // CHECK-COUNT-16: tt.advance {{%.*}}, [%c0_i32, %c64_i32] {DotIdx = 1 : i32} : <tensor<16x16xf16>>
+      %85 = tt.advance %arg11, [%c0_i32, %c64_i32] : <tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #warp}>>>
+      scf.yield %61, %85 : tensor<16x64xf32, #warp>, !tt.ptr<tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #warp}>>>
+    } {triton_gpu.workload = 4 : i32, tt.divisibility_arg1 = dense<64> : tensor<1xi32>}
+    // CHECK: gpu.barrier
+    gpu.barrier
+    %48 = arith.muli %2, %c128_i32 {tt.divisibility = dense<128> : tensor<1xi32>} : i32
+    %49 = arith.addi %2, %c1_i32 : i32
+    %50 = arith.muli %49, %c128_i32 : i32
+    %51 = tt.advance %34, [%c0_i32, %48] : <tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #warp}>>>
+    %56:2 = scf.for %arg6 = %48 to %50 step %c64_i32 iter_args(%arg7 = %47#0, %arg11 = %51) -> (tensor<16x64xf32, #warp>, !tt.ptr<tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #warp}>>>) : i32 {
+      // CHECK-COUNT-16: tt.load {{%.*}} {DotIdx = 1 : i32} : !tt.ptr<tensor<16x16xf16>>
+      %60 = tt.load %arg11 : !tt.ptr<tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #warp}>>>
+      %61 = tt.dot %44, %60, %cst_0, inputPrecision = tf32 : tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #warp}>> * tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #warp}>> -> tensor<16x64xf32, #warp>
+      // CHECK-COUNT-16: tt.advance {{%.*}}, [%c0_i32, %c64_i32] {DotIdx = 1 : i32} : <tensor<16x16xf16>>
+      %88 = tt.advance %arg11, [%c0_i32, %c64_i32] : <tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #warp}>>>
+      scf.yield %61, %88 : tensor<16x64xf32, #warp>, !tt.ptr<tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #warp}>>>
+    } {triton_gpu.workload = 4 : i32}
+    tt.store %36, %56#0 : !tt.ptr<tensor<16x64xf32, #warp>>
+    tt.return
+  }
+}

third_party/intel/lib/TritonIntelGPUTransforms/MatchTargetSize.cpp

Lines changed: 48 additions & 32 deletions
@@ -149,7 +149,6 @@ static tt::LoadOp findUsedLoad(Value val) {
 }
 
 static bool getTransposeFlagFromValue(Value val) {
-  bool isTransposed = false;
   Value loadPtr = val;
   // backward: from dot operands to tt.load
   if (llvm::any_of(val.getUsers(),
@@ -167,23 +166,26 @@ static bool getTransposeFlagFromValue(Value val) {
   if (auto blockArg = dyn_cast<BlockArgument>(loadPtr)) {
     unsigned argIdx = blockArg.getArgNumber();
     if (auto loopLikeOp = dyn_cast<LoopLikeOpInterface>(
-            blockArg.getParentBlock()->getParentOp())) {
-      auto inits = llvm::to_vector(loopLikeOp.getInits());
-      if (auto glueOp = inits[argIdx - 1].getDefiningOp<ttgi::GlueOp>()) {
-        if (auto tempPtr =
-                glueOp.getOperands()[0].getDefiningOp<tt::MakeTensorPtrOp>()) {
-          loadPtr = tempPtr.getResult();
-        }
-      }
-    }
+            blockArg.getParentBlock()->getParentOp()))
+      loadPtr = loopLikeOp.getInits()[argIdx - 1];
+  }
+
+  if (auto glueOp = loadPtr.getDefiningOp<ttgi::GlueOp>()) {
+    if (isa_and_present<tt::MakeTensorPtrOp, tt::AdvanceOp>(
+            glueOp.getOperands()[0].getDefiningOp()))
+      loadPtr = glueOp.getOperands()[0];
   }
 
   if (auto tensorPtr = loadPtr.getDefiningOp<tt::MakeTensorPtrOp>()) {
     ArrayRef<int32_t> order = tensorPtr.getOrder();
     auto rank = order.size();
-    isTransposed = (order[rank - 2] != 1);
+    return (order[rank - 2] != 1);
   }
-  return isTransposed;
+
+  if (auto advOp = loadPtr.getDefiningOp<tt::AdvanceOp>())
+    return getTransposeFlagFromValue(advOp.getPtr());
+
+  return false;
 }
 
 static void rewriteLoadWithSLM(ModuleOp &m, DenseSet<Value> &dotWithSLMOperands,
@@ -275,6 +277,14 @@ class MatchTargetSizePass
     MLIRContext *ctx = &getContext();
     ModuleOp m = getOperation();
 
+    // By default, tritongpu are lowered to simt mode (threads-per-warp=16)
+    // instead of simd mode (threads-per-warp=1).
+    // FIXME: force threads-per-warp=16 in simt(this should be done via an
+    // analysis designed to determine whether the kernel contains tt.dot
+    // operations that use block pointers).
+    m->setAttr("triton_gpu.threads-per-warp",
+               IntegerAttr::get(IntegerType::get(ctx, 32), 16));
+
     Workload workload = Workload::None;
     m.walk([&](scf::ForOp forOp) {
       if (Attribute attr = forOp->getAttr(AttrWorkloadName))
@@ -352,14 +362,6 @@ class MatchTargetSizePass
     canonicalize();
     LLVM_DEBUG(llvm::dbgs() << "Module after canonicalization:\n"
                             << m << "\n\n");
-
-    // By default, tritongpu are lowered to simt mode (threads-per-warp=16)
-    // instead of simd mode (threads-per-warp=1).
-    // FIXME: force threads-per-warp=16 in simt(this should be done via an
-    // analysis designed to determine whether the kernel contains tt.dot
-    // operations that use block pointers).
-    m->setAttr("triton_gpu.threads-per-warp",
-               IntegerAttr::get(IntegerType::get(ctx, 32), 16));
   }
 
 private:
@@ -379,8 +381,8 @@ class MatchTargetSizePass
                                     bool isTransposed) const;
 
   std::tuple<SmallVector<int64_t>, Type, SmallVector<int64_t>>
-  getSubTypeAndShape(Type type, bool isTransposed = false,
-                     bool useSLM = false) const;
+  getSubTypeAndShape(Type type, bool isTransposed = false, bool useSLM = false,
+                     bool keepEncoding = false) const;
 
   Value getSubVal(Operation *op, Value val, ArrayRef<int64_t> srcOffset,
                   ArrayRef<int64_t> dstSize);
@@ -753,7 +755,7 @@ MatchTargetSizePass::getSubOpSize(RankedTensorType type,
 /// return [shape, subType, subSize] for a tensor (or pointer to tensor)
 std::tuple<SmallVector<int64_t>, Type, SmallVector<int64_t>>
 MatchTargetSizePass::getSubTypeAndShape(Type type, bool isTransposed,
-                                        bool useSLM) const {
+                                        bool useSLM, bool keepEncoding) const {
   if (auto tensorType = dyn_cast<RankedTensorType>(type)) {
     Attribute layout = tensorType.getEncoding();
     assert(layout && "Expecting a valid layout");
@@ -771,15 +773,16 @@ MatchTargetSizePass::getSubTypeAndShape(Type type, bool isTransposed,
       subSize[1] = std::min(subSize[1], shape[1]);
     }
 
-    auto subType = RankedTensorType::get(
-        subSize, tensorType.getElementType() /*no encoding*/);
+    auto subType = RankedTensorType::get(subSize, tensorType.getElementType(),
+                                         keepEncoding ? tensorType.getEncoding()
+                                                      : Attribute{});
     return {shape, subType, subSize};
   }
 
   if (auto ptrType = dyn_cast<tt::PointerType>(type)) {
     Type pointeeType = ptrType.getPointeeType();
     auto [shape, subType, subSize] =
-        getSubTypeAndShape(pointeeType, isTransposed, useSLM);
+        getSubTypeAndShape(pointeeType, isTransposed, useSLM, keepEncoding);
     auto newType = tt::PointerType::get(subType, ptrType.getAddressSpace());
     return {shape, newType, subSize};
   }
@@ -1186,8 +1189,11 @@ void MatchTargetSizePass::transformBroadcastOp(ttgi::BroadcastOp op) {
     glue = b.create<ttgi::GlueOp>(loc, resType, ops);
   } else if (srcDim0 == 1 && srcDim1 == resDim1) {
     // Handle row-vector broadcasts, e.g. 1x64 --> 16x64.
+    // This kind of broadcast requires that the tensor type is kept intact by
+    // SIMT lowering, hence propagate the encoding here.
     auto subRowVecTy =
-        RankedTensorType::get({1, tType.getShape()[1]}, tType.getElementType());
+        RankedTensorType::get({1, tType.getShape()[1]}, tType.getElementType(),
+                              srcType.getEncoding());
 
     // How many extracts do we need to cover the width of the input tensor?
     unsigned nExtracts = srcDim1 / dstDim1;
@@ -1222,9 +1228,10 @@ void MatchTargetSizePass::transformMakeRangeOp(tt::MakeRangeOp op) {
 
   unsigned start = op.getStart();
   unsigned end = op.getEnd();
-  assert(start == 0 && end % subgroupSize == 0 && "Unsupported range");
+  assert(start == 0 && (end <= subgroupSize || end % subgroupSize == 0) &&
+         "Unsupported range");
 
-  if (end == subgroupSize)
+  if (end <= subgroupSize)
     // nothing to do
     return;
 
@@ -1240,6 +1247,7 @@ void MatchTargetSizePass::transformMakeRangeOp(tt::MakeRangeOp op) {
   Location loc = op.getLoc();
   RankedTensorType origTy = op.getType();
   Type elemTy = origTy.getElementType();
+  // Propagate encoding to keep tensor during SIMT lowering.
   auto subRangeTy =
       RankedTensorType::get({subgroupSize}, elemTy, origTy.getEncoding());
   auto subRange = b.create<tt::MakeRangeOp>(loc, subRangeTy, 0, subgroupSize);
@@ -1310,8 +1318,16 @@ void MatchTargetSizePass::transformGenericOp(Operation *op) {
         cast<tt::PointerType>(load.getPtr().getType()).getAddressSpace();
     useSLM = (ptrAS == TritonGEN::TritonGENMemorySpace::kWorkgroup);
   }
+
+  // Keep encoding on certain tensors to leave them untouched during SIMT
+  // lowering. Currently, this is required for "row vectors" (= `tensor<1xN>`).
+  bool keepEncoding = false;
+  if (auto tensorType = dyn_cast<RankedTensorType>(type)) {
+    ArrayRef<int64_t> shape = tensorType.getShape();
+    keepEncoding = shape.size() == 2 && shape[0] == 1 && shape[1] > 1;
+  }
   auto [shape, subType, subSize] =
-      getSubTypeAndShape(type, isTransposed, useSLM);
+      getSubTypeAndShape(type, isTransposed, useSLM, keepEncoding);
 
   unsigned dim = shape.size();
   OpBuilder b(op);
@@ -1328,8 +1344,8 @@ void MatchTargetSizePass::transformGenericOp(Operation *op) {
             [&](Value operand) {
               Type type = operand.getType();
               if (isa<tt::PointerType, RankedTensorType>(type)) {
-                Type subOpndType = std::get<1>(
-                    getSubTypeAndShape(type, isTransposed, useSLM));
+                Type subOpndType = std::get<1>(getSubTypeAndShape(
+                    type, isTransposed, useSLM, keepEncoding));
                 Value newOp = b.create<ttgi::ExtractOp>(
                     loc, subOpndType, operand, idx);
                 return newOp;
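Taken together, the MatchTargetSize.cpp hunks above implement the first bullet of the commit message: a "row vector" (`tensor<1xN>`) keeps its encoding so the SIMT lowering leaves it untouched. Below is a minimal sketch of that decision, assuming the usual MLIR includes and using hypothetical helper names; the real logic lives inline in `transformGenericOp` and `getSubTypeAndShape`.

```cpp
// Sketch only; helper names are illustrative, not the actual pass API.
#include "mlir/IR/BuiltinTypes.h"

using namespace mlir;

// A "row vector" is a rank-2 tensor with a unit leading dimension,
// e.g. tensor<1x64xi32, #warp>. Such tensors must keep their encoding.
static bool shouldKeepEncoding(Type type) {
  if (auto tensorType = dyn_cast<RankedTensorType>(type)) {
    ArrayRef<int64_t> shape = tensorType.getShape();
    return shape.size() == 2 && shape[0] == 1 && shape[1] > 1;
  }
  return false;
}

// When the encoding must survive, propagate it onto the sub-tile type;
// otherwise it is dropped, as before this change.
static RankedTensorType makeSubTileType(RankedTensorType tensorType,
                                        ArrayRef<int64_t> subSize,
                                        bool keepEncoding) {
  return RankedTensorType::get(subSize, tensorType.getElementType(),
                               keepEncoding ? tensorType.getEncoding()
                                            : Attribute{});
}
```

This is also why the CHECK lines in the first match-target-size.mlir hunk now expect the `#warp` encoding to survive on the extracted `tensor<1x16xi32>` values.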
