
Commit 16f5738

[XPU][Alloc] Optimize SLM allocation size for sub-group layout conversions (#2638)
Optimize shared memory allocation sizes for sub-group shuffle and transpose-like conversions:

- Sub-group shuffle: do not allocate any shared memory.
- Sub-group transpose: allocate exactly as much memory as is needed to store the whole tensor in SLM.

Signed-off-by: victor-eds <[email protected]>
1 parent 98dca47 commit 16f5738
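
For reference, a back-of-the-envelope check of what the transpose rule implies for the sizes asserted in the new lit test below (a sketch, assuming the scratch buffer holds exactly one copy of the converted tensor, i.e. element count times element size):

// Sanity-check sketch (not part of the commit): whole-tensor SLM sizes
// expected by the new allocation test cases.
static_assert(16 * 16 * 2 == 512, "16x16 f16  -> triton_gpu.shared = 512");
static_assert(16 * 16 * 4 == 1024, "16x16 f32  -> triton_gpu.shared = 1024");
static_assert(128 * 64 * 4 == 32768, "128x64 f32 -> triton_gpu.shared = 32768");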

File tree

3 files changed (+93, -6 lines changed)
Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
// RUN: triton-opt %s -split-input-file --intel-allocate-shared-memory | FileCheck %s

#blocked = #triton_gpu.blocked<{sizePerThread = [1, 16], threadsPerWarp = [16, 1], warpsPerCTA = [1, 1], order = [0, 1]}>
#blocked1 = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}>

// Check no scratch memory is allocated for sub-group shuffle-like layout conversions.

// CHECK-LABEL: module attributes
// CHECK-SAME: triton_gpu.shared = 0 : i32
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
  // CHECK: tt.func @test_sub_group_shuffle
  // CHECK-NOT: llvm.ptr<3>
  tt.func @test_sub_group_shuffle(%arg0: tensor<16xf16, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16xf16, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> {
    %0 = triton_gpu.convert_layout %arg0 : tensor<16xf16, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<16xf16, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
    tt.return %0 : tensor<16xf16, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
  }
}

// -----

#blocked = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}>
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 16], threadsPerWarp = [16, 1], warpsPerCTA = [1, 1], order = [0, 1]}>

// Check scratch memory configuration for different sub-group transpose-like layout conversions.

// CHECK-LABEL: module attributes
// CHECK-SAME: triton_gpu.shared = 512 : i32
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
  tt.func @test_f16(%arg0: tensor<16x16xf16, #blocked>) -> tensor<16x16xf16, #blocked1> {
    %0 = triton_gpu.convert_layout %arg0 : tensor<16x16xf16, #blocked> -> tensor<16x16xf16, #blocked1>
    tt.return %0 : tensor<16x16xf16, #blocked1>
  }
}

// -----

#blocked = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}>
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 16], threadsPerWarp = [16, 1], warpsPerCTA = [1, 1], order = [0, 1]}>

// Check scratch memory configuration for different sub-group transpose-like layout conversions.

// CHECK-LABEL: module attributes
// CHECK-SAME: triton_gpu.shared = 1024 : i32
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
  tt.func @test_f32(%arg0: tensor<16x16xf32, #blocked>) -> tensor<16x16xf32, #blocked1> {
    %0 = triton_gpu.convert_layout %arg0 : tensor<16x16xf32, #blocked> -> tensor<16x16xf32, #blocked1>
    tt.return %0 : tensor<16x16xf32, #blocked1>
  }
}

// -----

#blocked = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [4, 2], order = [0, 1]}>
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 16], threadsPerWarp = [16, 1], warpsPerCTA = [4, 2], order = [0, 1]}>

// Check scratch memory configuration for different sub-group transpose-like layout conversions.

// CHECK-LABEL: module attributes
// CHECK-SAME: triton_gpu.shared = 32768 : i32
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
  tt.func @test_f32(%arg0: tensor<128x64xf32, #blocked>) -> tensor<128x64xf32, #blocked1> {
    %0 = triton_gpu.convert_layout %arg0 : tensor<128x64xf32, #blocked> -> tensor<128x64xf32, #blocked1>
    tt.return %0 : tensor<128x64xf32, #blocked1>
  }
}
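
For intuition on why the transpose-like cases above need scratch space for the whole tensor while the shuffle-like case needs none: in a transpose each lane must end up with values that currently sit in other lanes' registers, so the data round-trips through a shared tile written in one layout and read back in the other. The following host-side C++ sketch is purely illustrative (it is not the code the backend generates; the 16x16 tile size and the names are assumptions taken from the tests above):

#include <array>
#include <cstddef>

// Illustrative sketch: lane `l` starts with a contiguous column of 16 values
// (source layout) and must end up with a row of 16 values (destination layout).
constexpr std::size_t kSubGroupSize = 16;
using LaneData = std::array<float, kSubGroupSize>;

void subGroupTransposeSketch(std::array<LaneData, kSubGroupSize> &lanes) {
  // Stands in for the SLM tile: one slot per tensor element, i.e. the whole
  // 16x16 tile must fit, which is why the allocation covers the full tensor.
  std::array<LaneData, kSubGroupSize> scratch{};
  for (std::size_t l = 0; l < kSubGroupSize; ++l) // store phase
    for (std::size_t i = 0; i < kSubGroupSize; ++i)
      scratch[l][i] = lanes[l][i];
  // (a barrier would separate the two phases on the device)
  for (std::size_t l = 0; l < kSubGroupSize; ++l) // gather phase
    for (std::size_t i = 0; i < kSubGroupSize; ++i)
      lanes[l][i] = scratch[i][l];
}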

test/Conversion/intel/sub-group-shuffle.mlir

Lines changed: 6 additions & 6 deletions
@@ -9,7 +9,7 @@

module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
// CHECK-LABEL: llvm.func spir_kernelcc @test_f16(
-// CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<(f16)>,
+// CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<(f16)>)
// CHECK: %[[VAL_2:.*]] = llvm.extractvalue %[[VAL_0]][0] : !llvm.struct<(f16)>
// CHECK: %[[VAL_4:.*]] = llvm.mlir.constant(0 : i32) : i32
// CHECK: llvm.call spir_funccc @_Z17sub_group_shuffleDhj(%[[VAL_2]], %[[VAL_4]])
@@ -49,7 +49,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 :
}

// CHECK-LABEL: llvm.func spir_kernelcc @test_bf16(
-// CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<(bf16)>,
+// CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<(bf16)>)
// CHECK: %[[VAL_1:.*]] = llvm.extractvalue %[[VAL_0]][0] : !llvm.struct<(bf16)>
// CHECK: %[[VAL_2:.*]] = llvm.bitcast %[[VAL_1]] : bf16 to i16
// CHECK: %[[VAL_4:.*]] = llvm.mlir.constant(0 : i32) : i32
@@ -91,7 +91,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 :
}

// CHECK-LABEL: llvm.func spir_kernelcc @test_i1(
-// CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<(i1)>,
+// CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<(i1)>)
// CHECK: %[[VAL_1:.*]] = llvm.extractvalue %[[VAL_0]][0] : !llvm.struct<(i1)>
// CHECK: %[[VAL_2:.*]] = llvm.zext %[[VAL_1]] : i1 to i8
// CHECK: %[[VAL_4:.*]] = llvm.mlir.constant(0 : i32) : i32
@@ -133,7 +133,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 :
}

// CHECK-LABEL: llvm.func spir_kernelcc @test_ptr(
-// CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<(ptr<1>)>,
+// CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<(ptr<1>)>)
// CHECK: %[[VAL_1:.*]] = llvm.extractvalue %[[VAL_0]][0] : !llvm.struct<(ptr<1>)>
// CHECK: %[[VAL_2:.*]] = llvm.ptrtoint %[[VAL_1]] : !llvm.ptr<1> to i64
// CHECK: %[[VAL_4:.*]] = llvm.mlir.constant(0 : i32) : i32
@@ -186,7 +186,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 :

module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
// CHECK-LABEL: llvm.func spir_kernelcc @test_f32(
-// CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<(f32)>,
+// CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<(f32)>)
// CHECK: %[[VAL_2:.*]] = llvm.extractvalue %[[VAL_0]][0] : !llvm.struct<(f32)>
// CHECK: %[[VAL_4:.*]] = llvm.mlir.constant(0 : i32) : i32
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflefj(%[[VAL_2]], %[[VAL_4]])
@@ -269,7 +269,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 :

module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
// CHECK-LABEL: llvm.func spir_kernelcc @test_non_sliced_multi_register(
-// CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<(f64, f64)>,
+// CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<(f64, f64)>)
// CHECK: %[[VAL_2:.*]] = llvm.extractvalue %[[VAL_0]][0] : !llvm.struct<(f64, f64)>
// CHECK: %[[VAL_3:.*]] = llvm.extractvalue %[[VAL_0]][1] : !llvm.struct<(f64, f64)>
// CHECK: %[[VAL_5:.*]] = llvm.mlir.constant(0 : i32) : i32

third_party/intel/lib/Analysis/Allocation.cpp

Lines changed: 22 additions & 0 deletions
@@ -15,6 +15,8 @@
#include "triton/Dialect/TritonGPU/IR/Dialect.h"
#include "llvm/ADT/SmallVector.h"

+#include "intel/include/Analysis/Utility.h"
+
using ::mlir::triton::gpu::AMDMfmaEncodingAttr;
using ::mlir::triton::gpu::BlockedEncodingAttr;
using ::mlir::triton::gpu::DotOperandEncodingAttr;
@@ -104,6 +106,26 @@ static SmallVector<unsigned> getRepShapeForAtomic(Value result) {

ScratchConfig getScratchConfigForCvt(RankedTensorType srcTy,
                                     RankedTensorType dstTy) {
+  if (gpu::intel::cvtIsSubGroupShuffle(srcTy, dstTy)) {
+    // Conversions that can be implemented as sub-group shuffles do not need
+    // scratch memory.
+    return ScratchConfig({}, {});
+  }
+
+  if (gpu::intel::cvtIsSubGroupTranspose(srcTy, dstTy)) {
+    // Conversions that can be implemented as sub-group transposes store the
+    // whole tensor in shared memory and read it afterwards.
+    auto srcEncoding = cast<gpu::DistributedEncodingTrait>(srcTy.getEncoding());
+    unsigned threadsPerWarp = product(srcEncoding.getThreadsPerWarp());
+    unsigned warpsPerCTA = product(srcEncoding.getWarpsPerCTA());
+    unsigned remaining = product(srcTy.getShape()) /
+                         (threadsPerWarp * threadsPerWarp * warpsPerCTA);
+    SmallVector<unsigned> repShape{threadsPerWarp, threadsPerWarp, remaining,
+                                   warpsPerCTA};
+    return ScratchConfig(repShape, repShape,
+                         /*inVec=*/1, /*outVec=*/threadsPerWarp);
+  }
+
  // Initialize vector sizes and stride
  auto repShape = getRepShapeForCvt(srcTy, dstTy);
  if (repShape.empty())
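
As a quick check that the repShape chosen in the transpose branch covers the whole tensor, here is a small standalone sketch (not Triton code; it only reproduces the arithmetic above for the 128x64xf32, 8-warp test case and assumes the resulting scratch size is product(repShape) times the element size):

#include <cassert>

int main() {
  // Values taken from the 128x64xf32 test case with 8 warps of 16 threads.
  unsigned threadsPerWarp = 16, warpsPerCTA = 8;
  unsigned numElems = 128u * 64u; // product(srcTy.getShape())
  unsigned remaining =
      numElems / (threadsPerWarp * threadsPerWarp * warpsPerCTA); // == 4
  unsigned repShape[4] = {threadsPerWarp, threadsPerWarp, remaining,
                          warpsPerCTA};
  unsigned repElems = repShape[0] * repShape[1] * repShape[2] * repShape[3];
  assert(repElems == numElems);                    // spans the whole tensor
  assert(repElems * 4 /*bytes per f32*/ == 32768); // triton_gpu.shared = 32768
  return 0;
}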

0 commit comments
