
Commit b6cd04c

[XPU][TritonGPUToLLVM] Avoid bank conflicts in sub-group transposes
- Store the whole matrix using SIMD block stores for each row, leaving a single garbage item at the end of the row so that each row has `sub_group_size + 1` elements.
- Load each row with vector loads.

By introducing this garbage item at the end of each row, we ensure matrix loading avoids bank conflicts, as the offset between the positions loaded by work-items `i` and `i+j` is `N * (sub_group_size + 1)` (assuming `sub_group_size` banks).

Signed-off-by: victor-eds <[email protected]>
1 parent 7551a90 commit b6cd04c
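To see why the extra element removes the conflicts, here is a minimal standalone sketch (plain C++, written only for this note and not part of the actual lowering): assuming `sub_group_size = 16` and one shared-local-memory bank per 32-bit word, it compares how many distinct banks the 16 row loads touch with an unpadded row stride of 16 versus the padded stride of 17.

// Standalone illustration (not the lowering code itself): model shared local
// memory as 16 banks of 32-bit words and count the banks hit by the start of
// each work-item's row load, with and without the one-element padding.
#include <cstdio>
#include <set>

int main() {
  const int sub_group_size = 16;  // threads-per-warp in the tests below
  const int banks = 16;           // assumption: one bank per 32-bit word

  for (int row_stride : {sub_group_size, sub_group_size + 1}) {
    std::set<int> banks_hit;
    for (int work_item = 0; work_item < sub_group_size; ++work_item) {
      // Work-item i loads row i as one vector load starting at element
      // i * row_stride (element size assumed to be one 32-bit word).
      int start_element = work_item * row_stride;
      banks_hit.insert(start_element % banks);
    }
    std::printf("row stride %2d -> %zu distinct banks hit by the %d row loads\n",
                row_stride, banks_hit.size(), sub_group_size);
  }
  // Expected output:
  //   row stride 16 ->  1 distinct banks ... (all loads conflict on bank 0)
  //   row stride 17 -> 16 distinct banks ... (conflict-free)
  return 0;
}

With the padded stride, work-item `i` starts its row load at element `17 * i`, and `17 * i mod 16 = i mod 16`, so every work-item in the sub-group hits a different bank.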

File tree

4 files changed: +440 −242 lines changed


test/Conversion/intel/intel-allocate-shared-memory.mlir

Lines changed: 3 additions & 3 deletions

@@ -24,7 +24,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 :
 // Check scracth memory configuration for different sub-group transpose-like layout conversions.
 
 // CHECK-LABEL: module attributes
-// CHECK-SAME: triton_gpu.shared = 512 : i32
+// CHECK-SAME: triton_gpu.shared = 544 : i32
 module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
   tt.func @test_f16(%arg0: tensor<16x16xf16, #blocked>) -> tensor<16x16xf16, #blocked1> {
     %0 = triton_gpu.convert_layout %arg0 : tensor<16x16xf16, #blocked> -> tensor<16x16xf16, #blocked1>
@@ -40,7 +40,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 :
 // Check scracth memory configuration for different sub-group transpose-like layout conversions.
 
 // CHECK-LABEL: module attributes
-// CHECK-SAME: triton_gpu.shared = 1024 : i32
+// CHECK-SAME: triton_gpu.shared = 1088 : i32
 module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
   tt.func @test_f32(%arg0: tensor<16x16xf32, #blocked>) -> tensor<16x16xf32, #blocked1> {
     %0 = triton_gpu.convert_layout %arg0 : tensor<16x16xf32, #blocked> -> tensor<16x16xf32, #blocked1>
@@ -56,7 +56,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 :
 // Check scracth memory configuration for different sub-group transpose-like layout conversions.
 
 // CHECK-LABEL: module attributes
-// CHECK-SAME: triton_gpu.shared = 32768 : i32
+// CHECK-SAME: triton_gpu.shared = 34816 : i32
 module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
   tt.func @test_f32(%arg0: tensor<128x64xf32, #blocked>) -> tensor<128x64xf32, #blocked1> {
     %0 = triton_gpu.convert_layout %arg0 : tensor<128x64xf32, #blocked> -> tensor<128x64xf32, #blocked1>
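The updated `triton_gpu.shared` values follow directly from the padding: each 16-element row now occupies 17 slots in scratch memory, so the allocations grow by a factor of 17/16. The following quick check matches the three tests above; `paddedScratchBytes` is a hypothetical helper written only for this note, assuming each warp writes its elements in rows of `sub_group_size` with one padding element appended per row.

// Quick check of the padded scratch-memory sizes against the CHECK-SAME
// values above (compile with -std=c++14 or later).
constexpr int paddedScratchBytes(int rows, int cols, int elemBytes,
                                 int numWarps, int subGroupSize = 16) {
  int elemsPerWarp = rows * cols / numWarps;
  int rowsPerWarp = elemsPerWarp / subGroupSize;
  return numWarps * rowsPerWarp * (subGroupSize + 1) * elemBytes;
}

static_assert(paddedScratchBytes(16, 16, /*f16=*/2, /*warps=*/1) == 544,
              "16x16 f16, 1 warp");
static_assert(paddedScratchBytes(16, 16, /*f32=*/4, /*warps=*/1) == 1088,
              "16x16 f32, 1 warp");
static_assert(paddedScratchBytes(128, 64, /*f32=*/4, /*warps=*/8) == 34816,
              "128x64 f32, 8 warps");

int main() { return 0; }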
