Skip to content

Commit 131c537

Browse files
victor-edsetiotto
and authored
[XPU][TritonIntelGPUToLLVM] Add support for more transpose kinds (#2786)
Add support for layout conversion transposes in which rows managed by a single thread are contiguous in the output matrix. --------- Signed-off-by: victor-eds <[email protected]> Co-authored-by: Ettore Tiotto <[email protected]>
1 parent 49b1c1a commit 131c537

File tree

4 files changed

+204
-32
lines changed

4 files changed

+204
-32
lines changed

test/Conversion/intel/intel-allocate-shared-memory.mlir

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,3 +81,19 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 :
8181
tt.return %0 : tensor<128x64xf32, #blocked1>
8282
}
8383
}
84+
85+
// -----
86+
87+
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 32], threadsPerWarp = [16, 1], warpsPerCTA = [2, 4], order = [0, 1]}>
88+
#blocked1 = #triton_gpu.blocked<{sizePerThread = [16, 2], threadsPerWarp = [1, 16], warpsPerCTA = [2, 4], order = [0, 1]}>
89+
90+
// Check scratch memory configuration for different sub-group transpose-like layout conversions.
91+
92+
// CHECK-LABEL: module attributes
93+
// CHECK-SAME: triton_gpu.shared = 17408 : i32
94+
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
95+
tt.func @test_contiguous(%arg0: tensor<32x128xf32, #blocked>) -> tensor<32x128xf32, #blocked1> {
96+
%0 = triton_gpu.convert_layout %arg0 : tensor<32x128xf32, #blocked> -> tensor<32x128xf32, #blocked1>
97+
tt.return %0 : tensor<32x128xf32, #blocked1>
98+
}
99+
}

0 commit comments

Comments
 (0)