Skip to content

Commit 642062d

Browse files
committed
[XPU][TritonIntelGPUToLLVM] Add support for more transpose kinds
Add support for layout conversion transposes in which rows managed by a single thread are contiguous in the output matrix. Signed-off-by: victor-eds <[email protected]>
1 parent f4f3589 commit 642062d

File tree

4 files changed

+204
-32
lines changed

4 files changed

+204
-32
lines changed

test/Conversion/intel/intel-allocate-shared-memory.mlir

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,3 +63,19 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 :
6363
tt.return %0 : tensor<128x64xf32, #blocked1>
6464
}
6565
}
66+
67+
// -----
68+
69+
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 32], threadsPerWarp = [16, 1], warpsPerCTA = [2, 4], order = [0, 1]}>
70+
#blocked1 = #triton_gpu.blocked<{sizePerThread = [16, 2], threadsPerWarp = [1, 16], warpsPerCTA = [2, 4], order = [0, 1]}>
71+
72+
// Check scratch memory configuration for different sub-group transpose-like layout conversions.
73+
74+
// CHECK-LABEL: module attributes
75+
// CHECK-SAME: triton_gpu.shared = 17408 : i32
76+
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
77+
tt.func @test_contiguous(%arg0: tensor<32x128xf32, #blocked>) -> tensor<32x128xf32, #blocked1> {
78+
%0 = triton_gpu.convert_layout %arg0 : tensor<32x128xf32, #blocked> -> tensor<32x128xf32, #blocked1>
79+
tt.return %0 : tensor<32x128xf32, #blocked1>
80+
}
81+
}

0 commit comments

Comments
 (0)