Skip to content

Commit f79d2a2

Browse files
committed
add some tests
1 parent cf23eaf commit f79d2a2

File tree

2 files changed

+40
-25
lines changed

2 files changed

+40
-25
lines changed

mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeTranspose.cpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,21 @@ namespace xegpu {
3535

3636
using namespace mlir;
3737

38+
/// Inclusive bounds on the 2-D block sizes that can be loaded with transpose
/// semantics. Width/height are in number of elements (presumed — confirm
/// against the target uArch documentation once block ranges come from uArch).
struct TransposableBlockRange {
  int minWidth, maxWidth, minHeight, maxHeight;
};
41+
42+
// TODO: Use uArch to get supported block ranges.
43+
static TransposableBlockRange getBlockRange(int bitWidth) {
44+
switch (bitWidth) {
45+
case 32:
46+
return {/**min width**/ 1, /**max width**/ 8, /**min height**/ 1,
47+
/**max height**/ 32};
48+
default:
49+
llvm_unreachable("Add support for other element bitwidths");
50+
}
51+
}
52+
3853
namespace {
3954

4055
static std::optional<SmallVector<int64_t>>

mlir/test/Dialect/XeGPU/optimize-transpose.mlir

Lines changed: 25 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,31 @@ func.func @gemm_b_transpose(%arg0: memref<256x256xf16>, %arg1: memref<256x256xf1
5151
return
5252
}
5353

54+
// -----
// Regression test: a vector.transpose feeding xegpu.dpas inside a nested
// loop structure — an outer scf.for over rows and an inner scf.for that
// carries the f32 accumulator through iter_args. The B operand (#b layout)
// is loaded, transposed to #bt, then consumed by the dpas.
55+
#a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
56+
#b = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
57+
#bt = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>
58+
func.func @nested_scf(%arg0: memref<256x256xf16>, %arg1: memref<256x256xf16>, %arg2: memref<256x256xf32>) {
59+
%c0 = arith.constant 0 : index
60+
%c16 = arith.constant 16 : index
61+
%c256 = arith.constant 256 : index
62+
scf.for %arg8 = %c0 to %c256 step %c16 {
63+
%0 = xegpu.create_nd_tdesc %arg2 : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32, #a>
64+
%1 = xegpu.load_nd %0[%arg8, %c0] { layout_result_0 = #a } : !xegpu.tensor_desc<8x16xf32, #a> -> vector<8x16xf32>
65+
%2 = xegpu.create_nd_tdesc %arg0 : memref<256x256xf16> -> !xegpu.tensor_desc<8x16xf16, #a>
66+
%3 = xegpu.create_nd_tdesc %arg1 : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #b>
67+
%4 = scf.for %arg3 = %c0 to %c256 step %c16 iter_args(%arg4 = %1) -> (vector<8x16xf32>) {
68+
%5 = xegpu.load_nd %2[%arg8, %arg3] { layout_result_0 = #a } : !xegpu.tensor_desc<8x16xf16, #a> -> vector<8x16xf16>
69+
%6 = xegpu.load_nd %3[%arg8, %arg3] { layout_result_0 = #b } : !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16>
70+
%7 = vector.transpose %6, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16>
71+
%8 = xegpu.dpas %5, %7, %arg4 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
72+
scf.yield %8 : vector<8x16xf32>
73+
} {layout_result_0 = #a}
74+
xegpu.store_nd %4, %0[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a>
75+
}
76+
return
77+
}
78+
78+
5479
// -----
5580
#a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
5681
#b = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
@@ -90,28 +115,3 @@ func.func @large_loads(%arg0: vector<8x16xf16>, %arg1: memref<256x256xf16>, %arg
90115
xegpu.store_nd %4#3, %0[%c16, %c16] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a>
91116
return
92117
}
93-
94-
// -----
// NOTE(review): this hunk is the deleted copy of the @nested_scf test (the
// commit relocates it earlier in the file); content matches the added hunk.
95-
#a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
96-
#b = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
97-
#bt = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>
98-
func.func @nested_scf(%arg0: memref<256x256xf16>, %arg1: memref<256x256xf16>, %arg2: memref<256x256xf32>) {
99-
%c0 = arith.constant 0 : index
100-
%c16 = arith.constant 16 : index
101-
%c256 = arith.constant 256 : index
102-
scf.for %arg8 = %c0 to %c256 step %c16 {
103-
%0 = xegpu.create_nd_tdesc %arg2 : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32, #a>
104-
%1 = xegpu.load_nd %0[%arg8, %c0] { layout_result_0 = #a } : !xegpu.tensor_desc<8x16xf32, #a> -> vector<8x16xf32>
105-
%2 = xegpu.create_nd_tdesc %arg0 : memref<256x256xf16> -> !xegpu.tensor_desc<8x16xf16, #a>
106-
%3 = xegpu.create_nd_tdesc %arg1 : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #b>
107-
%4 = scf.for %arg3 = %c0 to %c256 step %c16 iter_args(%arg4 = %1) -> (vector<8x16xf32>) {
108-
%5 = xegpu.load_nd %2[%arg8, %arg3] { layout_result_0 = #a } : !xegpu.tensor_desc<8x16xf16, #a> -> vector<8x16xf16>
109-
%6 = xegpu.load_nd %3[%arg8, %arg3] { layout_result_0 = #b } : !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16>
110-
%7 = vector.transpose %6, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16>
111-
%8 = xegpu.dpas %5, %7, %arg4 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
112-
scf.yield %8 : vector<8x16xf32>
113-
} {layout_result_0 = #a}
114-
xegpu.store_nd %4, %0[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a>
115-
}
116-
return
117-
}

0 commit comments

Comments
 (0)