
Commit 6c6d175

Still carrying two reverts from the previous integrate (iree-org#22366).

New revert:
- llvm/llvm-project#163994, which causes a lit test failure on windows_x64_msvc.

Fixes:
- Remove the UnsafeFPMath option from the LLVM target options (see iree-org#22374 (comment)).
- Update the mfma/wmma/scaled_mfma assembly format to carry the intrinsic shape (see llvm/llvm-project@f248010, llvm/llvm-project@dc5f274, llvm/llvm-project@bbe9209).
- Replace deprecated StringSwitch Cases overloads taking 3+ string arguments (see llvm/llvm-project@3526bb0).
- Replace deprecated OpBuilder::create calls with the ops' static create (see llvm/llvm-project@c0b42ec).

ci-extra: test_torch

---------

Signed-off-by: Muzammiluddin Syed <[email protected]>
Signed-off-by: Yu-Zhewen <[email protected]>
Co-authored-by: Muzammiluddin Syed <[email protected]>
1 parent 2294bb8 commit 6c6d175

14 files changed: +113 additions, −132 deletions


compiler/plugins/target/ROCM/ROCMTarget.cpp

Lines changed: 0 additions & 1 deletion
@@ -639,7 +639,6 @@ class ROCMTargetBackend final : public TargetBackend {
   }
   llvm::TargetOptions opt;
   opt.AllowFPOpFusion = llvm::FPOpFusion::Fast;
-  opt.UnsafeFPMath = false;
   opt.NoInfsFPMath = false;
   opt.NoNaNsFPMath = true;
   // Be extra cautious while this is less tested, and prevent unknown
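This hunk is the first fix from the commit message: upstream LLVM removed the UnsafeFPMath field from llvm::TargetOptions, so the assignment is simply dropped. A minimal sketch of the configuration that remains, where makeTargetOptions is a hypothetical stand-in for the surrounding backend code:

```cpp
#include "llvm/Target/TargetOptions.h"

// Hypothetical helper mirroring the options kept by the diff above;
// UnsafeFPMath no longer exists on llvm::TargetOptions, so only the
// per-category fast-math fields are set.
llvm::TargetOptions makeTargetOptions() {
  llvm::TargetOptions opt;
  opt.AllowFPOpFusion = llvm::FPOpFusion::Fast; // permit fused multiply-add
  opt.NoInfsFPMath = false; // infinities remain well-defined
  opt.NoNaNsFPMath = true;  // assume no NaNs, matching the existing setting
  return opt;
}
```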

compiler/src/iree/compiler/API/Internal/LLDToolEntryPoint.cpp

Lines changed: 2 additions & 2 deletions
@@ -59,8 +59,8 @@ LLD_HAS_DRIVER(wasm)

 static Flavor getFlavor(StringRef s) {
   return StringSwitch<Flavor>(s)
-      .CasesLower("ld", "ld.lld", "gnu", Gnu)
-      .CasesLower("wasm", "ld-wasm", Wasm)
+      .CasesLower({"ld", "ld.lld", "gnu"}, Gnu)
+      .CasesLower({"wasm", "ld-wasm"}, Wasm)
       .CaseLower("link", WinLink)
       .CasesLower(
           {"ld64", "ld64.lld", "darwin", "darwinnew", "ld64.lld.darwinnew"},

compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_contract_amdgpu.mlir

Lines changed: 14 additions & 22 deletions
@@ -77,8 +77,7 @@ builtin.module attributes { transform.with_named_sequence } {
 // CHECK: %[[A_CAST:.+]] = vector.shape_cast %[[A_VEC]] : vector<1x1x1x4xf16> to vector<4xf16>
 // CHECK: %[[B_CAST:.+]] = vector.shape_cast %[[B_VEC]] : vector<1x1x4x1xf16> to vector<4xf16>
 // CHECK: %[[C_CAST:.+]] = vector.shape_cast %[[C_VEC]] : vector<4x1x4x1xf32> to vector<16xf32>
-// CHECK: %[[MFMA:.+]] = amdgpu.mfma %[[A_CAST]] * %[[B_CAST]] + %[[C_CAST]]
-// CHECK-SAME: {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none
+// CHECK: %[[MFMA:.+]] = amdgpu.mfma 32x32x8 %[[A_CAST]] * %[[B_CAST]] + %[[C_CAST]] blgp = none
 // CHECK-SAME: : vector<4xf16>, vector<4xf16>, vector<16xf32>
 // CHECK: %[[R_CAST:.+]] = vector.shape_cast %[[MFMA]] : vector<16xf32> to vector<4x1x4x1xf32>
 // CHECK: %[[B_OUT:.*]] = vector.broadcast %[[R_CAST]] : vector<4x1x4x1xf32> to vector<1x1x4x1x4x1xf32>
@@ -154,8 +153,7 @@ builtin.module attributes { transform.with_named_sequence } {
 // CHECK: %[[A_CAST:.+]] = vector.shape_cast %[[A_VEC]] : vector<1x1x1x4xf16> to vector<4xf16>
 // CHECK: %[[B_CAST:.+]] = vector.shape_cast %[[B_VEC]] : vector<1x1x4x1xf16> to vector<4xf16>
 // CHECK: %[[C_CAST:.+]] = vector.shape_cast %[[C_VEC]] : vector<1x1x4x1xf32> to vector<4xf32>
-// CHECK: %[[MFMA:.+]] = amdgpu.mfma %[[A_CAST]] * %[[B_CAST]] + %[[C_CAST]]
-// CHECK-SAME: {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none
+// CHECK: %[[MFMA:.+]] = amdgpu.mfma 16x16x16 %[[A_CAST]] * %[[B_CAST]] + %[[C_CAST]] blgp = none
 // CHECK-SAME: : vector<4xf16>, vector<4xf16>, vector<4xf32>

 // CHECK: %[[R_CAST:.+]] = vector.shape_cast %[[MFMA]] : vector<4xf32> to vector<1x1x4x1xf32>
@@ -238,13 +236,13 @@ builtin.module attributes { transform.with_named_sequence } {
 // CHECK: %[[A_SLICE0:.+]] = vector.extract %[[A_SIMT]][0, 0] : vector<1x1x1x4xf16> from vector<2x1x1x1x1x4xf16>
 // CHECK: %[[A0_CAST:.+]] = vector.shape_cast %[[A_SLICE0]] : vector<1x1x1x4xf16> to vector<4xf16>
 // CHECK: %[[C0_CAST:.+]] = vector.shape_cast %[[C_SLICE0]] : vector<4x1x4x1xf32> to vector<16xf32>
-// CHECK: %[[MFMA0:.+]] = amdgpu.mfma %[[A0_CAST]] * %{{.+}} + %[[C0_CAST]]
+// CHECK: %[[MFMA0:.+]] = amdgpu.mfma 32x32x8 %[[A0_CAST]] * %{{.+}} + %[[C0_CAST]]
 // CHECK: %[[R0_CAST:.+]] = vector.shape_cast %[[MFMA0]] : vector<16xf32> to vector<4x1x4x1xf32>
 // CHECK: %[[C_SLICE1:.+]] = vector.extract %[[C_SIMT]][1, 0] : vector<4x1x4x1xf32> from vector<2x1x4x1x4x1xf32>
 // CHECK: %[[A_SLICE1:.+]] = vector.extract %[[A_SIMT]][1, 0] : vector<1x1x1x4xf16> from vector<2x1x1x1x1x4xf16>
 // CHECK: %[[A1_CAST:.+]] = vector.shape_cast %[[A_SLICE1]] : vector<1x1x1x4xf16> to vector<4xf16>
 // CHECK: %[[C1_CAST:.+]] = vector.shape_cast %[[C_SLICE1]] : vector<4x1x4x1xf32> to vector<16xf32>
-// CHECK: %[[MFMA1:.+]] = amdgpu.mfma %[[A1_CAST]] * %{{.+}} + %[[C1_CAST]]
+// CHECK: %[[MFMA1:.+]] = amdgpu.mfma 32x32x8 %[[A1_CAST]] * %{{.+}} + %[[C1_CAST]]
 // CHECK: %[[R1_CAST:.+]] = vector.shape_cast %[[MFMA1]] : vector<16xf32> to vector<4x1x4x1xf32>
 // CHECK: %[[R0:.+]]:16 = vector.to_elements %[[R0_CAST]] : vector<4x1x4x1xf32>
 // CHECK: %[[R1:.+]]:16 = vector.to_elements %[[R1_CAST]] : vector<4x1x4x1xf32>
@@ -329,12 +327,12 @@ builtin.module attributes { transform.with_named_sequence } {
 // CHECK: %[[B_SLICE0:.+]] = vector.extract %[[B_SIMT]][0, 0]
 // CHECK: %[[A0_CAST:.+]] = vector.shape_cast %[[A_SLICE0]]
 // CHECK: %[[B0_CAST:.+]] = vector.shape_cast %[[B_SLICE0]]
-// CHECK: %[[MFMA0:.+]] = amdgpu.mfma %[[A0_CAST]] * %[[B0_CAST]] + %{{.+}}
+// CHECK: %[[MFMA0:.+]] = amdgpu.mfma 32x32x8 %[[A0_CAST]] * %[[B0_CAST]] + %{{.+}}
 // CHECK: %[[A_SLICE1:.+]] = vector.extract %[[A_SIMT]][0, 1]
 // CHECK: %[[B_SLICE1:.+]] = vector.extract %[[B_SIMT]][1, 0]
 // CHECK: %[[A1_CAST:.+]] = vector.shape_cast %[[A_SLICE1]]
 // CHECK: %[[B1_CAST:.+]] = vector.shape_cast %[[B_SLICE1]]
-// CHECK: %[[MFMA1:.+]] = amdgpu.mfma %[[A1_CAST]] * %[[B1_CAST]] + %[[MFMA0]]
+// CHECK: %[[MFMA1:.+]] = amdgpu.mfma 32x32x8 %[[A1_CAST]] * %[[B1_CAST]] + %[[MFMA0]]
 // CHECK: %[[R_CAST:.+]] = vector.shape_cast %[[MFMA1]]

 // -----
@@ -584,7 +582,7 @@ builtin.module attributes { transform.with_named_sequence } {
 // CHECK: %[[A_CAST:.+]] = vector.shape_cast %[[A_VEC]] : vector<1x1x1x16xf16> to vector<16xf16>
 // CHECK: %[[B_CAST:.+]] = vector.shape_cast %[[B_VEC]] : vector<1x1x16x1xf16> to vector<16xf16>
 // CHECK: %[[C_CAST:.+]] = vector.shape_cast %[[C_VEC]] : vector<8x1x1x1xf32> to vector<8xf32>
-// CHECK: %[[WMMA:.+]] = amdgpu.wmma %[[A_CAST]] * %[[B_CAST]] + %[[C_CAST]]
+// CHECK: %[[WMMA:.+]] = amdgpu.wmma 16x16x16 %[[A_CAST]] * %[[B_CAST]] + %[[C_CAST]]
 // CHECK: %[[R_CAST:.+]] = vector.shape_cast %[[WMMA]] : vector<8xf32> to vector<8x1x1x1xf32>
 // CHECK: %[[B_OUT:.*]] = vector.broadcast %[[R_CAST]] : vector<8x1x1x1xf32> to vector<1x1x8x1x1x1xf32>
 // CHECK: %[[R_SIMD:.+]] = iree_vector_ext.to_simd %[[B_OUT]] : vector<1x1x8x1x1x1xf32> -> vector<16x16xf32>
@@ -670,7 +668,7 @@ builtin.module attributes { transform.with_named_sequence } {
 // CHECK: %[[A_CAST:.+]] = vector.shape_cast %[[A_VEC]] : vector<1x1x1x8xf16> to vector<8xf16>
 // CHECK: %[[B_CAST:.+]] = vector.shape_cast %[[B_VEC]] : vector<1x1x8x1xf16> to vector<8xf16>
 // CHECK: %[[C_CAST:.+]] = vector.shape_cast %[[C_VEC]] : vector<1x1x8x1xf32> to vector<8xf32>
-// CHECK: %[[WMMA:.+]] = amdgpu.wmma %[[A_CAST]] * %[[B_CAST]] + %[[C_CAST]]
+// CHECK: %[[WMMA:.+]] = amdgpu.wmma 16x16x16 %[[A_CAST]] * %[[B_CAST]] + %[[C_CAST]]
 // CHECK: %[[R_CAST:.+]] = vector.shape_cast %[[WMMA]] : vector<8xf32> to vector<1x1x8x1xf32>
 // CHECK: %[[B_OUT:.*]] = vector.broadcast %[[R_CAST]] : vector<1x1x8x1xf32> to vector<1x1x1x1x8x1xf32>
 // CHECK: %[[R_SIMD:.+]] = iree_vector_ext.to_simd %[[B_OUT]] : vector<1x1x1x1x8x1xf32> -> vector<16x16xf32>
@@ -756,13 +754,11 @@ builtin.module attributes { transform.with_named_sequence } {
 // CHECK: %[[C_CAST:.+]] = vector.shape_cast %{{.+}} : vector<4x1x4x1xf32> to vector<16xf32>
 // CHECK: %[[A_SLICE_0:.+]] = vector.extract_strided_slice %[[A_CAST]] {offsets = [0], sizes = [4], strides = [1]} : vector<8xf16> to vector<4xf16>
 // CHECK: %[[B_SLICE_0:.+]] = vector.extract_strided_slice %[[B_CAST]] {offsets = [0], sizes = [4], strides = [1]} : vector<8xf16> to vector<4xf16>
-// CHECK: %[[MFMA_0:.*]] = amdgpu.mfma %[[A_SLICE_0]] * %[[B_SLICE_0]] + %[[C_CAST]]
-// CHECK-SAME: {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none
+// CHECK: %[[MFMA_0:.*]] = amdgpu.mfma 32x32x8 %[[A_SLICE_0]] * %[[B_SLICE_0]] + %[[C_CAST]] blgp = none
 // CHECK-SAME: : vector<4xf16>, vector<4xf16>, vector<16xf32>
 // CHECK: %[[A_SLICE_1:.+]] = vector.extract_strided_slice %[[A_CAST]] {offsets = [4], sizes = [4], strides = [1]} : vector<8xf16> to vector<4xf16>
 // CHECK: %[[B_SLICE_1:.+]] = vector.extract_strided_slice %[[B_CAST]] {offsets = [4], sizes = [4], strides = [1]} : vector<8xf16> to vector<4xf16>
-// CHECK: %[[MFMA_1:.+]] = amdgpu.mfma %[[A_SLICE_1]] * %[[B_SLICE_1]] + %[[MFMA_0]]
-// CHECK-SAME: {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none
+// CHECK: %[[MFMA_1:.+]] = amdgpu.mfma 32x32x8 %[[A_SLICE_1]] * %[[B_SLICE_1]] + %[[MFMA_0]] blgp = none
 // CHECK-SAME: : vector<4xf16>, vector<4xf16>, vector<16xf32>
 // CHECK: %[[R_CAST:.+]] = vector.shape_cast %[[MFMA_1]] : vector<16xf32> to vector<4x1x4x1xf32>
 // CHECK: %[[B_OUT:.*]] = vector.broadcast %[[R_CAST]] : vector<4x1x4x1xf32> to vector<1x1x4x1x4x1xf32>
@@ -831,20 +827,16 @@ builtin.module attributes { transform.with_named_sequence } {
 // CHECK: %[[A_CAST:.+]] = vector.shape_cast %{{.+}} : vector<1x1x1x1x1x8xf8E4M3FNUZ> to vector<8xf8E4M3FNUZ>
 // CHECK: %[[B_CAST:.+]] = vector.shape_cast %{{.+}} : vector<1x1x1x1x1x8xf8E4M3FNUZ> to vector<8xf8E4M3FNUZ>
 // CHECK: %[[C_CAST:.+]] = vector.shape_cast %{{.+}} : vector<1x1x4x1xf32> to vector<4xf32>
-// CHECK: %[[MFMA_0:.*]] = amdgpu.mfma %[[A_CAST]] * %[[B_CAST]] + %[[C_CAST]]
-// CHECK-SAME: {blocks = 1 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32} blgp = none
+// CHECK: %[[MFMA_0:.*]] = amdgpu.mfma 16x16x32 %[[A_CAST]] * %[[B_CAST]] + %[[C_CAST]] blgp = none
 // CHECK: %[[A_CAST_1:.+]] = vector.shape_cast %{{.+}} : vector<1x1x1x1x1x8xf8E4M3FNUZ> to vector<8xf8E4M3FNUZ>
 // CHECK: %[[B_CAST_1:.+]] = vector.shape_cast %{{.+}} : vector<1x1x1x1x1x8xf8E4M3FNUZ> to vector<8xf8E4M3FNUZ>
-// CHECK: %[[MFMA_1:.*]] = amdgpu.mfma %[[A_CAST_1]] * %[[B_CAST_1]] + %[[MFMA_0]]
-// CHECK-SAME: {blocks = 1 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32} blgp = none
+// CHECK: %[[MFMA_1:.*]] = amdgpu.mfma 16x16x32 %[[A_CAST_1]] * %[[B_CAST_1]] + %[[MFMA_0]] blgp = none
 // CHECK: %[[MFMA_1_CAST:.*]] = vector.shape_cast %[[MFMA_1]] : vector<4xf32> to vector<1x1x4x1xf32>
 // CHECK: %[[B_CAST_2:.+]] = vector.shape_cast %{{.+}} : vector<1x1x1x1x1x8xf8E4M3FNUZ> to vector<8xf8E4M3FNUZ>
 // CHECK: %[[C_CAST_1:.+]] = vector.shape_cast %{{.+}} : vector<1x1x4x1xf32> to vector<4xf32>
-// CHECK: %[[MFMA_2:.*]] = amdgpu.mfma %[[A_CAST]] * %[[B_CAST_2]] + %[[C_CAST_1]]
-// CHECK-SAME: {blocks = 1 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32} blgp = none
+// CHECK: %[[MFMA_2:.*]] = amdgpu.mfma 16x16x32 %[[A_CAST]] * %[[B_CAST_2]] + %[[C_CAST_1]] blgp = none
 // CHECK: %[[B_CAST_3:.+]] = vector.shape_cast %{{.+}} : vector<1x1x1x1x1x8xf8E4M3FNUZ> to vector<8xf8E4M3FNUZ>
-// CHECK: %[[MFMA_3:.*]] = amdgpu.mfma %[[A_CAST_1]] * %[[B_CAST_3]] + %[[MFMA_2]]
-// CHECK-SAME: {blocks = 1 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32} blgp = none
+// CHECK: %[[MFMA_3:.*]] = amdgpu.mfma 16x16x32 %[[A_CAST_1]] * %[[B_CAST_3]] + %[[MFMA_2]] blgp = none
 // CHECK: %[[R_CAST:.+]] = vector.shape_cast %[[MFMA_3]] : vector<4xf32> to vector<1x1x4x1xf32>
 // CHECK: %[[R0:.+]]:4 = vector.to_elements %[[MFMA_1_CAST]] : vector<1x1x4x1xf32>
 // CHECK: %[[R1:.+]]:4 = vector.to_elements %[[R_CAST]] : vector<1x1x4x1xf32>

compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp

Lines changed: 2 additions & 1 deletion
@@ -671,7 +671,8 @@ static Value createMmaOp(OpBuilder &builder, Location loc,
         .getResult();
   }
   if (is_AMD_WMMA(intrinsic)) {
-    return amdgpu::WMMAOp::create(builder, loc, resultType, lhs, rhs, acc)
+    return amdgpu::WMMAOp::create(builder, loc, resultType, layout.mSize,
+                                  layout.nSize, layout.kSize, lhs, rhs, acc)
         .getResult();
   }
   return {};
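The OpBuilder::create deprecation mentioned in the commit message is visible in this style of call: ops are now built through the op class's static create, with the builder as the first argument. A before/after sketch, as a fragment in the context of createMmaOp above; the deprecated form is reconstructed for illustration, while the new call is taken from the hunk:

```cpp
// Before (deprecated upstream): going through the OpBuilder member template.
// Value result = builder.create<amdgpu::WMMAOp>(loc, resultType, mSize,
//                                               nSize, kSize, lhs, rhs, acc)
//                    .getResult();

// After: the op's static create takes the builder first. The explicit
// m/n/k operands also reflect the new intrinsic-shape assembly format,
// e.g. `amdgpu.wmma 16x16x16 %lhs * %rhs + %acc`.
Value result = amdgpu::WMMAOp::create(builder, loc, resultType, layout.mSize,
                                      layout.nSize, layout.kSize, lhs, rhs,
                                      acc)
                   .getResult();
```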

compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/lower_inner_tiled.mlir

Lines changed: 10 additions & 18 deletions
@@ -29,8 +29,7 @@ module attributes { transform.with_named_sequence } {
 // CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: vector<4xf16>
 // CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: vector<4xf16>
 // CHECK-SAME: %[[ACC:[A-Za-z0-9]+]]: vector<4xf32>
-// CHECK: amdgpu.mfma %[[LHS]] * %[[RHS]] + %[[ACC]]
-// CHECK-SAME: blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32
+// CHECK: amdgpu.mfma 16x16x16 %[[LHS]] * %[[RHS]] + %[[ACC]]
 // CHECK-SAME: blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>

 // -----
@@ -64,8 +63,7 @@ module attributes { transform.with_named_sequence } {
 // CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: vector<4xf16>
 // CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: vector<4xf16>
 // CHECK-SAME: %[[ACC:[A-Za-z0-9]+]]: vector<16xf32>
-// CHECK: amdgpu.mfma %[[LHS]] * %[[RHS]] + %[[ACC]]
-// CHECK-SAME: blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32
+// CHECK: amdgpu.mfma 32x32x8 %[[LHS]] * %[[RHS]] + %[[ACC]]
 // CHECK-SAME: blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32>

 // -----
@@ -99,8 +97,7 @@ module attributes { transform.with_named_sequence } {
 // CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: vector<4xf16>
 // CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: vector<4xf16>
 // CHECK-SAME: %[[ACC:[A-Za-z0-9]+]]: vector<16xf32>
-// CHECK: amdgpu.mfma %[[RHS]] * %[[LHS]] + %[[ACC]]
-// CHECK-SAME: blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32
+// CHECK: amdgpu.mfma 32x32x8 %[[RHS]] * %[[LHS]] + %[[ACC]]
 // CHECK-SAME: blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32>

 // -----
@@ -137,12 +134,10 @@ module attributes { transform.with_named_sequence } {
 // CHECK-SAME: %[[ACC:[A-Za-z0-9]+]]: vector<4xf32>
 // CHECK: %[[LHS0:.*]] = vector.extract_strided_slice %[[LHS]] {offsets = [0], sizes = [4], strides = [1]} : vector<8xf16> to vector<4xf16>
 // CHECK: %[[RHS0:.*]] = vector.extract_strided_slice %[[RHS]] {offsets = [0], sizes = [4], strides = [1]} : vector<8xf16> to vector<4xf16>
-// CHECK: %[[ACC0:.*]] = amdgpu.mfma %[[RHS0]] * %[[LHS0]] + %[[ACC]]
-// CHECK-SAME: {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32}
+// CHECK: %[[ACC0:.*]] = amdgpu.mfma 16x16x16 %[[RHS0]] * %[[LHS0]] + %[[ACC]]
 // CHECK: %[[LHS1:.*]] = vector.extract_strided_slice %[[LHS]] {offsets = [4], sizes = [4], strides = [1]} : vector<8xf16> to vector<4xf16>
 // CHECK: %[[RHS1:.*]] = vector.extract_strided_slice %[[RHS]] {offsets = [4], sizes = [4], strides = [1]} : vector<8xf16> to vector<4xf16>
-// CHECK: %[[ACC1:.*]] = amdgpu.mfma %[[RHS1]] * %[[LHS1]] + %[[ACC0]]
-// CHECK-SAME: {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32}
+// CHECK: %[[ACC1:.*]] = amdgpu.mfma 16x16x16 %[[RHS1]] * %[[LHS1]] + %[[ACC0]]
 // CHECK: return %[[ACC1]] : vector<4xf32>

 // -----
@@ -176,7 +171,7 @@ module attributes { transform.with_named_sequence } {
 // CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: vector<16xf16>
 // CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: vector<16xf16>
 // CHECK-SAME: %[[ACC:[A-Za-z0-9]+]]: vector<8xf32>
-// CHECK: amdgpu.wmma %[[LHS]] * %[[RHS]] + %[[ACC]]
+// CHECK: amdgpu.wmma 16x16x16 %[[LHS]] * %[[RHS]] + %[[ACC]]
 // CHECK-SAME: : vector<16xf16>, vector<16xf16>, vector<8xf32>

 // -----
@@ -210,7 +205,7 @@ module attributes { transform.with_named_sequence } {
 // CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: vector<8xf16>
 // CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: vector<8xf16>
 // CHECK-SAME: %[[ACC:[A-Za-z0-9]+]]: vector<8xf32>
-// CHECK: amdgpu.wmma %[[LHS]] * %[[RHS]] + %[[ACC]]
+// CHECK: amdgpu.wmma 16x16x16 %[[LHS]] * %[[RHS]] + %[[ACC]]
 // CHECK-SAME: : vector<8xf16>, vector<8xf16>, vector<8xf32>

 // -----
@@ -247,8 +242,7 @@ module attributes { transform.with_named_sequence } {
 // CHECK-DAG: %[[LHSCAST:.+]] = vector.shape_cast %[[LHS]] : vector<1x4xf16> to vector<4xf16>
 // CHECK-DAG: %[[RHSCAST:.+]] = vector.shape_cast %[[RHS]] : vector<4x1xf16> to vector<4xf16>
 // CHECK-DAG: %[[ACCCAST:.+]] = vector.shape_cast %[[ACC]] : vector<4x1xf32> to vector<4xf32>
-// CHECK: %[[MMA:.+]] = amdgpu.mfma %[[LHSCAST]] * %[[RHSCAST]] + %[[ACCCAST]]
-// CHECK-SAME: blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32
+// CHECK: %[[MMA:.+]] = amdgpu.mfma 16x16x16 %[[LHSCAST]] * %[[RHSCAST]] + %[[ACCCAST]]
 // CHECK-SAME: blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
 // CHECK: vector.shape_cast %[[MMA]] : vector<4xf32> to vector<4x1xf32>

@@ -296,8 +290,7 @@ module attributes { transform.with_named_sequence } {
 // CHECK: %[[LHS_SCALE_LONG:.+]] = vector.insert %[[LHS_SCALE_SCALAR]], %[[CST]] [0]
 // CHECK: %[[RHS_SCALE_SCALAR:.+]] = vector.extract %[[RHS_SCALE]][0]
 // CHECK: %[[RHS_SCALE_LONG:.+]] = vector.insert %[[RHS_SCALE_SCALAR]], %[[CST]] [0]
-// CHECK: amdgpu.scaled_mfma(%[[LHS_SCALE_LONG]][0] * %[[LHS]]) * (%[[RHS_SCALE_LONG]][0] * %[[RHS]]) + %[[ACC]]
-// CHECK-SAME: k = 128 : i32, m = 16 : i32, n = 16 : i32
+// CHECK: amdgpu.scaled_mfma 16x16x128 (%[[LHS_SCALE_LONG]][0] * %[[LHS]]) * (%[[RHS_SCALE_LONG]][0] * %[[RHS]]) + %[[ACC]]
 // CHECK-SAME: vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf8E4M3FN>, vector<4xf32>

 // -----
@@ -344,6 +337,5 @@ module attributes { transform.with_named_sequence } {
 // CHECK: %[[LHS_SCALE_LONG:.+]] = vector.insert %[[LHS_SCALE_SCALAR]], %[[CST]] [0]
 // CHECK: %[[RHS_SCALE_SCALAR:.+]] = vector.extract %[[RHS_SCALE]][0]
 // CHECK: %[[RHS_SCALE_LONG:.+]] = vector.insert %[[RHS_SCALE_SCALAR]], %[[CST]] [0]
-// CHECK: amdgpu.scaled_mfma(%[[LHS_SCALE_LONG]][0] * %[[LHS]]) * (%[[RHS_SCALE_LONG]][0] * %[[RHS]]) + %[[ACC]]
-// CHECK-SAME: k = 64 : i32, m = 32 : i32, n = 32 : i32
+// CHECK: amdgpu.scaled_mfma 32x32x64 (%[[LHS_SCALE_LONG]][0] * %[[LHS]]) * (%[[RHS_SCALE_LONG]][0] * %[[RHS]]) + %[[ACC]]
 // CHECK-SAME: vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf8E4M3FN>, vector<16xf32>

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_direct_conv_tile_and_fuse.mlir

Lines changed: 1 addition & 1 deletion
@@ -77,6 +77,6 @@ hal.executable private @main {
 // CHECK: gpu.barrier
 // CHECK-DAG: %[[LHS_MM:.+]] = vector.transfer_read {{.*}} vector<4x4xf16>
 // CHECK-DAG: %[[RHS_MM:.+]] = vector.transfer_read {{.*}} vector<4x4xf16>
-// CHECK-COUNT-4: amdgpu.mfma {{.*}}blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32
+// CHECK-COUNT-4: amdgpu.mfma 16x16x16
 // CHECK: vector.transfer_write %{{.*}}, %[[BUF2]]
 // CHECK: } {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
