Skip to content

Commit cb5d1ab

Browse files
authored
Rename unroll_n_to_subgroups to subgroups_n (iree-org#19102)
"Unroll" usually means "generate more instructions", so the terminology being changed here, `unroll_n_to_subgroups`, created confusion: it describes a distribution across subgroups, not instruction-level unrolling.

Signed-off-by: Benoit Jacob <[email protected]>
1 parent e10231c commit cb5d1ab

16 files changed

+48
-49
lines changed

compiler/src/iree/compiler/Codegen/Common/GPU/GPUMaterializeEncoding.cpp

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -184,10 +184,10 @@ chooseDataTiledMMAAttr(TypeRange eTypes, IREE::GPU::TargetAttr target,
184184
//
185185
// That does simplify the below adjustments for narrow M/N, as we don't need
186186
// to think about unroll-to-subgroups when making the narrowing adjustment.
187-
int unrollMToSubgroups = 1;
188-
int unrollNToSubgroups = *wgp.getSimdsPerWgp();
189-
int unrollM = totalUnrollM / unrollMToSubgroups;
190-
int unrollN = totalUnrollN / unrollNToSubgroups;
187+
int subgroupsM = 1;
188+
int subgroupsN = *wgp.getSimdsPerWgp();
189+
int unrollM = totalUnrollM / subgroupsM;
190+
int unrollN = totalUnrollN / subgroupsN;
191191

192192
//
193193
// Step 3: Adjust the unrolling factors when there is a narrow dimension.
@@ -201,15 +201,14 @@ chooseDataTiledMMAAttr(TypeRange eTypes, IREE::GPU::TargetAttr target,
201201
}
202202
if (narrowDim.isN()) {
203203
std::swap(unrollM, unrollN);
204-
std::swap(unrollMToSubgroups, unrollNToSubgroups);
205-
assert(unrollNToSubgroups == 1);
204+
std::swap(subgroupsM, subgroupsN);
205+
assert(subgroupsN == 1);
206206
unrollN = std::min(unrollN, static_cast<int>(llvm::divideCeil(
207207
narrowDim.size, intrinsicMma.getNSize())));
208208
}
209209

210210
return DataTiledMMAAttr::get(ctx, intrinsicMma.getIntrinsic(), unrollM,
211-
unrollMToSubgroups, unrollN, unrollNToSubgroups,
212-
unrollK);
211+
subgroupsM, unrollN, subgroupsN, unrollK);
213212
}
214213

215214
static FailureOr<MaterializeEncodingInfo>

compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_materialize_encoding_gfx1100.mlir

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,5 +56,5 @@ func.func @matmul_lowering_WMMA_F32_16x16x16_F16() {
5656
// CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[ACC]]
5757
// CHECK-SAME: indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]],
5858
// CHECK-SAME: iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>]
59-
// CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout<intrinsic = WMMA_F32_16x16x16_F16, unroll_m = 4, unroll_n_to_subgroups = 4>
59+
// CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout<intrinsic = WMMA_F32_16x16x16_F16, unroll_m = 4, subgroups_n = 4>
6060
// CHECK: flow.dispatch.tensor.store %[[MMA]], %[[ACC_BINDING]]

compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_materialize_encoding_gfx908.mlir

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,5 +56,5 @@ func.func @matmul_lowering_MFMA_i32_16x16x16_i8() {
5656
// CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[ACC]]
5757
// CHECK-SAME: indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]],
5858
// CHECK-SAME: iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>]
59-
// CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x16_I8, unroll_m = 4, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 4>
59+
// CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x16_I8, unroll_m = 4, unroll_n = 2, subgroups_n = 4, unroll_k = 4>
6060
// CHECK: flow.dispatch.tensor.store %[[MMA]], %[[ACC_BINDING]]

compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_materialize_encoding_gfx90a.mlir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ func.func @matmul_lowering_MFMA_f32_16x16x8_bf16() {
5656
// CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[ACC]]
5757
// CHECK-SAME: indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]],
5858
// CHECK-SAME: iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>]
59-
// CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_F32_16x16x8_BF16, unroll_m = 4, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 4>
59+
// CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_F32_16x16x8_BF16, unroll_m = 4, unroll_n = 2, subgroups_n = 4, unroll_k = 4>
6060
// CHECK: flow.dispatch.tensor.store %[[MMA]], %[[ACC_BINDING]]
6161

6262
// -----
@@ -115,5 +115,5 @@ func.func @matmul_lowering_MFMA_f64_16x16x4_f64() {
115115
// CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[ACC]]
116116
// CHECK-SAME: indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]],
117117
// CHECK-SAME: iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>]
118-
// CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_F64_16x16x4_F64, unroll_m = 4, unroll_n_to_subgroups = 4, unroll_k = 2>
118+
// CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_F64_16x16x4_F64, unroll_m = 4, subgroups_n = 4, unroll_k = 2>
119119
// CHECK: flow.dispatch.tensor.store %[[MMA]], %[[ACC_BINDING]]

compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_materialize_encoding_gfx942.mlir

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -366,7 +366,7 @@ func.func @matmul_lowering_MFMA_F32_16x16x4_F32() {
366366
// CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[ACC]]
367367
// CHECK-SAME: indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]],
368368
// CHECK-SAME: iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>]
369-
// CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_F32_16x16x4_F32, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 4>
369+
// CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_F32_16x16x4_F32, unroll_m = 8, unroll_n = 2, subgroups_n = 4, unroll_k = 4>
370370
// CHECK: flow.dispatch.tensor.store %[[MMA]], %[[ACC_BINDING]]
371371

372372
// -----
@@ -426,7 +426,7 @@ func.func @batch_matmul_lowering_MFMA_F32_16x16x4_F32() {
426426
// CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[ACC]]
427427
// CHECK-SAME: indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]],
428428
// CHECK-SAME: iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>]
429-
// CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_F32_16x16x4_F32, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 4>
429+
// CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_F32_16x16x4_F32, unroll_m = 8, unroll_n = 2, subgroups_n = 4, unroll_k = 4>
430430
// CHECK: flow.dispatch.tensor.store %[[MMA]], %[[ACC_BINDING]]
431431

432432
// -----
@@ -622,7 +622,7 @@ func.func @matmul_lowering_MFMA_I32_16x16x32_I8() {
622622
// CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[ACC]]
623623
// CHECK-SAME: indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]],
624624
// CHECK-SAME: iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>]
625-
// CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>
625+
// CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, subgroups_n = 4, unroll_k = 2>
626626
// CHECK: flow.dispatch.tensor.store %[[MMA]], %[[ACC_BINDING]]
627627

628628
// -----
@@ -700,7 +700,7 @@ func.func @matmul_lowering_MFMA_I32_16x16x32_I8_custom_max_load_instruction_bits
700700
// CHECK: func.func @matmul_lowering_MFMA_I32_16x16x32_I8_custom_max_load_instruction_bits_64
701701
// CHECK: iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[ACC]]
702702
// CHECK-SAME: iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>]
703-
// CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4>
703+
// CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, subgroups_n = 4>
704704

705705
// -----
706706

@@ -773,11 +773,11 @@ func.func @matmul_lowering_MFMA_I32_16x16x32_I8_custom_max_load_instruction_bits
773773
// CHECK: func.func @matmul_lowering_MFMA_I32_16x16x32_I8_custom_max_load_instruction_bits_64
774774
// CHECK: iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[ACC]]
775775
// CHECK-SAME: iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>]
776-
// CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 4>
776+
// CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, subgroups_n = 4, unroll_k = 4>
777777

778778
// -----
779779

780-
// Custom {simds_per_wgp = 1} => implied default {unroll_n_to_subgroups = 1} (omitted in output) and {unroll_n = 8} instead of {unroll_n_to_subgroups = 4}.
780+
// Custom {simds_per_wgp = 1} => implied default {subgroups_n = 1} (omitted in output) and {unroll_n = 8} instead of {subgroups_n = 4}.
781781

782782
#target_gfx942_except_simds_per_wgp_1 = #hal.executable.target<"rocm", "rocm-hsaco-fb", {
783783
iree.gpu.target = #iree_gpu.target<
@@ -919,7 +919,7 @@ func.func @matmul_lowering_MFMA_I32_16x16x32_I8_custom_vgpr_space_bits_8192() at
919919
// CHECK: func.func @matmul_lowering_MFMA_I32_16x16x32_I8_custom_vgpr_space_bits_8192
920920
// CHECK: iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[ACC]]
921921
// CHECK-SAME: iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>]
922-
// CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 4, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>
922+
// CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 4, unroll_n = 2, subgroups_n = 4, unroll_k = 2>
923923

924924
// -----
925925

@@ -992,7 +992,7 @@ func.func @matmul_lowering_MFMA_I32_16x16x32_I8_custom_vgpr_space_bits_4096() at
992992
// CHECK: func.func @matmul_lowering_MFMA_I32_16x16x32_I8_custom_vgpr_space_bits_4096
993993
// CHECK: iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[ACC]]
994994
// CHECK-SAME: iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>]
995-
// CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 4, unroll_n_to_subgroups = 4, unroll_k = 2>
995+
// CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 4, subgroups_n = 4, unroll_k = 2>
996996

997997
// -----
998998

@@ -1065,7 +1065,7 @@ func.func @matmul_lowering_MFMA_I32_16x16x32_I8_custom_vgpr_space_bits_32768() a
10651065
// CHECK: func.func @matmul_lowering_MFMA_I32_16x16x32_I8_custom_vgpr_space_bits_32768
10661066
// CHECK: iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[ACC]]
10671067
// CHECK-SAME: iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>]
1068-
// CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 4, unroll_n_to_subgroups = 4, unroll_k = 2>
1068+
// CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 4, subgroups_n = 4, unroll_k = 2>
10691069

10701070
// -----
10711071

@@ -1128,7 +1128,7 @@ func.func @batch_matmul_lowering_MFMA_F32_16x16x32_F8E4M3FNUZ() {
11281128
// CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[ACC]]
11291129
// CHECK-SAME: indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]],
11301130
// CHECK-SAME: iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>]
1131-
// CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_F32_16x16x32_F8E4M3FNUZ, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>
1131+
// CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_F32_16x16x32_F8E4M3FNUZ, unroll_m = 8, unroll_n = 2, subgroups_n = 4, unroll_k = 2>
11321132
// CHECK: flow.dispatch.tensor.store %[[MMA]], %[[ACC_BINDING]]
11331133

11341134
// -----
@@ -1188,5 +1188,5 @@ func.func @batch_matmul_lowering_MFMA_F32_16x16x16_BF16() {
11881188
// CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[ACC]]
11891189
// CHECK-SAME: indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]],
11901190
// CHECK-SAME: iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>]
1191-
// CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_F32_16x16x16_BF16, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>
1191+
// CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_F32_16x16x16_BF16, unroll_m = 8, unroll_n = 2, subgroups_n = 4, unroll_k = 2>
11921192
// CHECK: flow.dispatch.tensor.store %[[MMA]], %[[ACC_BINDING]]

compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/GPUTileSwizzleUtils.cpp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -152,8 +152,8 @@ TileSwizzle getSwizzle(IREE::GPU::DataTiledMMAAttr mma,
152152
if (mma.getUnrollM() > 1) {
153153
expand(swizzle, 0, {Kind::CrossIntrinsic, mma.getUnrollM()});
154154
}
155-
if (mma.getUnrollMToSubgroups() > 1) {
156-
expand(swizzle, 0, {Kind::CrossThread, mma.getUnrollMToSubgroups()});
155+
if (mma.getSubgroupsM() > 1) {
156+
expand(swizzle, 0, {Kind::CrossThread, mma.getSubgroupsM()});
157157
}
158158
break;
159159
case IREE::GPU::MMAFragment::Rhs:
@@ -169,8 +169,8 @@ TileSwizzle getSwizzle(IREE::GPU::DataTiledMMAAttr mma,
169169
if (mma.getUnrollN() > 1) {
170170
expand(swizzle, 0, {Kind::CrossIntrinsic, mma.getUnrollN()});
171171
}
172-
if (mma.getUnrollNToSubgroups() > 1) {
173-
expand(swizzle, 0, {Kind::CrossThread, mma.getUnrollNToSubgroups()});
172+
if (mma.getSubgroupsN() > 1) {
173+
expand(swizzle, 0, {Kind::CrossThread, mma.getSubgroupsN()});
174174
}
175175
break;
176176
case IREE::GPU::MMAFragment::Acc:
@@ -179,14 +179,14 @@ TileSwizzle getSwizzle(IREE::GPU::DataTiledMMAAttr mma,
179179
if (mma.getUnrollN() > 1) {
180180
expand(swizzle, 1, {Kind::CrossIntrinsic, mma.getUnrollN()});
181181
}
182-
if (mma.getUnrollNToSubgroups() > 1) {
183-
expand(swizzle, 1, {Kind::CrossThread, mma.getUnrollNToSubgroups()});
182+
if (mma.getSubgroupsN() > 1) {
183+
expand(swizzle, 1, {Kind::CrossThread, mma.getSubgroupsN()});
184184
}
185185
if (mma.getUnrollM() > 1) {
186186
expand(swizzle, 0, {Kind::CrossIntrinsic, mma.getUnrollM()});
187187
}
188-
if (mma.getUnrollMToSubgroups() > 1) {
189-
expand(swizzle, 0, {Kind::CrossThread, mma.getUnrollMToSubgroups()});
188+
if (mma.getSubgroupsM() > 1) {
189+
expand(swizzle, 0, {Kind::CrossThread, mma.getSubgroupsM()});
190190
}
191191
break;
192192
}

compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -939,8 +939,8 @@ std::tuple<Type, Type, Type> DataTiledMMAAttr::getABCElementTypes() const {
939939
std::tuple<int64_t, int64_t, int64_t> DataTiledMMAAttr::getMNKShape() const {
940940
MLIRContext *ctx = getContext();
941941
auto opaqueLayout = getOpaqueMFMALayout(ctx, getIntrinsic().getValue());
942-
return {opaqueLayout.mSize * getUnrollM() * getUnrollMToSubgroups(),
943-
opaqueLayout.nSize * getUnrollN() * getUnrollNToSubgroups(),
942+
return {opaqueLayout.mSize * getUnrollM() * getSubgroupsM(),
943+
opaqueLayout.nSize * getUnrollN() * getSubgroupsN(),
944944
opaqueLayout.kSize * getUnrollK()};
945945
}
946946

compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -279,9 +279,9 @@ def IREEGPU_DataTiledMMAAttr :
279279
let parameters = (ins
280280
"::mlir::iree_compiler::IREE::GPU::MMAIntrinsicAttr":$intrinsic,
281281
DefaultValuedParameter<"int64_t", "1", "Unrolling along the M dimension, on the same thread.">:$unroll_m,
282-
DefaultValuedParameter<"int64_t", "1", "Unrolling along the M dimension, distributed across this many more threads.">:$unroll_m_to_subgroups,
282+
DefaultValuedParameter<"int64_t", "1", "Unrolling along the M dimension, distributed across this many more threads.">:$subgroups_m,
283283
DefaultValuedParameter<"int64_t", "1", "Unrolling along the N dimension, on the same thread.">:$unroll_n,
284-
DefaultValuedParameter<"int64_t", "1", "Unrolling along the N dimension, distributed across this many more threads.">:$unroll_n_to_subgroups,
284+
DefaultValuedParameter<"int64_t", "1", "Unrolling along the N dimension, distributed across this many more threads.">:$subgroups_n,
285285
DefaultValuedParameter<"int64_t", "1", "Unrolling along the K dimension, on the same thread, with interleaved layout.">:$unroll_k
286286
);
287287
}

compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/iree_gpu_attrs.mlir

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,21 +29,21 @@ module {
2929

3030
module {
3131
func.func @test_data_tiled_mfma_f32_16x16x4_f32() attributes {
32-
mma_types = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_F32_16x16x4_F32, unroll_m = 4, unroll_m_to_subgroups = 2, unroll_k = 1>} {
32+
mma_types = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_F32_16x16x4_F32, unroll_m = 4, subgroups_m = 2, unroll_k = 1>} {
3333
return
3434
}
3535
}
3636
// CHECK-LABEL: func @test_data_tiled_mfma_f32_16x16x4_f32
37-
// CHECK-SAME: mma_types = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_F32_16x16x4_F32, unroll_m = 4, unroll_m_to_subgroups = 2>
37+
// CHECK-SAME: mma_types = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_F32_16x16x4_F32, unroll_m = 4, subgroups_m = 2>
3838

3939
module {
4040
func.func @test_data_tiled_mfma_f32_16x16x16_f16() attributes {
41-
mma_types = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_F32_16x16x16_F16, unroll_m = 1, unroll_n_to_subgroups = 2, unroll_k = 2>} {
41+
mma_types = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_F32_16x16x16_F16, unroll_m = 1, subgroups_n = 2, unroll_k = 2>} {
4242
return
4343
}
4444
}
4545
// CHECK-LABEL: func @test_data_tiled_mfma_f32_16x16x16_f16
46-
// CHECK-SAME: mma_types = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_F32_16x16x16_F16, unroll_n_to_subgroups = 2, unroll_k = 2>
46+
// CHECK-SAME: mma_types = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_F32_16x16x16_F16, subgroups_n = 2, unroll_k = 2>
4747

4848
module {
4949
func.func @test_data_tiled_mfma_i32_16x16x32_i8() attributes {

compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/iree_gpu_ops.mlir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -281,7 +281,7 @@ func.func @data_tiled_2x2x4_tensor_multi_mma(%lhs: tensor<?x?x2x4x16x1x4xf32>, %
281281
%0 = iree_gpu.multi_mma %lhs, %rhs, %acc {
282282
indexing_maps = #contraction_accesses,
283283
iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>],
284-
kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_F32_16x16x4_F32, unroll_m_to_subgroups = 2, unroll_n_to_subgroups = 2, unroll_k = 4>
284+
kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_F32_16x16x4_F32, subgroups_m = 2, subgroups_n = 2, unroll_k = 4>
285285
} : tensor<?x?x2x4x16x1x4xf32>, tensor<?x?x2x4x16x1x4xf32> into tensor<?x?x2x2x4x16x4x1xf32>
286286
return %0 : tensor<?x?x2x2x4x16x4x1xf32>
287287
}
@@ -294,7 +294,7 @@ func.func @data_tiled_2x2x4_tensor_multi_mma(%lhs: tensor<?x?x2x4x16x1x4xf32>, %
294294
// CHECK: iree_gpu.multi_mma %arg0, %arg1, %arg2
295295
// CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]]
296296
// CHECK-SAME: iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>]
297-
// CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_F32_16x16x4_F32, unroll_m_to_subgroups = 2, unroll_n_to_subgroups = 2, unroll_k = 4>
297+
// CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_F32_16x16x4_F32, subgroups_m = 2, subgroups_n = 2, unroll_k = 4>
298298
// CHECK-SAME: : tensor<?x?x2x4x16x1x4xf32>, tensor<?x?x2x4x16x1x4xf32> into tensor<?x?x2x2x4x16x4x1xf32>
299299

300300

0 commit comments

Comments
 (0)