
Commit 1fc6e5b

Add CDNA3 MFMA BF16 intrinsics. (#18892)
Signed-off-by: Benoit Jacob <[email protected]>
1 parent 3b751a4 commit 1fc6e5b

File tree: 6 files changed, +155 -4 lines

compiler/plugins/target/ROCM/test/target_device_features.mlir

Lines changed: 2 additions & 2 deletions
@@ -15,7 +15,7 @@
 // GFX942: target = #iree_gpu.target<arch = "gfx942",
 // GFX942-SAME: wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8,
 // GFX942-SAME: subgroup = shuffle|arithmetic, dot = dp4xi8toi32,
-// GFX942-SAME: mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>],
+// GFX942-SAME: mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>],
 // GFX942-SAME: subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024],
 // GFX942-SAME: max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536,
 // GFX942-SAME: max_workgroup_counts = [2147483647, 2147483647, 2147483647],
@@ -26,7 +26,7 @@
 // GFX941-SAME: features = "+sramecc,-xnack"

 // GFX940: target = #iree_gpu.target<arch = "gfx940",
-// GFX940-SAME: mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>],
+// GFX940-SAME: mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>],

 // GFX1100: target = #iree_gpu.target<arch = "gfx1100",
 // GFX1100-SAME: mma = [<WMMA_F32_16x16x16_F16>, <WMMA_F16_16x16x16_F16>, <WMMA_I32_16x16x16_I8>]
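(For readers skimming the FileCheck expectations: a GFX942-SAME or GFX940-SAME directive continues the match on the same output line as the preceding check, so each mma list is asserted as part of a single #iree_gpu.target attribute. The change here only inserts <MFMA_F32_16x16x16_BF16> and <MFMA_F32_32x32x8_BF16> between the F16 and F8 entries for these CDNA3 targets.)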

compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_materialize_encoding.mlir

Lines changed: 60 additions & 0 deletions
@@ -1130,3 +1130,63 @@ func.func @batch_matmul_lowering_MFMA_F32_16x16x32_F8E4M3FNUZ() {
 // CHECK-SAME: iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>]
 // CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_F32_16x16x32_F8E4M3FNUZ, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>
 // CHECK: flow.dispatch.tensor.store %[[MMA]], %[[ACC_BINDING]]
+
+// -----
+
+#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
+#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+#encoding_lhs = #iree_encoding.encoding<operand_index = 0, op_type = matmul, element_types = [bf16, bf16, f32], user_indexing_maps = [#map, #map1, #map2]>
+#encoding_rhs = #iree_encoding.encoding<operand_index = 1, op_type = matmul, element_types = [bf16, bf16, f32], user_indexing_maps = [#map, #map1, #map2]>
+#encoding_result = #iree_encoding.encoding<operand_index = 2, op_type = matmul, element_types = [bf16, bf16, f32], user_indexing_maps = [#map, #map1, #map2]>
+#pipeline_layout_4 = #hal.pipeline.layout<constants = 4, bindings = [
+  #hal.pipeline.binding<storage_buffer>,
+  #hal.pipeline.binding<storage_buffer>,
+  #hal.pipeline.binding<storage_buffer>
+]>
+func.func @batch_matmul_lowering_MFMA_F32_16x16x16_BF16() {
+  %c0 = arith.constant 0 : index
+  %B = hal.interface.constant.load layout(#pipeline_layout_4) ordinal(0) : index
+  %M = hal.interface.constant.load layout(#pipeline_layout_4) ordinal(1) : index
+  %N = hal.interface.constant.load layout(#pipeline_layout_4) ordinal(2) : index
+  %K = hal.interface.constant.load layout(#pipeline_layout_4) ordinal(3) : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout_4) binding(0) alignment(64) offset(%c0)
+      : !flow.dispatch.tensor<readonly:tensor<?x?x?xbf16, #encoding_lhs>>{%B, %M, %K}
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout_4) binding(1) alignment(64) offset(%c0)
+      : !flow.dispatch.tensor<readonly:tensor<?x?x?xbf16, #encoding_rhs>>{%B, %K, %N}
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout_4) binding(2) alignment(64) offset(%c0)
+      : !flow.dispatch.tensor<readwrite:tensor<?x?x?xf32, #encoding_result>>{%B, %M, %N}
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [%B, %M, %K], strides = [1, 1, 1]
+      : !flow.dispatch.tensor<readonly:tensor<?x?x?xbf16, #encoding_lhs>>{%B, %M, %K}
+      -> tensor<?x?x?xbf16, #encoding_lhs>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [%B, %K, %N], strides = [1, 1, 1]
+      : !flow.dispatch.tensor<readonly:tensor<?x?x?xbf16, #encoding_rhs>>{%B, %K, %N}
+      -> tensor<?x?x?xbf16, #encoding_rhs>
+  %5 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [%B, %M, %N], strides = [1, 1, 1]
+      : !flow.dispatch.tensor<readwrite:tensor<?x?x?xf32, #encoding_result>>{%B, %M, %N}
+      -> tensor<?x?x?xf32, #encoding_result>
+  %6 = linalg.batch_matmul
+      ins(%3, %4 : tensor<?x?x?xbf16, #encoding_lhs>,
+                   tensor<?x?x?xbf16, #encoding_rhs>)
+      outs(%5 : tensor<?x?x?xf32, #encoding_result>)
+      -> tensor<?x?x?xf32, #encoding_result>
+  flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0], sizes = [%B, %M, %N], strides = [1, 1, 1]
+      : tensor<?x?x?xf32, #encoding_result>
+      -> !flow.dispatch.tensor<readwrite:tensor<?x?x?xf32, #encoding_result>>{%B, %M, %N}
+  return
+}
+// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>
+// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+// CHECK: func.func @batch_matmul_lowering_MFMA_F32_16x16x16_BF16
+// CHECK-DAG: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan {{.+}} binding(0)
+// CHECK-DAG: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan {{.+}} binding(1)
+// CHECK-DAG: %[[ACC_BINDING:.+]] = hal.interface.binding.subspan {{.+}} binding(2)
+// CHECK-DAG: %[[LHS:.+]] = flow.dispatch.tensor.load %[[LHS_BINDING]]{{.+}} -> tensor<?x?x?x8x4x16x2x4xbf16>
+// CHECK-DAG: %[[RHS:.+]] = flow.dispatch.tensor.load %[[RHS_BINDING]]{{.+}} -> tensor<?x?x?x4x2x4x16x2x4xbf16>
+// CHECK-DAG: %[[ACC:.+]] = flow.dispatch.tensor.load %[[ACC_BINDING]]{{.+}} -> tensor<?x?x?x8x4x2x4x16x4xf32>
+// CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[ACC]]
+// CHECK-SAME: indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]],
+// CHECK-SAME: iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>]
+// CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_F32_16x16x16_BF16, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>
+// CHECK: flow.dispatch.tensor.store %[[MMA]], %[[ACC_BINDING]]
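One way to sanity-check the materialized shapes in the CHECK lines (my own arithmetic, not part of the commit): with the 16x16x16 intrinsic and unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2, the inner data-tiled dimensions multiply out exactly to the outer tile sizes:

\[
\begin{aligned}
M_{\mathrm{tile}} &= 8 \cdot 16 = 128, \qquad
N_{\mathrm{tile}} = 4 \cdot 2 \cdot 16 = 128, \qquad
K_{\mathrm{tile}} = 2 \cdot 16 = 32, \\
\text{LHS: } 8 \cdot 4 \cdot 16 \cdot 2 \cdot 4 &= 4096 = M_{\mathrm{tile}} \cdot K_{\mathrm{tile}}, \\
\text{RHS: } 4 \cdot 2 \cdot 4 \cdot 16 \cdot 2 \cdot 4 &= 4096 = N_{\mathrm{tile}} \cdot K_{\mathrm{tile}}, \\
\text{ACC: } 8 \cdot 4 \cdot 2 \cdot 4 \cdot 16 \cdot 4 &= 16384 = M_{\mathrm{tile}} \cdot N_{\mathrm{tile}}.
\end{aligned}
\]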

compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp

Lines changed: 58 additions & 2 deletions
@@ -214,6 +214,7 @@ static OpaqueMmaLayout getOpaqueMFMALayout(MLIRContext *context,
   Type f8E4M3FNUZ = Float8E4M3FNUZType::get(context);
   Type f8E5M2FNUZ = Float8E5M2FNUZType::get(context);
   Type f16 = Float16Type::get(context);
+  Type bf16 = BFloat16Type::get(context);
   Type f32 = Float32Type::get(context);

   Type i8 = IntegerType::get(context, 8);
@@ -229,6 +230,12 @@
   case MMAIntrinsic::MFMA_F32_32x32x8_F16: {
     return OpaqueMmaLayout{32, 32, 8, f16, f16, f32};
   }
+  case MMAIntrinsic::MFMA_F32_16x16x16_BF16: {
+    return OpaqueMmaLayout{16, 16, 16, bf16, bf16, f32};
+  }
+  case MMAIntrinsic::MFMA_F32_32x32x8_BF16: {
+    return OpaqueMmaLayout{32, 32, 8, bf16, bf16, f32};
+  }
   case MMAIntrinsic::MFMA_F32_16x16x32_F8E4M3FNUZ: {
     return OpaqueMmaLayout{16, 16, 32, f8E4M3FNUZ, f8E4M3FNUZ, f32};
   }
@@ -336,6 +343,45 @@ static ConcreteMmaLayout getConcreteMFMALayout(MLIRContext *context,
     return ConcreteMmaLayout{opaqueLayout, aMLayout, aKLayout, bKLayout,
                              bNLayout, cMLayout, cNLayout};
   }
+  case MMAIntrinsic::MFMA_F32_16x16x16_BF16: {
+    // #outer = #iree_vector_ext.per_dim_layout<[LANEX], [16]>
+    // #inner = #iree_vector_ext.per_dim_layout<[LANEY, VECTORX], [4, 4]>
+    // #layout_a = #iree_vector_ext.layout<#outer, #inner>
+    // #layout_b = #iree_vector_ext.layout<#inner, #outer>
+    // #layout_c = #iree_vector_ext.layout<#inner, #outer>
+
+    auto outer = PerDimLayoutAttr::get(context, {laneX}, {16});
+    auto inner = PerDimLayoutAttr::get(context, {laneY, vectorX}, {4, 4});
+    auto aMLayout = outer;
+    auto aKLayout = inner;
+    auto bKLayout = inner;
+    auto bNLayout = outer;
+    auto cMLayout = inner;
+    auto cNLayout = outer;
+    return ConcreteMmaLayout{opaqueLayout, aMLayout, aKLayout, bKLayout,
+                             bNLayout, cMLayout, cNLayout};
+  }
+  case MMAIntrinsic::MFMA_F32_32x32x8_BF16: {
+    // #outer = #iree_vector_ext.per_dim_layout<[LANEX], [32]>
+    // #inner1 = #iree_vector_ext.per_dim_layout<[LANEY, VECTORX], [2, 4]>
+    // #inner2 = #iree_vector_ext.per_dim_layout<[VECTORY, LANEY, VECTORX],
+    //           [4, 2, 4]>
+    // #layout_a = #iree_vector_ext.layout<#outer, #inner1>
+    // #layout_b = #iree_vector_ext.layout<#inner1, #outer>
+    // #layout_c = #iree_vector_ext.layout<#inner2, #outer>
+
+    auto outer = PerDimLayoutAttr::get(context, {laneX}, {32});
+    auto inner = PerDimLayoutAttr::get(context, {laneY, vectorX}, {2, 4});
+    auto aMLayout = outer;
+    auto aKLayout = inner;
+    auto bKLayout = inner;
+    auto bNLayout = outer;
+    auto cMLayout =
+        PerDimLayoutAttr::get(context, {vectorY, laneY, vectorX}, {4, 2, 4});
+    auto cNLayout = outer;
+    return ConcreteMmaLayout{opaqueLayout, aMLayout, aKLayout, bKLayout,
+                             bNLayout, cMLayout, cNLayout};
+  }
   case MMAIntrinsic::MFMA_F32_16x16x32_F8E4M3FNUZ:
   case MMAIntrinsic::MFMA_I32_16x16x32_I8: {
     // #outer = #iree_vector_ext.per_dim_layout<[LANEX], [16]>
@@ -462,14 +508,16 @@ MMAAttr::getABCVectorTypes() const {
     return std::make_tuple(aType, bType, cType);
   }
   case MMAIntrinsic::MFMA_I32_16x16x16_I8:
-  case MMAIntrinsic::MFMA_F32_16x16x16_F16: {
+  case MMAIntrinsic::MFMA_F32_16x16x16_F16:
+  case MMAIntrinsic::MFMA_F32_16x16x16_BF16: {
     auto aType = VectorType::get({4}, getAType());
     auto bType = VectorType::get({4}, getBType());
     auto cType = VectorType::get({4}, getCType());
     return std::make_tuple(aType, bType, cType);
   }
   case MMAIntrinsic::MFMA_I32_32x32x8_I8:
-  case MMAIntrinsic::MFMA_F32_32x32x8_F16: {
+  case MMAIntrinsic::MFMA_F32_32x32x8_F16:
+  case MMAIntrinsic::MFMA_F32_32x32x8_BF16: {
     auto aType = VectorType::get({4}, getAType());
     auto bType = VectorType::get({4}, getBType());
     auto cType = VectorType::get({16}, getCType());
@@ -519,8 +567,10 @@ int64_t MMAAttr::getBlockSize() const {
   switch (getIntrinsic().getValue()) {
   case MMAIntrinsic::MFMA_F32_16x16x4_F32:
   case MMAIntrinsic::MFMA_F32_16x16x16_F16:
+  case MMAIntrinsic::MFMA_F32_16x16x16_BF16:
   case MMAIntrinsic::MFMA_I32_16x16x16_I8:
   case MMAIntrinsic::MFMA_F32_32x32x8_F16:
+  case MMAIntrinsic::MFMA_F32_32x32x8_BF16:
   case MMAIntrinsic::MFMA_I32_32x32x8_I8:
   case MMAIntrinsic::MFMA_F32_16x16x32_F8E4M3FNUZ:
   case MMAIntrinsic::MFMA_F32_16x16x32_F8E5M2FNUZ:
@@ -540,8 +590,10 @@ static int64_t getIntrinsicSubgroupSize(MMAIntrinsic intrinsic) {
   switch (intrinsic) {
   case MMAIntrinsic::MFMA_F32_16x16x4_F32:
   case MMAIntrinsic::MFMA_F32_16x16x16_F16:
+  case MMAIntrinsic::MFMA_F32_16x16x16_BF16:
   case MMAIntrinsic::MFMA_I32_16x16x16_I8:
   case MMAIntrinsic::MFMA_F32_32x32x8_F16:
+  case MMAIntrinsic::MFMA_F32_32x32x8_BF16:
   case MMAIntrinsic::MFMA_I32_32x32x8_I8:
   case MMAIntrinsic::MFMA_F32_16x16x32_F8E4M3FNUZ:
   case MMAIntrinsic::MFMA_F32_16x16x32_F8E5M2FNUZ:
@@ -584,6 +636,7 @@ MMASingleSubgroupLayout getSingleSubgroupLayout(MMAIntrinsic intrinsic,
   }
   case MMAIntrinsic::MFMA_I32_16x16x16_I8:
   case MMAIntrinsic::MFMA_F32_16x16x16_F16:
+  case MMAIntrinsic::MFMA_F32_16x16x16_BF16:
     switch (fragment) {
     case MMAFragment::Lhs:
       return {/*outer=*/{1, 1}, /*thread=*/{16, 4}, /*tstrides=*/{1, 16},
@@ -597,6 +650,7 @@
   }
   case MMAIntrinsic::MFMA_I32_32x32x8_I8:
   case MMAIntrinsic::MFMA_F32_32x32x8_F16:
+  case MMAIntrinsic::MFMA_F32_32x32x8_BF16:
     switch (fragment) {
     case MMAFragment::Lhs:
       return {/*outer=*/{1, 1}, /*thread=*/{32, 2}, /*tstrides=*/{1, 32},
@@ -704,8 +758,10 @@ FailureOr<Value> MMAAttr::buildMmaOperation(OpBuilder &builder, Location loc,
   }
   case MMAIntrinsic::MFMA_I32_16x16x16_I8:
   case MMAIntrinsic::MFMA_F32_16x16x16_F16:
+  case MMAIntrinsic::MFMA_F32_16x16x16_BF16:
   case MMAIntrinsic::MFMA_I32_32x32x8_I8:
   case MMAIntrinsic::MFMA_F32_32x32x8_F16:
+  case MMAIntrinsic::MFMA_F32_32x32x8_BF16:
   case MMAIntrinsic::MFMA_F32_16x16x32_F8E4M3FNUZ:
   case MMAIntrinsic::MFMA_F32_16x16x32_F8E5M2FNUZ:
   case MMAIntrinsic::MFMA_I32_16x16x32_I8:
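The vector lengths in the getABCVectorTypes hunk follow from dividing each fragment's tile by the 64-lane subgroup. A minimal standalone sketch of that arithmetic (illustrative only; kSubgroupSize and perLane are names I made up, not IREE's API):

    // Per-lane vector lengths for the new BF16 intrinsics on a 64-lane
    // CDNA3 wavefront: tile elements divided evenly across the lanes.
    #include <cassert>

    constexpr int kSubgroupSize = 64; // CDNA3 wavefront width

    constexpr int perLane(int rows, int cols) {
      return rows * cols / kSubgroupSize; // elements each lane holds
    }

    int main() {
      // MFMA_F32_16x16x16_BF16: A and B are 16x16 bf16 tiles, C is 16x16 f32.
      assert(perLane(16, 16) == 4);  // matches vector<4xbf16> / vector<4xf32>
      // MFMA_F32_32x32x8_BF16: A is 32x8, B is 8x32, C is 32x32.
      assert(perLane(32, 8) == 4);   // matches vector<4xbf16>
      assert(perLane(32, 32) == 16); // matches vector<16xf32>
      return 0;
    }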

compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td

Lines changed: 4 additions & 0 deletions
@@ -121,6 +121,8 @@ class IREEGPU_I32MmaEnumAttr<string name, string summary, list<I32EnumAttrCase>
 def MFMA_F32_16x16x4_F32 : I32EnumAttrCase<"MFMA_F32_16x16x4_F32", 0x0900>;
 def MFMA_F32_16x16x16_F16 : I32EnumAttrCase<"MFMA_F32_16x16x16_F16", 0x0910>;
 def MFMA_F32_32x32x8_F16 : I32EnumAttrCase<"MFMA_F32_32x32x8_F16", 0x0911>;
+def MFMA_F32_16x16x16_BF16 : I32EnumAttrCase<"MFMA_F32_16x16x16_BF16", 0x0920>;
+def MFMA_F32_32x32x8_BF16 : I32EnumAttrCase<"MFMA_F32_32x32x8_BF16", 0x0921>;
 def MFMA_F32_16x16x32_F8E5M2FNUZ : I32EnumAttrCase<"MFMA_F32_16x16x32_F8E5M2FNUZ", 0x0930>;
 def MFMA_F32_16x16x32_F8E4M3FNUZ : I32EnumAttrCase<"MFMA_F32_16x16x32_F8E4M3FNUZ", 0x0940>;
 def MFMA_I32_16x16x32_I8 : I32EnumAttrCase<"MFMA_I32_16x16x32_I8", 0x0980>;
@@ -143,6 +145,8 @@ def IREEGPU_MMAIntrinsic : IREEGPU_I32MmaEnumAttr<"MMAIntrinsic",
   MFMA_F32_16x16x4_F32,
   MFMA_F32_16x16x16_F16,
   MFMA_F32_32x32x8_F16,
+  MFMA_F32_16x16x16_BF16,
+  MFMA_F32_32x32x8_BF16,
   MFMA_F32_16x16x32_F8E4M3FNUZ,
   MFMA_F32_16x16x32_F8E5M2FNUZ,
   MFMA_I32_16x16x32_I8,
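The chosen IDs 0x0920 and 0x0921 continue the apparent numbering pattern of the surrounding cases (0x091x for F16, 0x093x/0x094x for F8, 0x098x for I8). A hedged reading of that pattern, inferred purely from the constants in this diff rather than any documented contract:

    // Inferred, not documented: bits [7:4] of the intrinsic ID appear to
    // select the input element-type family (0x1 = F16, 0x2 = BF16, ...),
    // while bits [3:0] enumerate shapes within a family.
    static_assert(((0x0920 >> 4) & 0xF) == 0x2, "MFMA_F32_16x16x16_BF16");
    static_assert(((0x0921 >> 4) & 0xF) == 0x2, "MFMA_F32_32x32x8_BF16");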

compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp

Lines changed: 2 additions & 0 deletions
@@ -136,6 +136,8 @@ const WgpDetails *getCDNA3WgpDetails() {
       MMAIntrinsic::MFMA_F32_16x16x4_F32,
       MMAIntrinsic::MFMA_F32_16x16x16_F16,
       MMAIntrinsic::MFMA_F32_32x32x8_F16,
+      MMAIntrinsic::MFMA_F32_16x16x16_BF16,
+      MMAIntrinsic::MFMA_F32_32x32x8_BF16,
       MMAIntrinsic::MFMA_F32_16x16x32_F8E4M3FNUZ,
       MMAIntrinsic::MFMA_F32_16x16x32_F8E5M2FNUZ,
       MMAIntrinsic::MFMA_I32_16x16x32_I8,

tests/e2e/matmul/CMakeLists.txt

Lines changed: 29 additions & 0 deletions
@@ -1570,6 +1570,35 @@
     "requires-gpu-cdna3"
 )

+iree_generated_e2e_runner_test(
+  NAME
+    e2e_matmul_rocm_bf16_cdna3_mfma_data_tiled
+  TEST_TYPE
+    matmul
+  GENERATOR
+    "generate_e2e_matmul_tests.py"
+  GENERATOR_ARGS
+    "--lhs_rhs_type=bf16"
+    "--acc_type=f32"
+  TEST_RUNNER
+    iree_tools_testing_e2e_iree-e2e-matmul-test
+  TARGET_BACKENDS
+    "rocm"
+  DRIVERS
+    "hip"
+  COMPILER_FLAGS
+    ${IREE_HIP_TEST_COMPILER_FLAGS}
+    "--iree-opt-data-tiling"
+    "--iree-global-opt-experimental-rocm-data-tiling"
+    "--iree-global-opt-enable-early-materialization=true"
+  LABELS
+    "noasan"
+    "nomsan"
+    "notsan"
+    "noubsan"
+    "requires-gpu-cdna3"
+)
+
 iree_generated_e2e_runner_test(
   NAME
     e2e_matmul_rocm_i8_cdna3_mfma_data_tiled
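A usage note (standard CTest behavior, not something this commit adds): once configured, the new suite can be selected on a CDNA3-capable machine by name with "ctest -R e2e_matmul_rocm_bf16_cdna3_mfma_data_tiled" or by label with "ctest -L requires-gpu-cdna3", while the sanitizer labels ("noasan", "nomsan", "notsan", "noubsan") let sanitizer CI jobs exclude it via "ctest -LE".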
