Skip to content

Commit 1885d10

Browse files
committed
Gate the `no_reduce_shared_memory_bank_conflicts` pipeline option on gfx950+ global-load-DMA support (via `amdgpu::Chipset::parse`), add the MLIRAMDGPUUtils build dependency, fix `cPromoteIfPadding` parameter casing, and update affected lit tests.
1 parent be977e2 commit 1885d10

File tree

8 files changed

+65
-32
lines changed

8 files changed

+65
-32
lines changed

compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ iree_compiler_cc_library(
3333
"//compiler/src/iree/compiler/Dialect/LinalgExt/Utils",
3434
"//compiler/src/iree/compiler/Dialect/TensorExt/IR",
3535
"@llvm-project//llvm:Support",
36+
"@llvm-project//mlir:AMDGPUUtils",
3637
"@llvm-project//mlir:Analysis",
3738
"@llvm-project//mlir:DialectUtils",
3839
"@llvm-project//mlir:FunctionInterfaces",

compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ iree_cc_library(
2020
"ReductionConfigUtils.cpp"
2121
DEPS
2222
LLVMSupport
23+
MLIRAMDGPUUtils
2324
MLIRAnalysis
2425
MLIRFunctionInterfaces
2526
MLIRIR

compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -835,7 +835,7 @@ getMatmulOrIGEMMLoweringConfigAndWorkgroupSize(
835835
// - Padding requires C promotion, OR
836836
// - The operation has an existing accumulator (matmul_accumulate)
837837
bool doCPromotion =
838-
(couldNeedPadding && cPromoteIfPadding) || hasExistingAccumulator;
838+
(couldNeedPadding && CPromoteIfPadding) || hasExistingAccumulator;
839839

840840
bool mustBeAligned = true;
841841
Location loc = operands[0].getLoc();
@@ -848,7 +848,7 @@ getMatmulOrIGEMMLoweringConfigAndWorkgroupSize(
848848
mustBeAligned = false;
849849
// For unaligned schedules, C promotion is needed for padding OR existing
850850
// accumulator.
851-
bool doCPromotionUnaligned = cPromoteIfPadding || hasExistingAccumulator;
851+
bool doCPromotionUnaligned = CPromoteIfPadding || hasExistingAccumulator;
852852
schedule = getMmaScheduleFromProblemAndTarget(
853853
target, problem, loc, transposedLhs, transposedRhs, isGemm,
854854
mustBeAligned, doCPromotionUnaligned, scaled, splitReductionTripCnt);
@@ -1090,10 +1090,20 @@ setIGEMMConvolutionLoweringConfig(IREE::GPU::TargetAttr target,
10901090
std::array<int64_t, 3> workgroupSize = {configAndWgSize->second, 1, 1};
10911091
LoweringConfigAttr loweringConfig = configAndWgSize->first;
10921092

1093+
// Check if target supports global load DMA (gfx950+). Only disable bank
1094+
// conflict reduction for targets that will use direct load DMA.
1095+
bool supportsGlobalLoadDMA = false;
1096+
StringRef targetArch = target.getArch();
1097+
if (auto maybeChipset = amdgpu::Chipset::parse(targetArch);
1098+
succeeded(maybeChipset)) {
1099+
constexpr amdgpu::Chipset kGfx950{9, 5, 0};
1100+
supportsGlobalLoadDMA = (*maybeChipset >= kGfx950);
1101+
}
1102+
10931103
SmallVector<NamedAttribute, 1> pipelineAttrs;
10941104
auto pipelineOptions = IREE::GPU::GPUPipelineOptionsAttr::get(
10951105
linalgOp->getContext(), /*prefetchNumStages=*/2,
1096-
/*no_reduce_shared_memory_bank_conflicts=*/true,
1106+
/*no_reduce_shared_memory_bank_conflicts=*/supportsGlobalLoadDMA,
10971107
/*use_igemm_convolution=*/true,
10981108
/*reorder_workgroups_strategy=*/std::nullopt);
10991109
pipelineAttrs.emplace_back(
@@ -1140,15 +1150,15 @@ LogicalResult setMatmulLoweringConfig(IREE::GPU::TargetAttr target,
11401150
FailureOr<std::pair<LoweringConfigAttr, int64_t>> configAndWgSize =
11411151
getMatmulOrIGEMMLoweringConfigAndWorkgroupSize(
11421152
bounds, maps, operands, target, /*isGemm=*/true,
1143-
/*scaled=*/false, splitReductionTripCnt, CPromoteIfPadding,
1153+
/*scaled=*/false, splitReductionTripCnt, cPromoteIfPadding,
11441154
hasExistingAccumulator);
11451155

11461156
// TODO (muzasyed) : add generalization for scaled and nonscaled versions of
11471157
// matmul lowering.
11481158
if (failed(configAndWgSize)) {
11491159
configAndWgSize = getMatmulOrIGEMMLoweringConfigAndWorkgroupSize(
11501160
bounds, maps, operands, target, /*isGemm=*/true,
1151-
/*scaled=*/true, splitReductionTripCnt, CPromoteIfPadding,
1161+
/*scaled=*/true, splitReductionTripCnt, cPromoteIfPadding,
11521162
hasExistingAccumulator);
11531163
}
11541164

@@ -1158,10 +1168,20 @@ LogicalResult setMatmulLoweringConfig(IREE::GPU::TargetAttr target,
11581168
std::array<int64_t, 3> workgroupSize = {configAndWgSize->second, 1, 1};
11591169
LoweringConfigAttr loweringConfig = configAndWgSize->first;
11601170

1171+
// Check if target supports global load DMA (gfx950+). Only disable bank
1172+
// conflict reduction for targets that will use direct load DMA.
1173+
bool supportsGlobalLoadDMA = false;
1174+
StringRef targetArch = target.getArch();
1175+
if (auto maybeChipset = amdgpu::Chipset::parse(targetArch);
1176+
succeeded(maybeChipset)) {
1177+
constexpr amdgpu::Chipset kGfx950{9, 5, 0};
1178+
supportsGlobalLoadDMA = (*maybeChipset >= kGfx950);
1179+
}
1180+
11611181
SmallVector<NamedAttribute, 1> pipelineAttrs;
11621182
auto pipelineOptions = IREE::GPU::GPUPipelineOptionsAttr::get(
11631183
linalgOp->getContext(), /*prefetchNumStages=*/2,
1164-
/*no_reduce_shared_memory_bank_conflicts=*/true,
1184+
/*no_reduce_shared_memory_bank_conflicts=*/supportsGlobalLoadDMA,
11651185
/*use_igemm_convolution=*/false,
11661186
/*reorder_workgroups_strategy=*/std::nullopt);
11671187
pipelineAttrs.emplace_back(

compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
#include "llvm/Support/InterleavedRange.h"
3636
#include "llvm/Support/LogicalResult.h"
3737
#include "mlir/Analysis/SliceAnalysis.h"
38+
#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
3839
#include "mlir/Dialect/Linalg/IR/Linalg.h"
3940
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
4041
#include "mlir/IR/Attributes.h"
@@ -1446,12 +1447,12 @@ static LogicalResult setContractConfig(IREE::GPU::TargetAttr target,
14461447
return failure();
14471448
}
14481449

1449-
auto setMatmulConfig = [&entryPoint, &op](int64_t tileX, int64_t tileY,
1450-
int64_t tileK,
1451-
ArrayRef<int64_t> workgroupSize,
1452-
ArrayRef<int32_t> subgroupSizes,
1453-
unsigned softwarePipelineDepth,
1454-
CodeGenPipeline pipeline) {
1450+
auto setMatmulConfig = [&entryPoint, &op,
1451+
&target](int64_t tileX, int64_t tileY, int64_t tileK,
1452+
ArrayRef<int64_t> workgroupSize,
1453+
ArrayRef<int32_t> subgroupSizes,
1454+
unsigned softwarePipelineDepth,
1455+
CodeGenPipeline pipeline) {
14551456
TileSizesListType tileSizes;
14561457
unsigned numParallelLoops = op.getNumParallelLoops();
14571458
unsigned numReductionLoops = op.getNumReductionLoops();
@@ -1506,10 +1507,20 @@ static LogicalResult setContractConfig(IREE::GPU::TargetAttr target,
15061507
auto configDict = b.getDictionaryAttr(attrs);
15071508
auto loweringConfig =
15081509
IREE::GPU::LoweringConfigAttr::get(context, configDict);
1510+
// Check if target supports global load DMA (gfx950+). Only disable bank
1511+
// conflict reduction for targets that will use direct load DMA.
1512+
bool supportsGlobalLoadDMA = false;
1513+
StringRef targetArch = target.getArch();
1514+
if (auto maybeChipset = amdgpu::Chipset::parse(targetArch);
1515+
succeeded(maybeChipset)) {
1516+
constexpr amdgpu::Chipset kGfx950{9, 5, 0};
1517+
supportsGlobalLoadDMA = (*maybeChipset >= kGfx950);
1518+
}
1519+
15091520
SmallVector<NamedAttribute, 1> pipelineAttrs;
15101521
auto pipelineOptions = IREE::GPU::GPUPipelineOptionsAttr::get(
15111522
context, /*prefetch_num_stages=*/0,
1512-
/*no_reduce_shared_memory_bank_conflicts=*/true,
1523+
/*no_reduce_shared_memory_bank_conflicts=*/supportsGlobalLoadDMA,
15131524
/*use_igemm_convolution=*/false,
15141525
/*reorder_workgroups_strategy=*/std::nullopt);
15151526
pipelineAttrs.emplace_back(

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_igemm_tile_and_fuse.mlir

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ func.func @nhwc_conv_mfma(%3: tensor<2x34x34x128xf32>, %4: tensor<3x3x128x64xf32
1717

1818
// CHECK-LABEL: func.func @nhwc_conv_mfma
1919
// CHECK-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [512, 1, 1] subgroup_size = 64
20-
// CHECK-SAME: #iree_gpu.pipeline_options<prefetch_num_stages = 2, no_reduce_shared_memory_bank_conflicts = true
20+
// CHECK-SAME: #iree_gpu.pipeline_options<prefetch_num_stages = 2, no_reduce_shared_memory_bank_conflicts = false
2121
// CHECK-SAME: use_igemm_convolution = true
2222

2323
// CHECK: linalg.conv_2d_nhwc_hwcf {{.*}}lowering_config = #iree_gpu.lowering_config
@@ -44,7 +44,7 @@ func.func @nchw_conv_mfma(%3: tensor<2x128x34x34xf32>, %4: tensor<64x128x3x3xf32
4444

4545
// CHECK-LABEL: func.func @nchw_conv_mfma
4646
// CHECK-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [512, 1, 1] subgroup_size = 64
47-
// CHECK-SAME: #iree_gpu.pipeline_options<prefetch_num_stages = 2, no_reduce_shared_memory_bank_conflicts = true
47+
// CHECK-SAME: #iree_gpu.pipeline_options<prefetch_num_stages = 2, no_reduce_shared_memory_bank_conflicts = false
4848
// CHECK-SAME: use_igemm_convolution = true
4949

5050
// CHECK: linalg.conv_2d_nchw_fchw {{.*}}lowering_config = #iree_gpu.lowering_config
@@ -71,7 +71,7 @@ func.func @nhwc_conv_unaligned_mfma(%3: tensor<2x33x33x128xf32>, %4: tensor<3x3x
7171

7272
// CHECK-LABEL: func.func @nhwc_conv_unaligned_mfma
7373
// CHECK-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [512, 1, 1] subgroup_size = 64
74-
// CHECK-SAME: #iree_gpu.pipeline_options<prefetch_num_stages = 2, no_reduce_shared_memory_bank_conflicts = true
74+
// CHECK-SAME: #iree_gpu.pipeline_options<prefetch_num_stages = 2, no_reduce_shared_memory_bank_conflicts = false
7575
// CHECK-SAME: use_igemm_convolution = true
7676

7777
// CHECK: linalg.conv_2d_nhwc_hwcf {{.*}}lowering_config = #iree_gpu.lowering_config
@@ -103,7 +103,7 @@ func.func @nchw_conv_unaligned_mfma(%3: tensor<2x128x34x34xf32>, %4: tensor<63x1
103103

104104
// CHECK-LABEL: func.func @nchw_conv_unaligned_mfma
105105
// CHECK-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [512, 1, 1] subgroup_size = 64
106-
// CHECK-SAME: #iree_gpu.pipeline_options<prefetch_num_stages = 2, no_reduce_shared_memory_bank_conflicts = true
106+
// CHECK-SAME: #iree_gpu.pipeline_options<prefetch_num_stages = 2, no_reduce_shared_memory_bank_conflicts = false
107107
// CHECK-SAME: use_igemm_convolution = true
108108

109109
// CHECK: linalg.conv_2d_nchw_fchw {{.*}}lowering_config = #iree_gpu.lowering_config
@@ -142,7 +142,7 @@ func.func @conv_nhwc_fhwc_unaligned_channel(%arg0: tensor<16x26x19x287xf16>, %ar
142142

143143
// CHECK-LABEL: func.func @conv_nhwc_fhwc_unaligned_channel
144144
// CHECK-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [512, 1, 1] subgroup_size = 64
145-
// CHECK-SAME: #iree_gpu.pipeline_options<prefetch_num_stages = 2, no_reduce_shared_memory_bank_conflicts = true
145+
// CHECK-SAME: #iree_gpu.pipeline_options<prefetch_num_stages = 2, no_reduce_shared_memory_bank_conflicts = false
146146
// CHECK-SAME: use_igemm_convolution = true
147147

148148
// CHECK: linalg.generic {{.*}}lowering_config = #iree_gpu.lowering_config
@@ -181,7 +181,7 @@ func.func @conv_chwn_chwf_unaligned_batch(%arg0: tensor<16x193x129x40xbf16>, %ar
181181

182182
// CHECK-LABEL: func.func @conv_chwn_chwf_unaligned_batch
183183
// CHECK-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64
184-
// CHECK-SAME: #iree_gpu.pipeline_options<prefetch_num_stages = 2, no_reduce_shared_memory_bank_conflicts = true
184+
// CHECK-SAME: #iree_gpu.pipeline_options<prefetch_num_stages = 2, no_reduce_shared_memory_bank_conflicts = false
185185
// CHECK-SAME: use_igemm_convolution = true
186186

187187
// CHECK: linalg.generic {{.*}}lowering_config = #iree_gpu.lowering_config
@@ -213,7 +213,7 @@ func.func @group_conv_hwgc_gfhwc_unaligned(%arg0: tensor<61x93x16x55xbf16>, %arg
213213

214214
// CHECK-LABEL: func.func @group_conv_hwgc_gfhwc_unaligned
215215
// CHECK-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [512, 1, 1] subgroup_size = 64
216-
// CHECK-SAME: #iree_gpu.pipeline_options<prefetch_num_stages = 2, no_reduce_shared_memory_bank_conflicts = true
216+
// CHECK-SAME: #iree_gpu.pipeline_options<prefetch_num_stages = 2, no_reduce_shared_memory_bank_conflicts = false
217217
// CHECK-SAME: use_igemm_convolution = true
218218

219219
// CHECK: linalg.generic {{.*}}lowering_config = #iree_gpu.lowering_config
@@ -253,7 +253,7 @@ module {
253253

254254
// CHECK-LABEL: func.func @conv_nhwc_filter_5x1_unaligned
255255
// CHECK-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [512, 1, 1] subgroup_size = 64
256-
// CHECK-SAME: #iree_gpu.pipeline_options<prefetch_num_stages = 2, no_reduce_shared_memory_bank_conflicts = true
256+
// CHECK-SAME: #iree_gpu.pipeline_options<prefetch_num_stages = 2, no_reduce_shared_memory_bank_conflicts = false
257257
// CHECK-SAME: use_igemm_convolution = true
258258

259259
// CHECK: linalg.generic {{.*}}lowering_config = #iree_gpu.lowering_config

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ func.func @expanded_matmul_transpose_b(%lhs: tensor<2x64x2048xf16>, %rhs: tensor
3030

3131
// CHECK-LABEL: func.func @expanded_matmul_transpose_b(
3232
// CHECK-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64
33-
// CHECK-SAME: #iree_gpu.pipeline_options<prefetch_num_stages = 2, no_reduce_shared_memory_bank_conflicts = true, use_igemm_convolution = false>
33+
// CHECK-SAME: #iree_gpu.pipeline_options<prefetch_num_stages = 2, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>
3434

3535
// Verify that the fill does not have the lowering config propagated to it.
3636
// CHECK: linalg.fill ins
@@ -67,7 +67,7 @@ func.func @multi_dim_mma_schedule(%lhs: tensor<10x32x128x16xf16>, %rhs: tensor<4
6767

6868
// CHECK-LABEL: func.func @multi_dim_mma_schedule(
6969
// CHECK-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64
70-
// CHECK-SAME: #iree_gpu.pipeline_options<prefetch_num_stages = 2, no_reduce_shared_memory_bank_conflicts = true, use_igemm_convolution = false>
70+
// CHECK-SAME: #iree_gpu.pipeline_options<prefetch_num_stages = 2, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>
7171

7272
// CHECK: linalg.generic {{.*}}lowering_config = #iree_gpu.lowering_config
7373
// CHECK-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>
@@ -104,7 +104,7 @@ func.func @dynamic_multi_dim_mma_schedule(%lhs: tensor<?x6x16x?x16xf16>, %rhs: t
104104

105105
// CHECK-LABEL: func.func @dynamic_multi_dim_mma_schedule(
106106
// CHECK-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64
107-
// CHECK-SAME: #iree_gpu.pipeline_options<prefetch_num_stages = 2, no_reduce_shared_memory_bank_conflicts = true, use_igemm_convolution = false>
107+
// CHECK-SAME: #iree_gpu.pipeline_options<prefetch_num_stages = 2, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>
108108

109109
// CHECK: linalg.generic {{.*}}lowering_config = #iree_gpu.lowering_config
110110
// CHECK-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>
@@ -125,7 +125,7 @@ func.func @mfma_matmul_1024x1024x1024(%lhs: tensor<1024x1024xf16>, %rhs: tensor<
125125

126126
// CHECK-LABEL: func.func @mfma_matmul_1024x1024x1024(
127127
// CHECK-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64
128-
// CHECK-SAME: #iree_gpu.pipeline_options<prefetch_num_stages = 2, no_reduce_shared_memory_bank_conflicts = true, use_igemm_convolution = false>
128+
// CHECK-SAME: #iree_gpu.pipeline_options<prefetch_num_stages = 2, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>
129129

130130
// Verify that the fill does not have the lowering config propagated to it.
131131
// CHECK: linalg.fill ins
@@ -322,7 +322,7 @@ func.func @unaligned_to_intrinsic_batched_matmul(%lhs : tensor<12x8x577xf32>, %r
322322

323323
// CHECK-LABEL: func.func @unaligned_to_intrinsic_batched_matmul(
324324
// CHECK-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [128, 1, 1] subgroup_size = 64
325-
// CHECK-SAME: {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_num_stages = 2, no_reduce_shared_memory_bank_conflicts = true, use_igemm_convolution = false>}
325+
// CHECK-SAME: {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_num_stages = 2, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}
326326
// CHECK: linalg.batch_matmul {{.*}}lowering_config = #iree_gpu.lowering_config
327327
// CHECK-SAME: reduction = [0, 0, 0, 1]
328328
// CHECK-SAME: subgroup = [0, 1, 2, 0]
@@ -419,7 +419,7 @@ func.func @unaligned_to_intrinsic_batched_matmul_tiling_check(%lhs : tensor<12x5
419419

420420
// CHECK-LABEL: func.func @unaligned_to_intrinsic_batched_matmul_tiling_check(
421421
// CHECK-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64
422-
// CHECK-SAME: {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_num_stages = 2, no_reduce_shared_memory_bank_conflicts = true, use_igemm_convolution = false>}
422+
// CHECK-SAME: {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_num_stages = 2, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}
423423
// CHECK: linalg.batch_matmul {{.*}}lowering_config = #iree_gpu.lowering_config
424424
// CHECK-SAME: padding = [1, 64, 128, 4]
425425
// CHECK-SAME: promote_operands = [0, 1]
@@ -442,7 +442,7 @@ func.func @unaligned_matmul_nn_layout(%lhs : tensor<513x513xf16>, %rhs : tensor<
442442

443443
// CHECK-LABEL: func.func @unaligned_matmul_nn_layout(
444444
// CHECK-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64
445-
// CHECK-SAME: {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_num_stages = 2, no_reduce_shared_memory_bank_conflicts = true, use_igemm_convolution = false>}
445+
// CHECK-SAME: {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_num_stages = 2, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}
446446
// CHECK: linalg.matmul {{.*}}lowering_config = #iree_gpu.lowering_config
447447
// CHECK-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>
448448
// CHECK-SAME: padding = [64, 128, 16]

0 commit comments

Comments (0)