
Commit be977e2

Fix issues (but not complete)
1 parent ba5d55a commit be977e2

4 files changed: +26 -13 lines changed

compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp

Lines changed: 16 additions & 3 deletions

@@ -24,6 +24,7 @@
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/DebugLog.h"
 #include "llvm/Support/InterleavedRange.h"
+#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
 #include "mlir/Dialect/Linalg/Utils/Utils.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
 #include "mlir/IR/Attributes.h"
@@ -926,10 +927,22 @@ getMatmulOrIGEMMLoweringConfigAndWorkgroupSize(
       {"subgroup", b.getI64ArrayAttr(subgroupTileSizes)},
       {"mma_kind", kind}};

+  // Check if target supports global load DMA (gfx950+).
+  bool supportsGlobalLoadDMA = false;
+  StringRef targetArch = target.getArch();
+  if (auto maybeChipset = amdgpu::Chipset::parse(targetArch);
+      succeeded(maybeChipset)) {
+    constexpr amdgpu::Chipset kGfx950{9, 5, 0};
+    supportsGlobalLoadDMA = (*maybeChipset >= kGfx950);
+  }
+
   // Use global load DMA attribute (subgroup sizes will be derived from
-  // translation_info).
-  Attribute useGlobalDma = IREE::GPU::UseGlobalLoadDMAAttr::get(context);
-  SmallVector<Attribute> promotionArray = {useGlobalDma, useGlobalDma};
+  // translation_info) only on gfx950+.
+  SmallVector<Attribute> promotionArray;
+  if (supportsGlobalLoadDMA) {
+    Attribute useGlobalDma = IREE::GPU::UseGlobalLoadDMAAttr::get(context);
+    promotionArray = {useGlobalDma, useGlobalDma};
+  }
   SmallVector<int64_t> promotionList = {0, 1};
   if (scaled) {
     // TODO(#22119): We don't use global load DMA for scaled matmuls, because
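The core of this change is the chipset gate: the use_global_load_dma promotion attribute is now only emitted when the target architecture parses to gfx950 or newer. Below is a minimal standalone sketch of that check, assuming MLIR's amdgpu::Chipset utility from the newly included header; the helper name, the using-directive, and the error handling are illustrative and not part of the patch.

#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
#include "llvm/ADT/StringRef.h"

using namespace mlir;

// Hypothetical helper mirroring the gating added in ConfigUtils.cpp: returns
// true only for architectures at or above gfx950, where this commit keeps
// global load DMA promotion enabled.
static bool archSupportsGlobalLoadDMA(llvm::StringRef arch) {
  // Chipset::parse accepts strings like "gfx950" and fails on unknown names.
  auto maybeChipset = amdgpu::Chipset::parse(arch);
  if (failed(maybeChipset))
    return false; // Unrecognized targets conservatively skip DMA promotion.
  constexpr amdgpu::Chipset kGfx950{9, 5, 0};
  return *maybeChipset >= kGfx950;
}

// Example: archSupportsGlobalLoadDMA("gfx942") is false,
//          archSupportsGlobalLoadDMA("gfx950") is true,
// which is why the test updates below drop promotion_types on pre-gfx950
// targets and move the DMA pipeline tests to gfx950.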

compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_custom_op.mlir

Lines changed: 1 addition & 1 deletion

@@ -40,7 +40,7 @@ func.func @custom_op(%arg0 : tensor<384x512xf32>, %arg1 : tensor<512x128xf32>,
 // CHECK-SAME: lowering_config = #[[$CONFIG]]
 // CHECK: ^bb
 // CHECK: linalg.matmul
-// CHECK-SAME: lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>, promote_operands = [0, 1], promotion_types = [#iree_gpu.use_global_load_dma, #iree_gpu.use_global_load_dma], reduction = [0, 0, 16], subgroup = [2, 4, 0], workgroup = [64, 128, 0]
+// CHECK-SAME: lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>, promote_operands = [0, 1], reduction = [0, 0, 16], subgroup = [2, 4, 0], workgroup = [64, 128, 0]
 // CHECK: iree_linalg_ext.yield

 // -----

compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matmul.mlir

Lines changed: 4 additions & 4 deletions

@@ -20,7 +20,7 @@ func.func @matmul_32_32_32(%arg0: !TA, %arg1: !TB, %arg2: !TC, %arg3: !DTC) {
 // GENERALIZED: linalg.generic
 // SPECIALIZED: linalg.matmul
 // CHECK: {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>,
-// CHECK-SAME: promote_operands = [0, 1], promotion_types = [#iree_gpu.use_global_load_dma, #iree_gpu.use_global_load_dma], reduction = [0, 0, 8], subgroup = [1, 1, 0],
+// CHECK-SAME: promote_operands = [0, 1], reduction = [0, 0, 8], subgroup = [1, 1, 0],
 // CHECK-SAME: workgroup = [32, 32, 0]}>}
   %0 = linalg.matmul ins(%arg0, %arg1 : !TA, !TB) outs(%arg2 : !TC) -> !TC
   iree_tensor_ext.dispatch.tensor.store %0, %arg3, offsets = [0, 0], sizes = [32, 32], strides = [1, 1] : !TC -> !DTC
@@ -37,7 +37,7 @@ func.func @matmul_32_32_32(%arg0: !TA, %arg1: !TB, %arg2: !TC, %arg3: !DTC) {
 // CHECK-SAME: workgroup_size = [256, 1, 1] subgroup_size = 64, {
 func.func @matmul_4096_4096_4096(%arg0: !TA, %arg1: !TB, %arg2: !TC, %arg3: !DTC) {
 // CHECK: {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>,
-// CHECK-SAME: promote_operands = [0, 1], promotion_types = [#iree_gpu.use_global_load_dma, #iree_gpu.use_global_load_dma], reduction = [0, 0, 4], subgroup = [4, 4, 0], workgroup = [128, 128, 0]}>
+// CHECK-SAME: promote_operands = [0, 1], reduction = [0, 0, 4], subgroup = [4, 4, 0], workgroup = [128, 128, 0]}>
   %0 = linalg.matmul ins(%arg0, %arg1 : !TA, !TB) outs(%arg2 : !TC) -> !TC
   iree_tensor_ext.dispatch.tensor.store %0, %arg3, offsets = [0, 0], sizes = [4096, 4096], strides = [1, 1] : !TC -> !DTC
   return
@@ -53,7 +53,7 @@ func.func @matmul_4096_4096_4096(%arg0: !TA, %arg1: !TB, %arg2: !TC, %arg3: !DTC
 // CHECK-SAME: workgroup_size = [256, 1, 1] subgroup_size = 64, {
 func.func @matmul_4096_32_4096(%arg0: !TA, %arg1: !TB, %arg2: !TC, %arg3: !DTC) {
 // CHECK: #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>,
-// CHECK-SAME: promote_operands = [0, 1], promotion_types = [#iree_gpu.use_global_load_dma, #iree_gpu.use_global_load_dma], reduction = [0, 0, 8], subgroup = [2, 4, 0],
+// CHECK-SAME: promote_operands = [0, 1], reduction = [0, 0, 8], subgroup = [2, 4, 0],
 // CHECK-SAME: workgroup = [64, 128, 0]}>}
   %0 = linalg.matmul ins(%arg0, %arg1 : !TA, !TB) outs(%arg2 : !TC) -> !TC
   iree_tensor_ext.dispatch.tensor.store %0, %arg3, offsets = [0, 0], sizes = [4096, 4096], strides = [1, 1] : !TC -> !DTC
@@ -71,7 +71,7 @@ func.func @matmul_4096_32_4096(%arg0: !TA, %arg1: !TB, %arg2: !TC, %arg3: !DTC)
 // CHECK-SAME: {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_num_stages = 2, no_reduce_shared_memory_bank_conflicts = true, use_igemm_convolution = false>}>
 func.func @matmul_4096_1_4096(%arg0: !TA, %arg1: !TB, %arg2: !TC, %arg3: !DTC) {
 // CHECK: #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>,
-// CHECK-SAME: padding = [32, 32, 4], promote_operands = [0, 1], promotion_types = [#iree_gpu.use_global_load_dma, #iree_gpu.use_global_load_dma], reduction = [0, 0, 1], subgroup = [1, 2, 0], workgroup = [32, 32, 0]}
+// CHECK-SAME: padding = [32, 32, 4], promote_operands = [0, 1], reduction = [0, 0, 1], subgroup = [1, 2, 0], workgroup = [32, 32, 0]}
   %0 = linalg.matmul ins(%arg0, %arg1 : !TA, !TB) outs(%arg2 : !TC) -> !TC
   iree_tensor_ext.dispatch.tensor.store %0, %arg3, offsets = [0, 0], sizes = [4096, 4096], strides = [1, 1] : !TC -> !DTC
   return

compiler/src/iree/compiler/Codegen/LLVMGPU/test/pipeline_coalesced_dma.mlir

Lines changed: 5 additions & 5 deletions

@@ -7,7 +7,7 @@

 #executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm",
 "rocm-hsaco-fb", {iree_codegen.target_info = #iree_gpu.target<
-arch = "gfx942", features = "", wgp = <
+arch = "gfx950", features = "", wgp = <
 compute = fp64|fp32|fp16|int64|int32|int16|int8,
 storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic,
 dot = dp4xi8toi32, mma = [], subgroup_size_choices = [64, 64],
@@ -73,7 +73,7 @@ hal.executable public @coalesced_dma_to_lds {

 #executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm",
 "rocm-hsaco-fb", {iree_codegen.target_info = #iree_gpu.target<
-arch = "gfx942", features = "", wgp = <
+arch = "gfx950", features = "", wgp = <
 compute = fp64|fp32|fp16|int64|int32|int16|int8,
 storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic,
 dot = dp4xi8toi32, mma = [], subgroup_size_choices = [64, 64],
@@ -138,7 +138,7 @@ hal.executable public @coalesced_dma_matmul_operand {

 #executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm",
 "rocm-hsaco-fb", {iree_codegen.target_info = #iree_gpu.target<
-arch = "gfx942", features = "", wgp = <
+arch = "gfx950", features = "", wgp = <
 compute = fp64|fp32|fp16|int64|int32|int16|int8,
 storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic,
 dot = dp4xi8toi32, mma = [], subgroup_size_choices = [64, 64],
@@ -205,7 +205,7 @@ hal.executable public @coalesced_dma_f16 {

 #executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm",
 "rocm-hsaco-fb", {iree_codegen.target_info = #iree_gpu.target<
-arch = "gfx942", features = "", wgp = <
+arch = "gfx950", features = "", wgp = <
 compute = fp64|fp32|fp16|int64|int32|int16|int8,
 storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic,
 dot = dp4xi8toi32, mma = [], subgroup_size_choices = [64, 64],
@@ -289,7 +289,7 @@ hal.executable public @coalesced_dma_multi_transfer {

 #executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm",
 "rocm-hsaco-fb", {iree_codegen.target_info = #iree_gpu.target<
-arch = "gfx942", features = "", wgp = <
+arch = "gfx950", features = "", wgp = <
 compute = fp64|fp32|fp16|int64|int32|int16|int8,
 storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic,
 dot = dp4xi8toi32, mma = [], subgroup_size_choices = [64, 64],
