
Commit be977e2

Fix issues (but not complete)
1 parent ba5d55a commit be977e2

4 files changed: +26 -13 lines changed

compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp

Lines changed: 16 additions & 3 deletions

@@ -24,6 +24,7 @@
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/DebugLog.h"
 #include "llvm/Support/InterleavedRange.h"
+#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
 #include "mlir/Dialect/Linalg/Utils/Utils.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
 #include "mlir/IR/Attributes.h"
@@ -926,10 +927,22 @@ getMatmulOrIGEMMLoweringConfigAndWorkgroupSize(
       {"subgroup", b.getI64ArrayAttr(subgroupTileSizes)},
       {"mma_kind", kind}};

+  // Check if target supports global load DMA (gfx950+).
+  bool supportsGlobalLoadDMA = false;
+  StringRef targetArch = target.getArch();
+  if (auto maybeChipset = amdgpu::Chipset::parse(targetArch);
+      succeeded(maybeChipset)) {
+    constexpr amdgpu::Chipset kGfx950{9, 5, 0};
+    supportsGlobalLoadDMA = (*maybeChipset >= kGfx950);
+  }
+
   // Use global load DMA attribute (subgroup sizes will be derived from
-  // translation_info).
-  Attribute useGlobalDma = IREE::GPU::UseGlobalLoadDMAAttr::get(context);
-  SmallVector<Attribute> promotionArray = {useGlobalDma, useGlobalDma};
+  // translation_info) only on gfx950+.
+  SmallVector<Attribute> promotionArray;
+  if (supportsGlobalLoadDMA) {
+    Attribute useGlobalDma = IREE::GPU::UseGlobalLoadDMAAttr::get(context);
+    promotionArray = {useGlobalDma, useGlobalDma};
+  }
   SmallVector<int64_t> promotionList = {0, 1};
   if (scaled) {
     // TODO(#22119): We don't use global load DMA for scaled matmuls, because
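The core of this change is the chipset gate: the use_global_load_dma promotion attribute is now only emitted when the target architecture parses to gfx950 or newer. Below is a minimal standalone sketch of that check, assuming MLIR's amdgpu::Chipset utility from the newly included header; the helper name, the using-directive, and the error handling are illustrative and not part of the patch.

#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
#include "llvm/ADT/StringRef.h"

using namespace mlir;

// Hypothetical helper mirroring the gating added in ConfigUtils.cpp: returns
// true only for architectures at or above gfx950, where this commit keeps
// global load DMA promotion enabled.
static bool archSupportsGlobalLoadDMA(llvm::StringRef arch) {
  // Chipset::parse accepts strings like "gfx950" and fails on unknown names.
  auto maybeChipset = amdgpu::Chipset::parse(arch);
  if (failed(maybeChipset))
    return false; // Unrecognized targets conservatively skip DMA promotion.
  constexpr amdgpu::Chipset kGfx950{9, 5, 0};
  return *maybeChipset >= kGfx950;
}

// Example: archSupportsGlobalLoadDMA("gfx942") is false,
//          archSupportsGlobalLoadDMA("gfx950") is true,
// which is why the test updates below drop promotion_types on pre-gfx950
// targets and move the DMA pipeline tests to gfx950.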

compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_custom_op.mlir

Lines changed: 1 addition & 1 deletion

@@ -40,7 +40,7 @@ func.func @custom_op(%arg0 : tensor<384x512xf32>, %arg1 : tensor<512x128xf32>,
 // CHECK-SAME: lowering_config = #[[$CONFIG]]
 // CHECK: ^bb
 // CHECK: linalg.matmul
-// CHECK-SAME: lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>, promote_operands = [0, 1], promotion_types = [#iree_gpu.use_global_load_dma, #iree_gpu.use_global_load_dma], reduction = [0, 0, 16], subgroup = [2, 4, 0], workgroup = [64, 128, 0]
+// CHECK-SAME: lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>, promote_operands = [0, 1], reduction = [0, 0, 16], subgroup = [2, 4, 0], workgroup = [64, 128, 0]
 // CHECK: iree_linalg_ext.yield

 // -----

compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matmul.mlir

Lines changed: 4 additions & 4 deletions

@@ -20,7 +20,7 @@ func.func @matmul_32_32_32(%arg0: !TA, %arg1: !TB, %arg2: !TC, %arg3: !DTC) {
 // GENERALIZED: linalg.generic
 // SPECIALIZED: linalg.matmul
 // CHECK: {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>,
-// CHECK-SAME: promote_operands = [0, 1], promotion_types = [#iree_gpu.use_global_load_dma, #iree_gpu.use_global_load_dma], reduction = [0, 0, 8], subgroup = [1, 1, 0],
+// CHECK-SAME: promote_operands = [0, 1], reduction = [0, 0, 8], subgroup = [1, 1, 0],
 // CHECK-SAME: workgroup = [32, 32, 0]}>}
   %0 = linalg.matmul ins(%arg0, %arg1 : !TA, !TB) outs(%arg2 : !TC) -> !TC
   iree_tensor_ext.dispatch.tensor.store %0, %arg3, offsets = [0, 0], sizes = [32, 32], strides = [1, 1] : !TC -> !DTC
@@ -37,7 +37,7 @@ func.func @matmul_32_32_32(%arg0: !TA, %arg1: !TB, %arg2: !TC, %arg3: !DTC) {
 // CHECK-SAME: workgroup_size = [256, 1, 1] subgroup_size = 64, {
 func.func @matmul_4096_4096_4096(%arg0: !TA, %arg1: !TB, %arg2: !TC, %arg3: !DTC) {
 // CHECK: {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>,
-// CHECK-SAME: promote_operands = [0, 1], promotion_types = [#iree_gpu.use_global_load_dma, #iree_gpu.use_global_load_dma], reduction = [0, 0, 4], subgroup = [4, 4, 0], workgroup = [128, 128, 0]}>
+// CHECK-SAME: promote_operands = [0, 1], reduction = [0, 0, 4], subgroup = [4, 4, 0], workgroup = [128, 128, 0]}>
   %0 = linalg.matmul ins(%arg0, %arg1 : !TA, !TB) outs(%arg2 : !TC) -> !TC
   iree_tensor_ext.dispatch.tensor.store %0, %arg3, offsets = [0, 0], sizes = [4096, 4096], strides = [1, 1] : !TC -> !DTC
   return
@@ -53,7 +53,7 @@ func.func @matmul_4096_4096_4096(%arg0: !TA, %arg1: !TB, %arg2: !TC, %arg3: !DTC
 // CHECK-SAME: workgroup_size = [256, 1, 1] subgroup_size = 64, {
 func.func @matmul_4096_32_4096(%arg0: !TA, %arg1: !TB, %arg2: !TC, %arg3: !DTC) {
 // CHECK: #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>,
-// CHECK-SAME: promote_operands = [0, 1], promotion_types = [#iree_gpu.use_global_load_dma, #iree_gpu.use_global_load_dma], reduction = [0, 0, 8], subgroup = [2, 4, 0],
+// CHECK-SAME: promote_operands = [0, 1], reduction = [0, 0, 8], subgroup = [2, 4, 0],
 // CHECK-SAME: workgroup = [64, 128, 0]}>}
   %0 = linalg.matmul ins(%arg0, %arg1 : !TA, !TB) outs(%arg2 : !TC) -> !TC
   iree_tensor_ext.dispatch.tensor.store %0, %arg3, offsets = [0, 0], sizes = [4096, 4096], strides = [1, 1] : !TC -> !DTC
@@ -71,7 +71,7 @@ func.func @matmul_4096_32_4096(%arg0: !TA, %arg1: !TB, %arg2: !TC, %arg3: !DTC)
 // CHECK-SAME: {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_num_stages = 2, no_reduce_shared_memory_bank_conflicts = true, use_igemm_convolution = false>}>
 func.func @matmul_4096_1_4096(%arg0: !TA, %arg1: !TB, %arg2: !TC, %arg3: !DTC) {
 // CHECK: #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>,
-// CHECK-SAME: padding = [32, 32, 4], promote_operands = [0, 1], promotion_types = [#iree_gpu.use_global_load_dma, #iree_gpu.use_global_load_dma], reduction = [0, 0, 1], subgroup = [1, 2, 0], workgroup = [32, 32, 0]}
+// CHECK-SAME: padding = [32, 32, 4], promote_operands = [0, 1], reduction = [0, 0, 1], subgroup = [1, 2, 0], workgroup = [32, 32, 0]}
   %0 = linalg.matmul ins(%arg0, %arg1 : !TA, !TB) outs(%arg2 : !TC) -> !TC
   iree_tensor_ext.dispatch.tensor.store %0, %arg3, offsets = [0, 0], sizes = [4096, 4096], strides = [1, 1] : !TC -> !DTC
   return

compiler/src/iree/compiler/Codegen/LLVMGPU/test/pipeline_coalesced_dma.mlir

Lines changed: 5 additions & 5 deletions

@@ -7,7 +7,7 @@

 #executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm",
 "rocm-hsaco-fb", {iree_codegen.target_info = #iree_gpu.target<
-arch = "gfx942", features = "", wgp = <
+arch = "gfx950", features = "", wgp = <
 compute = fp64|fp32|fp16|int64|int32|int16|int8,
 storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic,
 dot = dp4xi8toi32, mma = [], subgroup_size_choices = [64, 64],
@@ -73,7 +73,7 @@ hal.executable public @coalesced_dma_to_lds {

 #executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm",
 "rocm-hsaco-fb", {iree_codegen.target_info = #iree_gpu.target<
-arch = "gfx942", features = "", wgp = <
+arch = "gfx950", features = "", wgp = <
 compute = fp64|fp32|fp16|int64|int32|int16|int8,
 storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic,
 dot = dp4xi8toi32, mma = [], subgroup_size_choices = [64, 64],
@@ -138,7 +138,7 @@ hal.executable public @coalesced_dma_matmul_operand {

 #executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm",
 "rocm-hsaco-fb", {iree_codegen.target_info = #iree_gpu.target<
-arch = "gfx942", features = "", wgp = <
+arch = "gfx950", features = "", wgp = <
 compute = fp64|fp32|fp16|int64|int32|int16|int8,
 storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic,
 dot = dp4xi8toi32, mma = [], subgroup_size_choices = [64, 64],
@@ -205,7 +205,7 @@ hal.executable public @coalesced_dma_f16 {

 #executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm",
 "rocm-hsaco-fb", {iree_codegen.target_info = #iree_gpu.target<
-arch = "gfx942", features = "", wgp = <
+arch = "gfx950", features = "", wgp = <
 compute = fp64|fp32|fp16|int64|int32|int16|int8,
 storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic,
 dot = dp4xi8toi32, mma = [], subgroup_size_choices = [64, 64],
@@ -289,7 +289,7 @@ hal.executable public @coalesced_dma_multi_transfer {

 #executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm",
 "rocm-hsaco-fb", {iree_codegen.target_info = #iree_gpu.target<
-arch = "gfx942", features = "", wgp = <
+arch = "gfx950", features = "", wgp = <
 compute = fp64|fp32|fp16|int64|int32|int16|int8,
 storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic,
 dot = dp4xi8toi32, mma = [], subgroup_size_choices = [64, 64],
