[Codegen][GPU] Use arithmetic intensity to guide gemm size categorization - Step 1 (iree-org#21638)

jerryyin · keshavvinayak01 · commit 44c89cf049e9 · 2025-09-04T18:04:07.000Z
This is the first PR to implement iree-org#21506 strategy 1, "Subgroup favor", for seed selection. This PR adds two optional fields to the chip attribute: - peakMemoryBandwidthTBs: a single floating-point value indicating the peak memory bandwidth in TB/s - peakPerfTFLOPs: a dictionary whose keys are data types and whose values are the peak performance in TFLOPS. With this data added for the MI100, MI200, MI300 and Navi4 architectures, the next PR can use the TFLOPS and TB/s figures to categorize GEMMs into three buckets (small, medium and large) and set seeds that are the winners from a collection of 478 convolutions. --------- Signed-off-by: jerryyin <zhuoryin@amd.com> Signed-off-by: keshavvinayak01 <keshavvinayakjha@gmail.com>
diff --git a/compiler/plugins/target/ROCM/test/target_device_features.mlir b/compiler/plugins/target/ROCM/test/target_device_features.mlir
@@ -48,10 +48,10 @@
 // GFX942-SAME:         subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024],
 // GFX942-SAME:         max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536,
 // GFX942-SAME:         max_workgroup_counts = [2147483647, 2147483647, 2147483647],
-// MI300X: chip = <wgp_count = 304, sku = "mi300x">>
-// MI300A: chip = <wgp_count = 228, sku = "mi300a">>
-// MI308X: chip = <wgp_count = 80, sku = "mi308x">>
-// MI325X: chip = <wgp_count = 304, sku = "mi325x">>
+// MI300X: chip = <wgp_count = 304, sku = "mi300x", memory_bandwidth_tbps = 5.300000e+00 : f32, perf_tflops = {fp16 = 1.307400e+03 : f32, fp32 = 1.634000e+02 : f32, fp8 = 2.614900e+03 : f32, int8 = 2.614900e+03 : f32}>>
+// MI300A: chip = <wgp_count = 228, sku = "mi300a", memory_bandwidth_tbps = 5.300000e+00 : f32, perf_tflops = {fp16 = 980.599975 : f32, fp32 = 1.226000e+02 : f32, fp8 = 1.961200e+03 : f32, int8 = 1.961200e+03 : f32}>>
+// MI308X: chip = <wgp_count = 80, sku = "mi308x", memory_bandwidth_tbps = 5.300000e+00 : f32, perf_tflops = {fp16 = 1.884000e+02 : f32, fp32 = 2.900000e+01 : f32, fp8 = 1.768000e+02 : f32, int8 = 1.768000e+02 : f32}>>
+// MI325X: chip = <wgp_count = 304, sku = "mi325x", memory_bandwidth_tbps = 5.300000e+00 : f32, perf_tflops = {fp16 = 1.307400e+03 : f32, fp32 = 1.634000e+02 : f32, fp8 = 2.614900e+03 : f32, int8 = 2.614900e+03 : f32}>>
 
 // GFX950: target_info = #iree_gpu.target<arch = "gfx950",
 // GFX950-SAME:         mma = [<MFMA_F32_16x16x32_F16>, <MFMA_F32_32x32x16_F16>, <MFMA_F32_16x16x32_BF16>, <MFMA_F32_32x32x16_BF16>, <MFMA_F32_16x16x128_F8E5M2>, <MFMA_F32_16x16x128_F8E5M2_F8E4M3FN>, <MFMA_F32_16x16x128_F8E4M3FN>, <MFMA_F32_16x16x128_F8E4M3FN_F8E5M2>, <MFMA_F32_32x32x64_F8E5M2>, <MFMA_F32_32x32x64_F8E5M2_F8E4M3FN>, <MFMA_F32_32x32x64_F8E4M3FN>, <MFMA_F32_32x32x64_F8E4M3FN_F8E5M2>, <MFMA_I32_16x16x64_I8>, <MFMA_I32_32x32x32_I8>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2>, <MFMA_F32_16x16x32_F8E5M2_F8E4M3FN>, <MFMA_F32_16x16x32_F8E4M3FN>, <MFMA_F32_16x16x32_F8E4M3FN_F8E5M2>, <MFMA_F32_32x32x16_F8E5M2>, <MFMA_F32_32x32x16_F8E5M2_F8E4M3FN>, <MFMA_F32_32x32x16_F8E4M3FN>, <MFMA_F32_32x32x16_F8E4M3FN_F8E5M2>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>],
@@ -71,15 +71,15 @@
 // GFX1200-SAME:        mma = [<WMMAR4_F32_16x16x16_F16>, <WMMAR4_F16_16x16x16_F16>, <WMMAR4_F32_16x16x16_BF16>, <WMMAR4_BF16_16x16x16_BF16>, <WMMAR4_F32_16x16x16_F8E5M2>, <WMMAR4_F32_16x16x16_F8E5M2_F8E4M3FN>, <WMMAR4_F32_16x16x16_F8E4M3FN>, <WMMAR4_F32_16x16x16_F8E4M3FN_F8E5M2>,  <WMMAR4_I32_16x16x16_I8>]
 // GFX1200-SAME:        subgroup_size_choices = [32, 64]
 //
-// RX9060XT: chip = <wgp_count = 16, sku = "rx9060xt">>
+// RX9060XT: chip = <wgp_count = 16, sku = "rx9060xt", memory_bandwidth_tbps = 3.200000e-01 : f32, perf_tflops = {fp16 = 1.030000e+02 : f32, fp32 = 2.560000e+01 : f32, fp8 = 2.050000e+02 : f32, int8 = 2.050000e+02 : f32}>>
 
 // GFX1201: target_info = #iree_gpu.target<arch = "gfx1201",
 // GFX1201-SAME:        mma = [<WMMAR4_F32_16x16x16_F16>, <WMMAR4_F16_16x16x16_F16>, <WMMAR4_F32_16x16x16_BF16>, <WMMAR4_BF16_16x16x16_BF16>, <WMMAR4_F32_16x16x16_F8E5M2>, <WMMAR4_F32_16x16x16_F8E5M2_F8E4M3FN>, <WMMAR4_F32_16x16x16_F8E4M3FN>, <WMMAR4_F32_16x16x16_F8E4M3FN_F8E5M2>,  <WMMAR4_I32_16x16x16_I8>]
 // GFX1201-SAME:        subgroup_size_choices = [32, 64]
 //
-// RX9070XT: chip = <wgp_count = 32, sku = "rx9070xt">>
-// RX9070:   chip = <wgp_count = 28, sku = "rx9070">>
-// R9700:    chip = <wgp_count = 32, sku = "r9700">>
+// RX9070XT: chip = <wgp_count = 32, sku = "rx9070xt", memory_bandwidth_tbps = 6.400000e-01 : f32, perf_tflops = {fp16 = 1.950000e+02 : f32, fp32 = 4.870000e+01 : f32, fp8 = 3.890000e+02 : f32, int8 = 3.890000e+02 : f32}>>
+// RX9070:   chip = <wgp_count = 28, sku = "rx9070", memory_bandwidth_tbps = 6.400000e-01 : f32, perf_tflops = {fp16 = 1.450000e+02 : f32, fp32 = 3.610000e+01 : f32, fp8 = 2.890000e+02 : f32, int8 = 2.890000e+02 : f32}>>
+// R9700:    chip = <wgp_count = 32, sku = "r9700", memory_bandwidth_tbps = 6.400000e-01 : f32, perf_tflops = {fp16 = 1.910000e+02 : f32, fp32 = 4.780000e+01 : f32, fp8 = 3.830000e+02 : f32, int8 = 3.830000e+02 : f32}>>
 
 stream.executable public @reduce_dispatch {
   stream.executable.export @reduce_dispatch workgroups(%arg0: index) -> (index, index, index) {
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td
@@ -567,6 +567,10 @@ def IREEGPU_TargetChipAttr : AttrDef<IREEGPU_Dialect, "TargetChip"> {
 
     // An optional SKU identifier to distinguish different models.
     OptionalParameter<"StringAttr">:$sku,
+    // An optional memory bandwidth in TB/s.
+    OptionalParameter<"FloatAttr">:$memory_bandwidth_tbps,
+    // An optional dictionary mapping data type to peak performance in TFLOPS.
+    OptionalParameter<"DictionaryAttr">:$perf_tflops,
     // An optional extra dict
     // This field allows to inject more features/limits not supported in the
     // above list for better flexibility.
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td
@@ -31,11 +31,15 @@ def IREEGPU_CIBW_32 : I32BitEnumAttrCaseBit<"Int32", 4, "int32">;
 def IREEGPU_CIBW_16 : I32BitEnumAttrCaseBit<"Int16", 5, "int16">;
 // Signed/unsigned 8-bit integer format in computation
 def IREEGPU_CIBW_8  : I32BitEnumAttrCaseBit<"Int8",  6, "int8">;
+// Generic 8-bit floating point format in computation
+def IREEGPU_CFBW_8  : I32BitEnumAttrCaseBit<"FP8",  7, "fp8">;
+
 
 def IREEGPU_ComputeBitwidths : I32BitEnumAttr<
   "ComputeBitwidths", "Supported bitwidths for compute",
   [IREEGPU_CFBW_64, IREEGPU_CFBW_32, IREEGPU_CFBW_16,
-   IREEGPU_CIBW_64, IREEGPU_CIBW_32, IREEGPU_CIBW_16, IREEGPU_CIBW_8]> {
+   IREEGPU_CIBW_64, IREEGPU_CIBW_32, IREEGPU_CIBW_16,
+   IREEGPU_CIBW_8,  IREEGPU_CFBW_8]> {
   let cppNamespace = "::mlir::iree_compiler::IREE::GPU";
   let genSpecializedAttr = 0;
 }
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp
@@ -9,6 +9,7 @@
 #include <optional>
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "mlir/IR/Attributes.h"
@@ -61,6 +62,18 @@ struct WgpDetails {
 struct ChipDetails {
   uint32_t wgpCount;
   std::optional<StringRef> sku;
+  // Aggregate chip-level bandwidth in TB/s.
+  std::optional<float> peakMemoryBandwidthTBs;
+  // Optional per-data-type compute performance (TFLOPs/s).
+  llvm::SmallDenseMap<ComputeBitwidths, float> peakPerfTFLOPs;
+
+  ChipDetails(
+      uint32_t wgp, std::optional<llvm::StringRef> s = std::nullopt,
+      std::optional<float> bw = std::nullopt,
+      std::initializer_list<llvm::detail::DenseMapPair<ComputeBitwidths, float>>
+          perf = {})
+      : wgpCount(wgp), sku(s), peakMemoryBandwidthTBs(bw),
+        peakPerfTFLOPs(perf) {}
 };
 
 // Full target details
@@ -139,8 +152,26 @@ TargetAttr createTargetAttr(const TargetDetails &details, StringRef arch,
     auto skuAttr = details.chip->sku
                        ? StringAttr::get(context, *details.chip->sku)
                        : StringAttr{};
+
+    FloatAttr peakMemoryBandwidthAttr =
+        details.chip->peakMemoryBandwidthTBs
+            ? FloatAttr::get(Float32Type::get(context),
+                             *details.chip->peakMemoryBandwidthTBs)
+            : FloatAttr{};
+
+    DictionaryAttr peakPerfTFLOPsAttr = {};
+    if (!details.chip->peakPerfTFLOPs.empty()) {
+      SmallVector<NamedAttribute> attributes = llvm::map_to_vector(
+          details.chip->peakPerfTFLOPs, [&](const auto &pair) {
+            return NamedAttribute(
+                stringifyComputeBitwidths(pair.first),
+                FloatAttr::get(Float32Type::get(context), pair.second));
+          });
+      peakPerfTFLOPsAttr = DictionaryAttr::get(context, attributes);
+    }
     targetChip = TargetChipAttr::get(context, details.chip->wgpCount, skuAttr,
-                                     DictionaryAttr{});
+                                     peakMemoryBandwidthAttr,
+                                     peakPerfTFLOPsAttr, DictionaryAttr{});
   }
 
   return TargetAttr::get(context, arch, features, targetWgp, targetChip);
@@ -424,20 +455,73 @@ std::optional<TargetDetails> getAMDGPUTargetDetails(StringRef target) {
 
   // "AMD Instinct MI300 Series Product Offerings" in Page 23 of
   // https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/white-papers/amd-cdna-3-white-paper.pdf
-  static const ChipDetails mi300xChip = {304, "mi300x"};
-  static const ChipDetails mi300aChip = {228, "mi300a"};
-  static const ChipDetails mi308xChip = {80, "mi308x"};
-  static const ChipDetails mi325xChip = {304, "mi325x"};
+  static const ChipDetails mi300xChip = {304,
+                                         "mi300x",
+                                         5.3f,
+                                         {{ComputeBitwidths::FP32, 163.4f},
+                                          {ComputeBitwidths::FP16, 1307.4f},
+                                          {ComputeBitwidths::Int8, 2614.9f},
+                                          {ComputeBitwidths::FP8, 2614.9f}}};
+
+  static const ChipDetails mi300aChip = {228,
+                                         "mi300a",
+                                         5.3f,
+                                         {{ComputeBitwidths::FP32, 122.6f},
+                                          {ComputeBitwidths::FP16, 980.6f},
+                                          {ComputeBitwidths::Int8, 1961.2f},
+                                          {ComputeBitwidths::FP8, 1961.2f}}};
+
+  static const ChipDetails mi308xChip = {
+      80,
+      "mi308x",
+      5.3f,
+      // Peak fp32 perf estimated from:
+      // 80(CUs)*4(SIMDs)*1.42(Freq)*(16*16*4)(GEMM shape)*2(mul+add)
+      // /32(instruction latency)
+      {{ComputeBitwidths::FP32, 29.0f},
+       {ComputeBitwidths::FP16, 188.4f},
+       {ComputeBitwidths::FP8, 176.8f},
+       // Estimated int8 performance based on FP8
+       {ComputeBitwidths::Int8, 176.8f}}};
+
+  static const ChipDetails mi325xChip = {304,
+                                         "mi325x",
+                                         5.3f,
+                                         {{ComputeBitwidths::FP32, 163.4f},
+                                          {ComputeBitwidths::FP16, 1307.4f},
+                                          {ComputeBitwidths::Int8, 2614.9f},
+                                          {ComputeBitwidths::FP8, 2614.9f}}};
 
   // "AMD Instinct MI200 Series Accelerator Product Offerings" in Page 14 of
   // https://www.amd.com/content/dam/amd/en/documents/instinct-business-docs/white-papers/amd-cdna2-white-paper.pdf
-  static const ChipDetails mi250xChip = {220, "mi250x"};
-  static const ChipDetails mi250Chip = {208, "mi250"};
-  static const ChipDetails mi210Chip = {104, "mi210"};
+  static const ChipDetails mi250xChip = {220,
+                                         "mi250x",
+                                         3.2f,
+                                         {{ComputeBitwidths::FP32, 95.7f},
+                                          {ComputeBitwidths::FP16, 383.0f},
+                                          {ComputeBitwidths::Int8, 383.0f}}};
+
+  static const ChipDetails mi250Chip = {208,
+                                        "mi250",
+                                        3.2f,
+                                        {{ComputeBitwidths::FP32, 90.5f},
+                                         {ComputeBitwidths::FP16, 362.1f},
+                                         {ComputeBitwidths::Int8, 362.1f}}};
+  static const ChipDetails mi210Chip = {104,
+                                        "mi210",
+                                        1.6f,
+                                        {{ComputeBitwidths::FP32, 45.3f},
+                                         {ComputeBitwidths::FP16, 181.0f},
+                                         {ComputeBitwidths::Int8, 181.0f}}};
 
   // "AMD CDNA Architecture Compute Units" in Page 5 of
   // https://www.amd.com/content/dam/amd/en/documents/instinct-business-docs/white-papers/amd-cdna-white-paper.pdf
-  static const ChipDetails mi100Chip = {120, "mi100"};
+  static const ChipDetails mi100Chip = {120,
+                                        "mi100",
+                                        1.23f,
+                                        {{ComputeBitwidths::FP32, 46.1f},
+                                         {ComputeBitwidths::FP16, 184.6f},
+                                         {ComputeBitwidths::Int8, 184.6f}}};
 
   // --- RDNA --- //
 
@@ -450,10 +534,38 @@ std::optional<TargetDetails> getAMDGPUTargetDetails(StringRef target) {
 
   // AMD RDNA4 architecture:
   // https://www.amd.com/en/newsroom/press-releases/2025-2-28-amd-unveils-next-generation-amd-rdna-4-architectu.html.
-  static const ChipDetails r9700Chip = {64 / 2, "r9700"};
-  static const ChipDetails rx9070xtChip = {64 / 2, "rx9070xt"};
-  static const ChipDetails rx9070Chip = {56 / 2, "rx9070"};
-  static const ChipDetails rx9060xtChip = {32 / 2, "rx9060xt"};
+  // https://www.amd.com/en/products/graphics/workstations/radeon-ai-pro/ai-9000-series/amd-radeon-ai-pro-r9700.html
+  static const ChipDetails r9700Chip = {64 / 2,
+                                        "r9700",
+                                        0.64f,
+                                        {{ComputeBitwidths::FP32, 47.8f},
+                                         {ComputeBitwidths::FP16, 191.0f},
+                                         {ComputeBitwidths::Int8, 383.0f},
+                                         {ComputeBitwidths::FP8, 383.0f}}};
+  // https://www.amd.com/en/products/graphics/desktops/radeon/9000-series/amd-radeon-rx-9070xt.html
+  static const ChipDetails rx9070xtChip = {64 / 2,
+                                           "rx9070xt",
+                                           0.64f,
+                                           {{ComputeBitwidths::FP32, 48.7f},
+                                            {ComputeBitwidths::FP16, 195.0f},
+                                            {ComputeBitwidths::Int8, 389.0f},
+                                            {ComputeBitwidths::FP8, 389.0f}}};
+  // https://www.amd.com/en/products/graphics/desktops/radeon/9000-series/amd-radeon-rx-9070.html
+  static const ChipDetails rx9070Chip = {56 / 2,
+                                         "rx9070",
+                                         0.64f,
+                                         {{ComputeBitwidths::FP32, 36.1f},
+                                          {ComputeBitwidths::FP16, 145.0f},
+                                          {ComputeBitwidths::Int8, 289.0f},
+                                          {ComputeBitwidths::FP8, 289.0f}}};
+  // https://www.amd.com/en/products/graphics/desktops/radeon/9000-series/amd-radeon-rx-9060xt.html
+  static const ChipDetails rx9060xtChip = {32 / 2,
+                                           "rx9060xt",
+                                           0.32f,
+                                           {{ComputeBitwidths::FP32, 25.6f},
+                                            {ComputeBitwidths::FP16, 103.0f},
+                                            {ComputeBitwidths::Int8, 205.0f},
+                                            {ComputeBitwidths::FP8, 205.0f}}};
 
   // AMD RDNA3.
   static const ChipDetails rx7900xtxChip = {96 / 2, "rx7900xtx"};