Skip to content

Commit 44c89cf

Browse files
jerryyinkeshavvinayak01
authored andcommitted
[Codegen][GPU] Use arithmetic intensity to guide gemm size categorization - Step 1 (iree-org#21638)
This is the first PR to implement iree-org#21506 strategy 1 Subgroup favor for seed selection. This PR adds two optional fields to the chip attribute: - peakMemoryBandwidthTBs: Single float point to indicated the bandwidth in unit of tb/s - peakPerfTFLOPs: Dictionary with key being the data type and value being the tflops/s With the added data in MI100, MI200, MI300 and Navi4 arch, next PR can use the tflops and tbps to categorize the GEMMs into three buckets (small, medium and large) and set seeds which are winners from collection of 478 convolutions. --------- Signed-off-by: jerryyin <[email protected]> Signed-off-by: keshavvinayak01 <[email protected]>
1 parent 589962b commit 44c89cf

File tree

4 files changed

+142
-22
lines changed

4 files changed

+142
-22
lines changed

compiler/plugins/target/ROCM/test/target_device_features.mlir

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -48,10 +48,10 @@
4848
// GFX942-SAME: subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024],
4949
// GFX942-SAME: max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536,
5050
// GFX942-SAME: max_workgroup_counts = [2147483647, 2147483647, 2147483647],
51-
// MI300X: chip = <wgp_count = 304, sku = "mi300x">>
52-
// MI300A: chip = <wgp_count = 228, sku = "mi300a">>
53-
// MI308X: chip = <wgp_count = 80, sku = "mi308x">>
54-
// MI325X: chip = <wgp_count = 304, sku = "mi325x">>
51+
// MI300X: chip = <wgp_count = 304, sku = "mi300x", memory_bandwidth_tbps = 5.300000e+00 : f32, perf_tflops = {fp16 = 1.307400e+03 : f32, fp32 = 1.634000e+02 : f32, fp8 = 2.614900e+03 : f32, int8 = 2.614900e+03 : f32}>>
52+
// MI300A: chip = <wgp_count = 228, sku = "mi300a", memory_bandwidth_tbps = 5.300000e+00 : f32, perf_tflops = {fp16 = 980.599975 : f32, fp32 = 1.226000e+02 : f32, fp8 = 1.961200e+03 : f32, int8 = 1.961200e+03 : f32}>>
53+
// MI308X: chip = <wgp_count = 80, sku = "mi308x", memory_bandwidth_tbps = 5.300000e+00 : f32, perf_tflops = {fp16 = 1.884000e+02 : f32, fp32 = 2.900000e+01 : f32, fp8 = 1.768000e+02 : f32, int8 = 1.768000e+02 : f32}>>
54+
// MI325X: chip = <wgp_count = 304, sku = "mi325x", memory_bandwidth_tbps = 5.300000e+00 : f32, perf_tflops = {fp16 = 1.307400e+03 : f32, fp32 = 1.634000e+02 : f32, fp8 = 2.614900e+03 : f32, int8 = 2.614900e+03 : f32}>>
5555

5656
// GFX950: target_info = #iree_gpu.target<arch = "gfx950",
5757
// GFX950-SAME: mma = [<MFMA_F32_16x16x32_F16>, <MFMA_F32_32x32x16_F16>, <MFMA_F32_16x16x32_BF16>, <MFMA_F32_32x32x16_BF16>, <MFMA_F32_16x16x128_F8E5M2>, <MFMA_F32_16x16x128_F8E5M2_F8E4M3FN>, <MFMA_F32_16x16x128_F8E4M3FN>, <MFMA_F32_16x16x128_F8E4M3FN_F8E5M2>, <MFMA_F32_32x32x64_F8E5M2>, <MFMA_F32_32x32x64_F8E5M2_F8E4M3FN>, <MFMA_F32_32x32x64_F8E4M3FN>, <MFMA_F32_32x32x64_F8E4M3FN_F8E5M2>, <MFMA_I32_16x16x64_I8>, <MFMA_I32_32x32x32_I8>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2>, <MFMA_F32_16x16x32_F8E5M2_F8E4M3FN>, <MFMA_F32_16x16x32_F8E4M3FN>, <MFMA_F32_16x16x32_F8E4M3FN_F8E5M2>, <MFMA_F32_32x32x16_F8E5M2>, <MFMA_F32_32x32x16_F8E5M2_F8E4M3FN>, <MFMA_F32_32x32x16_F8E4M3FN>, <MFMA_F32_32x32x16_F8E4M3FN_F8E5M2>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>],
@@ -71,15 +71,15 @@
7171
// GFX1200-SAME: mma = [<WMMAR4_F32_16x16x16_F16>, <WMMAR4_F16_16x16x16_F16>, <WMMAR4_F32_16x16x16_BF16>, <WMMAR4_BF16_16x16x16_BF16>, <WMMAR4_F32_16x16x16_F8E5M2>, <WMMAR4_F32_16x16x16_F8E5M2_F8E4M3FN>, <WMMAR4_F32_16x16x16_F8E4M3FN>, <WMMAR4_F32_16x16x16_F8E4M3FN_F8E5M2>, <WMMAR4_I32_16x16x16_I8>]
7272
// GFX1200-SAME: subgroup_size_choices = [32, 64]
7373
//
74-
// RX9060XT: chip = <wgp_count = 16, sku = "rx9060xt">>
74+
// RX9060XT: chip = <wgp_count = 16, sku = "rx9060xt", memory_bandwidth_tbps = 3.200000e-01 : f32, perf_tflops = {fp16 = 1.030000e+02 : f32, fp32 = 2.560000e+01 : f32, fp8 = 2.050000e+02 : f32, int8 = 2.050000e+02 : f32}>>
7575

7676
// GFX1201: target_info = #iree_gpu.target<arch = "gfx1201",
7777
// GFX1201-SAME: mma = [<WMMAR4_F32_16x16x16_F16>, <WMMAR4_F16_16x16x16_F16>, <WMMAR4_F32_16x16x16_BF16>, <WMMAR4_BF16_16x16x16_BF16>, <WMMAR4_F32_16x16x16_F8E5M2>, <WMMAR4_F32_16x16x16_F8E5M2_F8E4M3FN>, <WMMAR4_F32_16x16x16_F8E4M3FN>, <WMMAR4_F32_16x16x16_F8E4M3FN_F8E5M2>, <WMMAR4_I32_16x16x16_I8>]
7878
// GFX1201-SAME: subgroup_size_choices = [32, 64]
7979
//
80-
// RX9070XT: chip = <wgp_count = 32, sku = "rx9070xt">>
81-
// RX9070: chip = <wgp_count = 28, sku = "rx9070">>
82-
// R9700: chip = <wgp_count = 32, sku = "r9700">>
80+
// RX9070XT: chip = <wgp_count = 32, sku = "rx9070xt", memory_bandwidth_tbps = 6.400000e-01 : f32, perf_tflops = {fp16 = 1.950000e+02 : f32, fp32 = 4.870000e+01 : f32, fp8 = 3.890000e+02 : f32, int8 = 3.890000e+02 : f32}>>
81+
// RX9070: chip = <wgp_count = 28, sku = "rx9070", memory_bandwidth_tbps = 6.400000e-01 : f32, perf_tflops = {fp16 = 1.450000e+02 : f32, fp32 = 3.610000e+01 : f32, fp8 = 2.890000e+02 : f32, int8 = 2.890000e+02 : f32}>>
82+
// R9700: chip = <wgp_count = 32, sku = "r9700", memory_bandwidth_tbps = 6.400000e-01 : f32, perf_tflops = {fp16 = 1.910000e+02 : f32, fp32 = 4.780000e+01 : f32, fp8 = 3.830000e+02 : f32, int8 = 3.830000e+02 : f32}>>
8383

8484
stream.executable public @reduce_dispatch {
8585
stream.executable.export @reduce_dispatch workgroups(%arg0: index) -> (index, index, index) {

compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -567,6 +567,10 @@ def IREEGPU_TargetChipAttr : AttrDef<IREEGPU_Dialect, "TargetChip"> {
567567

568568
// An optional SKU identifier to distinguish different models.
569569
OptionalParameter<"StringAttr">:$sku,
570+
// An optional memory bandwidth in TB/s.
571+
OptionalParameter<"FloatAttr">:$memory_bandwidth_tbps,
572+
// An optional performance dictionary in TFLOPS.
573+
OptionalParameter<"DictionaryAttr">:$perf_tflops,
570574
// An optional extra dict
571575
// This field allows to inject more features/limits not supported in the
572576
// above list for better flexibility.

compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,11 +31,15 @@ def IREEGPU_CIBW_32 : I32BitEnumAttrCaseBit<"Int32", 4, "int32">;
3131
def IREEGPU_CIBW_16 : I32BitEnumAttrCaseBit<"Int16", 5, "int16">;
3232
// Signed/unsigned 8-bit integer format in computation
3333
def IREEGPU_CIBW_8 : I32BitEnumAttrCaseBit<"Int8", 6, "int8">;
34+
// Generic 8-bit floating point format in computation
35+
def IREEGPU_CFBW_8 : I32BitEnumAttrCaseBit<"FP8", 7, "fp8">;
36+
3437

3538
def IREEGPU_ComputeBitwidths : I32BitEnumAttr<
3639
"ComputeBitwidths", "Supported bitwidths for compute",
3740
[IREEGPU_CFBW_64, IREEGPU_CFBW_32, IREEGPU_CFBW_16,
38-
IREEGPU_CIBW_64, IREEGPU_CIBW_32, IREEGPU_CIBW_16, IREEGPU_CIBW_8]> {
41+
IREEGPU_CIBW_64, IREEGPU_CIBW_32, IREEGPU_CIBW_16,
42+
IREEGPU_CIBW_8, IREEGPU_CFBW_8]> {
3943
let cppNamespace = "::mlir::iree_compiler::IREE::GPU";
4044
let genSpecializedAttr = 0;
4145
}

compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp

Lines changed: 125 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#include <optional>
1010
#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
1111
#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.h"
12+
#include "llvm/ADT/DenseMap.h"
1213
#include "llvm/ADT/STLExtras.h"
1314
#include "llvm/ADT/StringSwitch.h"
1415
#include "mlir/IR/Attributes.h"
@@ -61,6 +62,18 @@ struct WgpDetails {
6162
struct ChipDetails {
6263
uint32_t wgpCount;
6364
std::optional<StringRef> sku;
65+
// Aggregate chip-level bandwidth in TB/s.
66+
std::optional<float> peakMemoryBandwidthTBs;
67+
// Optional per-data-type compute performance (TFLOPs/s).
68+
llvm::SmallDenseMap<ComputeBitwidths, float> peakPerfTFLOPs;
69+
70+
ChipDetails(
71+
uint32_t wgp, std::optional<llvm::StringRef> s = std::nullopt,
72+
std::optional<float> bw = std::nullopt,
73+
std::initializer_list<llvm::detail::DenseMapPair<ComputeBitwidths, float>>
74+
perf = {})
75+
: wgpCount(wgp), sku(s), peakMemoryBandwidthTBs(bw),
76+
peakPerfTFLOPs(perf) {}
6477
};
6578

6679
// Full target details
@@ -139,8 +152,26 @@ TargetAttr createTargetAttr(const TargetDetails &details, StringRef arch,
139152
auto skuAttr = details.chip->sku
140153
? StringAttr::get(context, *details.chip->sku)
141154
: StringAttr{};
155+
156+
FloatAttr peakMemoryBandwidthAttr =
157+
details.chip->peakMemoryBandwidthTBs
158+
? FloatAttr::get(Float32Type::get(context),
159+
*details.chip->peakMemoryBandwidthTBs)
160+
: FloatAttr{};
161+
162+
DictionaryAttr peakPerfTFLOPsAttr = {};
163+
if (!details.chip->peakPerfTFLOPs.empty()) {
164+
SmallVector<NamedAttribute> attributes = llvm::map_to_vector(
165+
details.chip->peakPerfTFLOPs, [&](const auto &pair) {
166+
return NamedAttribute(
167+
stringifyComputeBitwidths(pair.first),
168+
FloatAttr::get(Float32Type::get(context), pair.second));
169+
});
170+
peakPerfTFLOPsAttr = DictionaryAttr::get(context, attributes);
171+
}
142172
targetChip = TargetChipAttr::get(context, details.chip->wgpCount, skuAttr,
143-
DictionaryAttr{});
173+
peakMemoryBandwidthAttr,
174+
peakPerfTFLOPsAttr, DictionaryAttr{});
144175
}
145176

146177
return TargetAttr::get(context, arch, features, targetWgp, targetChip);
@@ -424,20 +455,73 @@ std::optional<TargetDetails> getAMDGPUTargetDetails(StringRef target) {
424455

425456
// "AMD Instinct MI300 Series Product Offerings" in Page 23 of
426457
// https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/white-papers/amd-cdna-3-white-paper.pdf
427-
static const ChipDetails mi300xChip = {304, "mi300x"};
428-
static const ChipDetails mi300aChip = {228, "mi300a"};
429-
static const ChipDetails mi308xChip = {80, "mi308x"};
430-
static const ChipDetails mi325xChip = {304, "mi325x"};
458+
static const ChipDetails mi300xChip = {304,
459+
"mi300x",
460+
5.3f,
461+
{{ComputeBitwidths::FP32, 163.4f},
462+
{ComputeBitwidths::FP16, 1307.4f},
463+
{ComputeBitwidths::Int8, 2614.9f},
464+
{ComputeBitwidths::FP8, 2614.9f}}};
465+
466+
static const ChipDetails mi300aChip = {228,
467+
"mi300a",
468+
5.3f,
469+
{{ComputeBitwidths::FP32, 122.6f},
470+
{ComputeBitwidths::FP16, 980.6f},
471+
{ComputeBitwidths::Int8, 1961.2f},
472+
{ComputeBitwidths::FP8, 1961.2f}}};
473+
474+
static const ChipDetails mi308xChip = {
475+
80,
476+
"mi308x",
477+
5.3f,
478+
// Peak fp32 perf estimated from:
479+
// 80(CUs)*4(SIMDs)*1.42(Freq)*(16*16*4)(GEMM shape)*2(mul+add)/32(latency
480+
// instruction)
481+
{{ComputeBitwidths::FP32, 29.0f},
482+
{ComputeBitwidths::FP16, 188.4f},
483+
{ComputeBitwidths::FP8, 176.8f},
484+
// Estimated int8 performance based on FP8
485+
{ComputeBitwidths::Int8, 176.8f}}};
486+
487+
static const ChipDetails mi325xChip = {304,
488+
"mi325x",
489+
5.3f,
490+
{{ComputeBitwidths::FP32, 163.4f},
491+
{ComputeBitwidths::FP16, 1307.4f},
492+
{ComputeBitwidths::Int8, 2614.9f},
493+
{ComputeBitwidths::FP8, 2614.9f}}};
431494

432495
// "AMD Instinct MI200 Series Accelerator Product Offerings" in Page 14 of
433496
// https://www.amd.com/content/dam/amd/en/documents/instinct-business-docs/white-papers/amd-cdna2-white-paper.pdf
434-
static const ChipDetails mi250xChip = {220, "mi250x"};
435-
static const ChipDetails mi250Chip = {208, "mi250"};
436-
static const ChipDetails mi210Chip = {104, "mi210"};
497+
static const ChipDetails mi250xChip = {220,
498+
"mi250x",
499+
3.2f,
500+
{{ComputeBitwidths::FP32, 95.7f},
501+
{ComputeBitwidths::FP16, 383.0f},
502+
{ComputeBitwidths::Int8, 383.0f}}};
503+
504+
static const ChipDetails mi250Chip = {208,
505+
"mi250",
506+
3.2f,
507+
{{ComputeBitwidths::FP32, 90.5f},
508+
{ComputeBitwidths::FP16, 362.1f},
509+
{ComputeBitwidths::Int8, 362.1f}}};
510+
static const ChipDetails mi210Chip = {104,
511+
"mi210",
512+
1.6f,
513+
{{ComputeBitwidths::FP32, 45.3f},
514+
{ComputeBitwidths::FP16, 181.0f},
515+
{ComputeBitwidths::Int8, 181.0f}}};
437516

438517
// "AMD CDNA Architecture Compute Units" in Page 5 of
439518
// https://www.amd.com/content/dam/amd/en/documents/instinct-business-docs/white-papers/amd-cdna-white-paper.pdf
440-
static const ChipDetails mi100Chip = {120, "mi100"};
519+
static const ChipDetails mi100Chip = {120,
520+
"mi100",
521+
1.23f,
522+
{{ComputeBitwidths::FP32, 46.1f},
523+
{ComputeBitwidths::FP16, 184.6f},
524+
{ComputeBitwidths::Int8, 184.6f}}};
441525

442526
// --- RDNA --- //
443527

@@ -450,10 +534,38 @@ std::optional<TargetDetails> getAMDGPUTargetDetails(StringRef target) {
450534

451535
// AMD RDNA4 architecture:
452536
// https://www.amd.com/en/newsroom/press-releases/2025-2-28-amd-unveils-next-generation-amd-rdna-4-architectu.html.
453-
static const ChipDetails r9700Chip = {64 / 2, "r9700"};
454-
static const ChipDetails rx9070xtChip = {64 / 2, "rx9070xt"};
455-
static const ChipDetails rx9070Chip = {56 / 2, "rx9070"};
456-
static const ChipDetails rx9060xtChip = {32 / 2, "rx9060xt"};
537+
// https://www.amd.com/en/products/graphics/workstations/radeon-ai-pro/ai-9000-series/amd-radeon-ai-pro-r9700.html
538+
static const ChipDetails r9700Chip = {64 / 2,
539+
"r9700",
540+
0.64f,
541+
{{ComputeBitwidths::FP32, 47.8f},
542+
{ComputeBitwidths::FP16, 191.0f},
543+
{ComputeBitwidths::Int8, 383.0f},
544+
{ComputeBitwidths::FP8, 383.0f}}};
545+
// https://www.amd.com/en/products/graphics/desktops/radeon/9000-series/amd-radeon-rx-9070xt.html
546+
static const ChipDetails rx9070xtChip = {64 / 2,
547+
"rx9070xt",
548+
0.64f,
549+
{{ComputeBitwidths::FP32, 48.7f},
550+
{ComputeBitwidths::FP16, 195.0f},
551+
{ComputeBitwidths::Int8, 389.0f},
552+
{ComputeBitwidths::FP8, 389.0f}}};
553+
// https://www.amd.com/en/products/graphics/desktops/radeon/9000-series/amd-radeon-rx-9070.html
554+
static const ChipDetails rx9070Chip = {56 / 2,
555+
"rx9070",
556+
0.64f,
557+
{{ComputeBitwidths::FP32, 36.1f},
558+
{ComputeBitwidths::FP16, 145.0f},
559+
{ComputeBitwidths::Int8, 289.0f},
560+
{ComputeBitwidths::FP8, 289.0f}}};
561+
// https://www.amd.com/en/products/graphics/desktops/radeon/9000-series/amd-radeon-rx-9060xt.html
562+
static const ChipDetails rx9060xtChip = {32 / 2,
563+
"rx9060xt",
564+
0.32f,
565+
{{ComputeBitwidths::FP32, 25.6f},
566+
{ComputeBitwidths::FP16, 103.0f},
567+
{ComputeBitwidths::Int8, 205.0f},
568+
{ComputeBitwidths::FP8, 205.0f}}};
457569

458570
// AMD RDNA3.
459571
static const ChipDetails rx7900xtxChip = {96 / 2, "rx7900xtx"};

0 commit comments

Comments
 (0)