Add ChipDetails definitions for MI350X and MI355X targets. (iree-org#21690)

amd-eochoalo · keshavvinayak01 · commit a06a06859efe · 2025-09-04T18:04:07.000Z
Signed-off-by: Erick Ochoa <erick.ochoalopez@amd.com>
Signed-off-by: keshavvinayak01 <keshavvinayakjha@gmail.com>
diff --git a/compiler/plugins/target/ROCM/test/target_device_features.mlir b/compiler/plugins/target/ROCM/test/target_device_features.mlir
@@ -9,7 +9,10 @@
 //
 // RUN: iree-opt --pass-pipeline='builtin.module(iree-hal-assign-target-devices{targetDevices=hip},iree-hal-transformation-pipeline{serialize-executables=false})' \
 // RUN:   --iree-hip-target=gfx950 %s | FileCheck %s --check-prefixes=GFX950
-//
+// RUN: iree-opt --pass-pipeline='builtin.module(iree-hal-assign-target-devices{targetDevices=hip},iree-hal-transformation-pipeline{serialize-executables=false})' \
+// RUN:   --iree-hip-target=mi350x %s | FileCheck %s --check-prefixes=GFX950,MI350X
+// RUN: iree-opt --pass-pipeline='builtin.module(iree-hal-assign-target-devices{targetDevices=hip},iree-hal-transformation-pipeline{serialize-executables=false})' \
+// RUN:   --iree-hip-target=mi355x %s | FileCheck %s --check-prefixes=GFX950,MI355X
 //
 // RUN: iree-opt --pass-pipeline='builtin.module(iree-hal-assign-target-devices{targetDevices=hip},iree-hal-transformation-pipeline{serialize-executables=false})' \
 // RUN:   --iree-hip-target=rx7900xtx %s | FileCheck %s --check-prefix=GFX1100
@@ -58,6 +61,8 @@
 // GFX950-SAME:         scaled_mma = [<intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E8M0FNU, rhs_elem_type = f8E8M0FNU, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E5M2, rhs_elem_type = f8E5M2, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E5M2FNUZ, rhs_elem_type = f8E5M2FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E4M3FN, rhs_elem_type = f8E4M3FN, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E4M3FNUZ, rhs_elem_type = f8E4M3FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f4E2M1FN, rhs_elem_type = f4E2M1FN, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E8M0FNU, rhs_elem_type = f8E8M0FNU, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E5M2, rhs_elem_type = f8E5M2, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E5M2FNUZ, rhs_elem_type = f8E5M2FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E4M3FN, rhs_elem_type = f8E4M3FN, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E4M3FNUZ, rhs_elem_type = f8E4M3FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f4E2M1FN, rhs_elem_type = f4E2M1FN, acc_elem_type = f32>],
 // GFX950-SAME:         subgroup_size_choices = [64],
 // GFX950-SAME:         max_workgroup_memory_bytes = 163840,
+// MI350X: chip = <wgp_count = 256, sku = "mi350x", memory_bandwidth_tbps = 8.000000e+00 : f32, perf_tflops = {fp16 = 2.300000e+03 : f32, fp32 = 1.442000e+02 : f32, fp4 = 9.200000e+03 : f32, fp6 = 9.200000e+03 : f32, fp8 = 4.600000e+03 : f32, int8 = 4.600000e+03 : f32}>>
+// MI355X: chip = <wgp_count = 256, sku = "mi355x", memory_bandwidth_tbps = 8.000000e+00 : f32, perf_tflops = {fp16 = 2.500000e+03 : f32, fp32 = 1.573000e+02 : f32, fp4 = 1.000000e+04 : f32, fp6 = 1.000000e+04 : f32, fp8 = 5.000000e+03 : f32, int8 = 5.000000e+03 : f32}>>
 
 // GFX1100: target_info = #iree_gpu.target<arch = "gfx1100",
 // GFX1100-SAME:        mma = [<WMMAR3_F32_16x16x16_F16>, <WMMAR3_F16_16x16x16_F16>, <WMMAR3_F32_16x16x16_BF16>, <WMMAR3_BF16_16x16x16_BF16>, <WMMAR3_I32_16x16x16_I8>]
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td
@@ -33,13 +33,18 @@ def IREEGPU_CIBW_16 : I32BitEnumAttrCaseBit<"Int16", 5, "int16">;
 def IREEGPU_CIBW_8  : I32BitEnumAttrCaseBit<"Int8",  6, "int8">;
 // Generic 8-bit floating point format in computation
 def IREEGPU_CFBW_8  : I32BitEnumAttrCaseBit<"FP8",  7, "fp8">;
+// Generic 6-bit floating point format in computation
+def IREEGPU_CFBW_6  : I32BitEnumAttrCaseBit<"FP6",  8, "fp6">;
+// Generic 4-bit floating point format in computation
+def IREEGPU_CFBW_4  : I32BitEnumAttrCaseBit<"FP4",  9, "fp4">;
 
 
 def IREEGPU_ComputeBitwidths : I32BitEnumAttr<
   "ComputeBitwidths", "Supported bitwidths for compute",
   [IREEGPU_CFBW_64, IREEGPU_CFBW_32, IREEGPU_CFBW_16,
    IREEGPU_CIBW_64, IREEGPU_CIBW_32, IREEGPU_CIBW_16,
-   IREEGPU_CIBW_8,  IREEGPU_CFBW_8]> {
+   IREEGPU_CIBW_8, IREEGPU_CFBW_8, IREEGPU_CFBW_6,
+   IREEGPU_CFBW_4]> {
   let cppNamespace = "::mlir::iree_compiler::IREE::GPU";
   let genSpecializedAttr = 0;
 }
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp
@@ -452,6 +452,27 @@ std::optional<TargetDetails> getAMDGPUTargetDetails(StringRef target) {
   const WgpDetails *rdna1Wgp = getRDNA1WgpDetails();
 
   // --- CDNA --- //
+  // "AMD Instinct MI350 Series Product Offerings" in Page 18 of
+  // https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/white-papers/amd-cdna-4-architecture-whitepaper.pdf
+  static const ChipDetails mi350xChip = {256,       // wgp_count
+                                         "mi350x",  // sku
+                                         8.0f,      // memory_bandwidth_tbps
+                                         {{ComputeBitwidths::FP32, 144.2f},
+                                          {ComputeBitwidths::FP16, 2300.0f},
+                                          {ComputeBitwidths::Int8, 4600.0f},
+                                          {ComputeBitwidths::FP8, 4600.0f},
+                                          {ComputeBitwidths::FP6, 9200.0f},
+                                          {ComputeBitwidths::FP4, 9200.0f}}};
+
+  static const ChipDetails mi355xChip = {256,       // wgp_count
+                                         "mi355x",  // sku
+                                         8.0f,      // memory_bandwidth_tbps
+                                         {{ComputeBitwidths::FP32, 157.3f},
+                                          {ComputeBitwidths::FP16, 2500.0f},
+                                          {ComputeBitwidths::Int8, 5000.0f},
+                                          {ComputeBitwidths::FP8, 5000.0f},
+                                          {ComputeBitwidths::FP6, 10000.0f},
+                                          {ComputeBitwidths::FP4, 10000.0f}}};
 
   // "AMD Instinct MI300 Series Product Offerings" in Page 23 of
   // https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/white-papers/amd-cdna-3-white-paper.pdf
@@ -580,6 +601,8 @@ std::optional<TargetDetails> getAMDGPUTargetDetails(StringRef target) {
   // See https://llvm.org/docs/AMDGPUUsage.html#processors for gfxN to
   // cdnaN/rdnaN mapping.
   return llvm::StringSwitch<std::optional<TargetDetails>>(target.lower())
+      .Case("mi355x", TargetDetails{cdna4Wgp, &mi355xChip})
+      .Case("mi350x", TargetDetails{cdna4Wgp, &mi350xChip})
       .Cases("cdna4", "gfx950", TargetDetails{cdna4Wgp, nullptr})
       .Case("mi325x", TargetDetails{cdna3Wgp, &mi325xChip})
       .Case("mi300x", TargetDetails{cdna3Wgp, &mi300xChip})
@@ -636,6 +659,7 @@ StringRef normalizeAMDGPUTarget(StringRef target) {
     return target;
 
   return llvm::StringSwitch<StringRef>(target.lower())
+      .Cases("mi350x", "mi355x", "gfx950")
       .Cases("mi300a", "mi300x", "mi308x", "mi325x", "gfx942")
       .Cases("mi250x", "mi250", "mi210", "cdna2", "gfx90a")
       .Cases("mi100", "cdna1", "gfx908")
diff --git a/docs/website/docs/guides/deployment-configurations/gpu-rocm.md b/docs/website/docs/guides/deployment-configurations/gpu-rocm.md
@@ -193,6 +193,8 @@ architectures:
 | AMD MI300X               | `mi300x`    | `gfx942`            | `cdna3`                |
 | AMD MI308X               | `mi308x`    | `gfx942`            | `cdna3`                |
 | AMD MI325X               | `mi325x`    | `gfx942`            | `cdna3`                |
+| AMD MI350X               | `mi350x`    | `gfx950`            | `cdna4`                |
+| AMD MI355X               | `mi355x`    | `gfx950`            | `cdna4`                |
 | AMD PRO V710             | `v710`      | `gfx1101`           | `rdna3`                |
 | AMD PRO W7700            | `w7700`     | `gfx1101`           | `rdna3`                |
 | AMD PRO W7800            | `w7800`     | `gfx1100`           | `rdna3`                |