add double rate insns

umangyadav · umangyadav · commit c2ec6f0b2524 · 2025-06-19T13:29:21.000Z
diff --git a/mlir/lib/Dialect/Rock/IR/MfmaInsnGroup.cpp b/mlir/lib/Dialect/Rock/IR/MfmaInsnGroup.cpp
@@ -17,8 +17,8 @@ using namespace mlir;
 using namespace mlir::rock;
 
 // The static initialization will follow the defined ordering
-// of the below lambdas
-auto getMfmaInsnInfoMap = []() -> const llvm::StringMap<MfmaInsnInfo> & {
+// of the below lambda
+static auto getMfmaInsnInfoMap = []() -> const llvm::StringMap<MfmaInsnInfo> & {
   static llvm::StringMap<MfmaInsnInfo> insnInfo{
       // fp32
       {ROCDL::mfma_f32_32x32x1f32::getOperationName(),
@@ -37,8 +37,12 @@ auto getMfmaInsnInfoMap = []() -> const llvm::StringMap<MfmaInsnInfo> & {
        {MfmaTypeId::Fp16TyId, 32, 4, 2}},
       {ROCDL::mfma_f32_32x32x8f16::getOperationName(),
        {MfmaTypeId::Fp16TyId, 32, 8, 1}},
+      {ROCDL::mfma_f32_32x32x16_f16::getOperationName(),
+       {MfmaTypeId::Fp16TyId, 32, 16, 1}},
       {ROCDL::mfma_f32_16x16x4f16::getOperationName(),
        {MfmaTypeId::Fp16TyId, 16, 4, 4}},
+      {ROCDL::mfma_f32_16x16x32_f16::getOperationName(),
+       {MfmaTypeId::Fp16TyId, 16, 32, 1}},
       {ROCDL::mfma_f32_16x16x16f16::getOperationName(),
        {MfmaTypeId::Fp16TyId, 16, 16, 1}},
       {ROCDL::mfma_f32_4x4x4f16::getOperationName(),
@@ -47,10 +51,14 @@ auto getMfmaInsnInfoMap = []() -> const llvm::StringMap<MfmaInsnInfo> & {
       // bf16
       {ROCDL::mfma_f32_32x32x2bf16::getOperationName(),
        {MfmaTypeId::Bf16TyId, 32, 2, 2}},
+      {ROCDL::mfma_f32_32x32x16_bf16::getOperationName(),
+       {MfmaTypeId::Bf16TyId, 32, 16, 1}},
       {ROCDL::mfma_f32_32x32x4bf16::getOperationName(),
        {MfmaTypeId::Bf16TyId, 32, 4, 1}},
       {ROCDL::mfma_f32_16x16x2bf16::getOperationName(),
        {MfmaTypeId::Bf16TyId, 16, 2, 4}},
+      {ROCDL::mfma_f32_16x16x32_bf16::getOperationName(),
+       {MfmaTypeId::Bf16TyId, 16, 32, 1}},
       {ROCDL::mfma_f32_16x16x8bf16::getOperationName(),
        {MfmaTypeId::Bf16TyId, 16, 8, 1}},
       {ROCDL::mfma_f32_4x4x2bf16::getOperationName(),
@@ -77,8 +85,12 @@ auto getMfmaInsnInfoMap = []() -> const llvm::StringMap<MfmaInsnInfo> & {
       // i8 (new)
       {ROCDL::mfma_i32_32x32x16_i8::getOperationName(),
        {MfmaTypeId::I8TyId, 32, 16, 1}},
+      {ROCDL::mfma_i32_32x32x32_i8::getOperationName(),
+       {MfmaTypeId::I8TyId, 32, 32, 1}},
       {ROCDL::mfma_i32_16x16x32_i8::getOperationName(),
        {MfmaTypeId::I8TyId, 16, 32, 1}},
+      {ROCDL::mfma_i32_16x16x64_i8::getOperationName(),
+       {MfmaTypeId::I8TyId, 16, 64, 1}},
 
       // fp8
       {ROCDL::mfma_f32_32x32x16_fp8_fp8::getOperationName(),
@@ -178,7 +190,7 @@ static MfmaInsnAttr deriveAttr(MfmaInsnInfo info) {
           isKReduction};
 }
 
-auto getMfmaInsnAttrMap = []() -> const llvm::StringMap<MfmaInsnAttr> & {
+static auto getMfmaInsnAttrMap = []() -> const llvm::StringMap<MfmaInsnAttr> & {
   static llvm::StringMap<MfmaInsnAttr> insnDb;
   static std::once_flag once;
   std::call_once(once, [&]() {
@@ -194,7 +206,7 @@ auto getMfmaInsnAttrMap = []() -> const llvm::StringMap<MfmaInsnAttr> & {
 using MfmaInsnGroupMap =
     llvm::DenseMap<MfmaInsnGroupSelectKey, MfmaInsnGroupAttr,
                    MfmaInsnGroupSelectKeyInfo>;
-auto getMfmaInsnGroupAttrMapAllArch = []() -> const MfmaInsnGroupMap & {
+static auto getMfmaInsnGroupAttrMapAllArch = []() -> const MfmaInsnGroupMap & {
   using amdgpu::MFMAPermB;
   static MfmaInsnGroupMap
       // f32
@@ -242,7 +254,8 @@ auto getMfmaInsnGroupAttrMapAllArch = []() -> const MfmaInsnGroupMap & {
   return groupAttrMap;
 };
 
-auto getMfmaInsnGroupAttrMapGfx908Bf16 = []() -> const MfmaInsnGroupMap & {
+static auto getMfmaInsnGroupAttrMapGfx908Bf16 =
+    []() -> const MfmaInsnGroupMap & {
   using amdgpu::MFMAPermB;
   static MfmaInsnGroupMap
       // bf16
@@ -269,7 +282,7 @@ auto getMfmaInsnGroupAttrMapGfx908Bf16 = []() -> const MfmaInsnGroupMap & {
   return groupAttrMap;
 };
 
-auto getMfmaInsnGroupAttrMapGfx90aPlusBf16 = []() {
+static auto getMfmaInsnGroupAttrMapGfx90aPlusBf16 = []() {
   using amdgpu::MFMAPermB;
   static llvm::DenseMap<MfmaInsnGroupSelectKey, MfmaInsnGroupAttr,
                         MfmaInsnGroupSelectKeyInfo>
@@ -297,7 +310,7 @@ auto getMfmaInsnGroupAttrMapGfx90aPlusBf16 = []() {
   return groupAttrMap;
 };
 
-auto getMfmaInsnGroupAttrMapPreGfx942Int8 = []() {
+static auto getMfmaInsnGroupAttrMapPreGfx942Int8 = []() {
   using amdgpu::MFMAPermB;
   static llvm::DenseMap<MfmaInsnGroupSelectKey, MfmaInsnGroupAttr,
                         MfmaInsnGroupSelectKeyInfo>
@@ -321,7 +334,7 @@ auto getMfmaInsnGroupAttrMapPreGfx942Int8 = []() {
 };
 
 // New I8 and all Float8
-auto getMfmaInsnGroupAttrMapGfx942Plus = []() {
+static auto getMfmaInsnGroupAttrMapGfx942 = []() {
   using amdgpu::MFMAPermB;
   static MfmaInsnGroupMap
       // Int8
@@ -407,6 +420,28 @@ auto getMfmaInsnGroupAttrMapGfx942Plus = []() {
   return groupAttrMap;
 };
 
+// Double-rate MFMA groups for gfx950; returns the static map by reference.
+static auto getMfmaInsnGroupAttrMapGfx950 = []() -> const MfmaInsnGroupMap & {
+  static MfmaInsnGroupMap groupAttrMap{
+      // fp16 double rate
+      {{MfmaTypeId::Fp16TyId, 16, 16},
+       {ROCDL::mfma_f32_16x16x32_f16::getOperationName()}},
+      {{MfmaTypeId::Fp16TyId, 32, 32},
+       {ROCDL::mfma_f32_32x32x16_f16::getOperationName()}},
+      // bf16 double rate
+      {{MfmaTypeId::Bf16TyId, 16, 16},
+       {ROCDL::mfma_f32_16x16x32_bf16::getOperationName()}},
+      {{MfmaTypeId::Bf16TyId, 32, 32},
+       {ROCDL::mfma_f32_32x32x16_bf16::getOperationName()}},
+      // i8 double rate
+      {{MfmaTypeId::I8TyId, 16, 16},
+       {ROCDL::mfma_i32_16x16x64_i8::getOperationName()}},
+      {{MfmaTypeId::I8TyId, 32, 32},
+       {ROCDL::mfma_i32_32x32x32_i8::getOperationName()}}
+  };
+  return groupAttrMap;
+};
+
 FailureOr<MfmaInsn> MfmaInsn::select(StringRef mfmaInsn) {
   auto mfmaInsnAttrMap = getMfmaInsnAttrMap();
   auto it = mfmaInsnAttrMap.find(mfmaInsn);
@@ -546,13 +581,35 @@ FailureOr<MfmaInsnGroup> MfmaInsnGroup::select(Type elementTypeA,
       result = MfmaInsnGroup(elementTypeA, elementTypeB, *maybeInsn, groupAttr);
     }
   };
-  bool hasOldBf16 = arch.contains("gfx908");
-  bool isPreGfx942 = arch.contains("gfx908") || arch.contains("gfx90a");
-  if (elementTypeA.isBF16())
-    selectFrom(hasOldBf16 ? getMfmaInsnGroupAttrMapGfx908Bf16()
-                          : getMfmaInsnGroupAttrMapGfx90aPlusBf16());
-  selectFrom(isPreGfx942 ? getMfmaInsnGroupAttrMapPreGfx942Int8()
-                         : getMfmaInsnGroupAttrMapGfx942Plus());
+  bool isGfx908 = arch.contains("gfx908");
+  bool isGfx90a = arch.contains("gfx908") || arch.contains("gfx90a");
+  bool isGfx94x = arch.contains("gfx942") || arch.contains("gfx940");
+  bool isGfx95x = arch.contains("gfx950");
+  // TODO: refactor this later to not keep multiple maps for different arches
+  if (elementTypeA.isBF16()) {
+    if (isGfx908) {
+      selectFrom(getMfmaInsnGroupAttrMapGfx908Bf16());
+    } else if (isGfx94x || isGfx90a) {
+      selectFrom(getMfmaInsnGroupAttrMapGfx90aPlusBf16());
+    } else {
+      // gfx950 has double rate instructions. Select from those first.
+      selectFrom(getMfmaInsnGroupAttrMapGfx950());
+      selectFrom(getMfmaInsnGroupAttrMapGfx90aPlusBf16());
+    }
+  }
+
+  if (isGfx908 || isGfx90a) {
+    selectFrom(getMfmaInsnGroupAttrMapPreGfx942Int8());
+  } else if (isGfx94x) {
+    selectFrom(getMfmaInsnGroupAttrMapGfx942());
+  } else if (isGfx95x) {
+    // select from new double rate instructions first
+    selectFrom(getMfmaInsnGroupAttrMapGfx950());
+    // all previous instructions are still valid for gfx950
+    selectFrom(getMfmaInsnGroupAttrMapGfx942());
+  }
+  // fall back to the instructions available on all architectures if nothing
+  // has been selected yet
   selectFrom(getMfmaInsnGroupAttrMapAllArch());
   if (failed(result)) {
     LLVM_DEBUG(llvm::dbgs() << "No match found in MFMA database\n");
diff --git a/mlir/lib/Dialect/Rock/Tuning/GridwiseGemmParams.cpp b/mlir/lib/Dialect/Rock/Tuning/GridwiseGemmParams.cpp
@@ -463,10 +463,10 @@ LogicalResult PopulateParamsXDL::isValidBlockwiseGemm(
   if (minDPerWave <= 16) {
     validKPerWaveFactor = 4;
   }
-  if (!((param.getMPerBlock() % minDPerWave == 0) &&
-        (param.getNPerBlock() % minDPerWave == 0) &&
-        ((param.getKpackPerBlock() * param.getKpack()) % validKPerWaveFactor ==
-         0))) {
+  if ((param.getMPerBlock() % minDPerWave != 0) ||
+        (param.getNPerBlock() % minDPerWave != 0) ||
+        ((param.getKpackPerBlock() * param.getKpack()) % validKPerWaveFactor !=
+         0)) {
     return failure();
   }
 
@@ -515,7 +515,7 @@ LogicalResult PopulateParamsXDL::isValidBlockwiseGemm(
 
   // Sledgehammer hotfix because not unrolling sometimes makes the register
   // allocator break. This should be refined quickly.
-  if (cast<RockTuningParamAttrInterface>(param).getForceUnroll() == false) {
+  if (!cast<RockTuningParamAttrInterface>(param).getForceUnroll()) {
     return failure();
   }
 
@@ -585,10 +585,7 @@ PopulateParamsXDL::getTuningParameters(KernelType opType, Type dataTypeA,
           return false;
         }
         MfmaInsnGroup mfmaGroup = *maybeMfmaInsnGroup;
-        if (!mfmaGroup.isCoherentWithK(param.gemmKPack, param.gemmKPerBlock)) {
-          return false;
-        }
-        return true;
+        return mfmaGroup.isCoherentWithK(param.gemmKPack, param.gemmKPerBlock);
       });
   return res;
 }
diff --git a/mlir/test/Dialect/Rock/lowering_xdlops_gemm.mlir b/mlir/test/Dialect/Rock/lowering_xdlops_gemm.mlir
@@ -270,3 +270,171 @@ func.func @accel_gemm_fp8_bf8_ocp(%matrixA : memref<1x4xvector<8xf8E4M3FN>, #gpu
   } : memref<2x2xvector<16xf32>, #gpu.address_space<private>> += memref<1x4xvector<8xf8E4M3FN>, #gpu.address_space<private>> * memref<1x4xvector<8xf8E5M2>, #gpu.address_space<private>>
   return
 }
+
+func.func @accel_gemm_gfx950_f16_16x16x32(%matrixA : memref<1x2xvector<8xf16>, 5>,
+                                                 %matrixB : memref<1x2xvector<8xf16>, 5>,
+                                                 %matrixC : memref<1x1xvector<4xf32>, 5>) { // gfx950, f16, mnPerXdl = 16: expect one 16x16x32 mfma and none after it
+  // CHECK-LABEL: func.func @accel_gemm_gfx950_f16_16x16x32
+  // CHECK: rock.transforming_for
+  // CHECK-SAME: bounds [1, 1, 1]
+  // CHECK: amdgpu.mfma
+  // CHECK-SAME: blocks = 1 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32
+  // CHECK-NOT: amdgpu.mfma
+  %c0 = arith.constant 0 : index // zero reused for all three "at" coordinates
+  rock.threadwise_accel_gemm %matrixC += %matrixA * %matrixB at [%c0, %c0, %c0] features = mfma {
+    arch = "amdgcn-amd-amdhsa:gfx950",
+    params = #rock.xdlops_gemm_derived_params<
+      kpackPerBlock = 8,
+      kpack = 8,
+      mPerWave = 16,
+      nPerWave = 16,
+      mPerBlock = 16,
+      nPerBlock = 16,
+      mnPerXdl = 16,
+      splitKFactor = 1, 
+      scheduleVersion = 1, 
+      outputSwizzle = 2,
+      forceUnroll = true>
+  } : memref<1x1xvector<4xf32>, 5> += memref<1x2xvector<8xf16>, 5> * memref<1x2xvector<8xf16>, 5>
+  return
+}
+
+func.func @accel_gemm_gfx950_bf16_16x16x32(%matrixA : memref<1x2xvector<8xbf16>, 5>,
+                                                 %matrixB : memref<1x2xvector<8xbf16>, 5>,
+                                                 %matrixC : memref<1x1xvector<4xf32>, 5>) { // gfx950, bf16, mnPerXdl = 16: expect one 16x16x32 mfma and none after it
+  // CHECK-LABEL: func.func @accel_gemm_gfx950_bf16_16x16x32
+  // CHECK: rock.transforming_for
+  // CHECK-SAME: bounds [1, 1, 1]
+  // CHECK: amdgpu.mfma
+  // CHECK-SAME: blocks = 1 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32
+  // CHECK-NOT: amdgpu.mfma
+  %c0 = arith.constant 0 : index // zero reused for all three "at" coordinates
+  rock.threadwise_accel_gemm %matrixC += %matrixA * %matrixB at [%c0, %c0, %c0] features = mfma {
+    arch = "amdgcn-amd-amdhsa:gfx950",
+    params = #rock.xdlops_gemm_derived_params<
+      kpackPerBlock = 8,
+      kpack = 8,
+      mPerWave = 16,
+      nPerWave = 16,
+      mPerBlock = 16,
+      nPerBlock = 16,
+      mnPerXdl = 16,
+      splitKFactor = 1, 
+      scheduleVersion = 1, 
+      outputSwizzle = 2,
+      forceUnroll = true>
+  } : memref<1x1xvector<4xf32>, 5> += memref<1x2xvector<8xbf16>, 5> * memref<1x2xvector<8xbf16>, 5>
+  return
+}
+
+func.func @accel_gemm_gfx950_f16_32x32x16(%matrixA : memref<1x2xvector<8xf16>, 5>,
+                                                 %matrixB : memref<1x2xvector<8xf16>, 5>,
+                                                 %matrixC : memref<1x1xvector<16xf32>, 5>) { // gfx950, f16, mnPerXdl = 32: expect one 32x32x16 mfma and none after it
+  // CHECK-LABEL: func.func @accel_gemm_gfx950_f16_32x32x16
+  // CHECK: rock.transforming_for
+  // CHECK-SAME: bounds [1, 1, 1]
+  // CHECK: amdgpu.mfma
+  // CHECK-SAME: blocks = 1 : i32, k = 16 : i32, m = 32 : i32, n = 32 : i32
+  // CHECK-NOT: amdgpu.mfma
+  %c0 = arith.constant 0 : index // zero reused for all three "at" coordinates
+  rock.threadwise_accel_gemm %matrixC += %matrixA * %matrixB at [%c0, %c0, %c0] features = mfma {
+    arch = "amdgcn-amd-amdhsa:gfx950",
+    params = #rock.xdlops_gemm_derived_params<
+      kpackPerBlock = 4,
+      kpack = 8,
+      mPerWave = 32,
+      nPerWave = 32,
+      mPerBlock = 32,
+      nPerBlock = 32,
+      mnPerXdl = 32,
+      splitKFactor = 1, 
+      scheduleVersion = 1, 
+      outputSwizzle = 2,
+      forceUnroll = true>
+  } : memref<1x1xvector<16xf32>, 5> += memref<1x2xvector<8xf16>, 5> * memref<1x2xvector<8xf16>, 5>
+  return
+}
+
+func.func @accel_gemm_gfx950_bf16_32x32x16(%matrixA : memref<1x2xvector<8xbf16>, 5>,
+                                                 %matrixB : memref<1x2xvector<8xbf16>, 5>,
+                                                 %matrixC : memref<1x1xvector<16xf32>, 5>) { // gfx950, bf16, mnPerXdl = 32: expect one 32x32x16 mfma and none after it
+  // CHECK-LABEL: func.func @accel_gemm_gfx950_bf16_32x32x16
+  // CHECK: rock.transforming_for
+  // CHECK-SAME: bounds [1, 1, 1]
+  // CHECK: amdgpu.mfma
+  // CHECK-SAME: blocks = 1 : i32, k = 16 : i32, m = 32 : i32, n = 32 : i32
+  // CHECK-NOT: amdgpu.mfma
+  %c0 = arith.constant 0 : index // zero reused for all three "at" coordinates
+  rock.threadwise_accel_gemm %matrixC += %matrixA * %matrixB at [%c0, %c0, %c0] features = mfma {
+    arch = "amdgcn-amd-amdhsa:gfx950",
+    params = #rock.xdlops_gemm_derived_params<
+      kpackPerBlock = 4,
+      kpack = 8,
+      mPerWave = 32,
+      nPerWave = 32,
+      mPerBlock = 32,
+      nPerBlock = 32,
+      mnPerXdl = 32,
+      splitKFactor = 1, 
+      scheduleVersion = 1, 
+      outputSwizzle = 2,
+      forceUnroll = true>
+  } : memref<1x1xvector<16xf32>, 5> += memref<1x2xvector<8xbf16>, 5> * memref<1x2xvector<8xbf16>, 5>
+  return
+}
+
+func.func @accel_gemm_gfx950_i8_32x32x32(%matrixA : memref<1x4xvector<16xi8>, 5>,
+                                                 %matrixB : memref<1x4xvector<16xi8>, 5>,
+                                                 %matrixC : memref<1x1xvector<16xi32>, 5>) { // gfx950, i8, mnPerXdl = 32: expect one 32x32x32 mfma and none after it
+  // CHECK-LABEL: func.func @accel_gemm_gfx950_i8_32x32x32
+  // CHECK: rock.transforming_for
+  // CHECK-SAME: bounds [1, 1, 1]
+  // CHECK: amdgpu.mfma
+  // CHECK-SAME: blocks = 1 : i32, k = 32 : i32, m = 32 : i32, n = 32 : i32
+  // CHECK-NOT: amdgpu.mfma
+  %c0 = arith.constant 0 : index // zero reused for all three "at" coordinates
+  rock.threadwise_accel_gemm %matrixC += %matrixA * %matrixB at [%c0, %c0, %c0] features = mfma {
+    arch = "amdgcn-amd-amdhsa:gfx950",
+    params = #rock.xdlops_gemm_derived_params<
+      kpackPerBlock = 8,
+      kpack = 16,
+      mPerWave = 32,
+      nPerWave = 32,
+      mPerBlock = 32,
+      nPerBlock = 32,
+      mnPerXdl = 32,
+      splitKFactor = 1, 
+      scheduleVersion = 1, 
+      outputSwizzle = 2,
+      forceUnroll = true>
+  } : memref<1x1xvector<16xi32>, 5> += memref<1x4xvector<16xi8>, 5> * memref<1x4xvector<16xi8>, 5>
+  return
+}
+
+func.func @accel_gemm_gfx950_i8_16x16x64(%matrixA : memref<1x2xvector<16xi8>, 5>,
+                                                 %matrixB : memref<1x2xvector<16xi8>, 5>,
+                                                 %matrixC : memref<1x1xvector<4xi32>, 5>) { // gfx950, i8, mnPerXdl = 16: expect one 16x16x64 mfma and none after it
+  // CHECK-LABEL: func.func @accel_gemm_gfx950_i8_16x16x64
+  // CHECK: rock.transforming_for
+  // CHECK-SAME: bounds [1, 1, 1]
+  // CHECK: amdgpu.mfma
+  // CHECK-SAME: blocks = 1 : i32, k = 64 : i32, m = 16 : i32, n = 16 : i32
+  // CHECK-NOT: amdgpu.mfma
+  %c0 = arith.constant 0 : index // zero reused for all three "at" coordinates
+  rock.threadwise_accel_gemm %matrixC += %matrixA * %matrixB at [%c0, %c0, %c0] features = mfma {
+    arch = "amdgcn-amd-amdhsa:gfx950",
+    params = #rock.xdlops_gemm_derived_params<
+      kpackPerBlock = 8,
+      kpack = 16,
+      mPerWave = 16,
+      nPerWave = 16,
+      mPerBlock = 32,
+      nPerBlock = 32,
+      mnPerXdl = 16,
+      splitKFactor = 1, 
+      scheduleVersion = 1, 
+      outputSwizzle = 2,
+      forceUnroll = true>
+  } : memref<1x1xvector<4xi32>, 5> += memref<1x2xvector<16xi8>, 5> * memref<1x2xvector<16xi8>, 5>
+  return
+}