MaheshRavishankar
diff --git a/compiler/plugins/target/ROCM/ROCMTarget.cpp b/compiler/plugins/target/ROCM/ROCMTarget.cpp
Lines changed: 0 additions & 1 deletion
diff --git a/compiler/src/iree/compiler/API/Internal/LLDToolEntryPoint.cpp b/compiler/src/iree/compiler/API/Internal/LLDToolEntryPoint.cpp
Lines changed: 2 additions & 2 deletions
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_contract_amdgpu.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_contract_amdgpu.mlir
Lines changed: 14 additions & 22 deletions
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp
Lines changed: 2 additions & 1 deletion
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/lower_inner_tiled.mlir b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/lower_inner_tiled.mlir
Lines changed: 10 additions & 18 deletions
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_direct_conv_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_direct_conv_tile_and_fuse.mlir
Lines changed: 1 addition & 1 deletion
@@ -639,7 +639,6 @@ class ROCMTargetBackend final : public TargetBackend {
         }
         llvm::TargetOptions opt;
         opt.AllowFPOpFusion = llvm::FPOpFusion::Fast;
-        opt.UnsafeFPMath = false;
         opt.NoInfsFPMath = false;
         opt.NoNaNsFPMath = true;
         // Be extra cautious while this is less tested, and prevent unknown
 
@@ -59,8 +59,8 @@ LLD_HAS_DRIVER(wasm)
 
 static Flavor getFlavor(StringRef s) {
   return StringSwitch<Flavor>(s)
-      .CasesLower("ld", "ld.lld", "gnu", Gnu)
-      .CasesLower("wasm", "ld-wasm", Wasm)
+      .CasesLower({"ld", "ld.lld", "gnu"}, Gnu)
+      .CasesLower({"wasm", "ld-wasm"}, Wasm)
       .CaseLower("link", WinLink)
       .CasesLower(
           {"ld64", "ld64.lld", "darwin", "darwinnew", "ld64.lld.darwinnew"},
 
@@ -77,8 +77,7 @@ builtin.module attributes { transform.with_named_sequence } {
 // CHECK:       %[[A_CAST:.+]] = vector.shape_cast %[[A_VEC]] : vector<1x1x1x4xf16> to vector<4xf16>
 // CHECK:       %[[B_CAST:.+]] = vector.shape_cast %[[B_VEC]] : vector<1x1x4x1xf16> to vector<4xf16>
 // CHECK:       %[[C_CAST:.+]] = vector.shape_cast %[[C_VEC]] : vector<4x1x4x1xf32> to vector<16xf32>
-// CHECK:       %[[MFMA:.+]] = amdgpu.mfma %[[A_CAST]] * %[[B_CAST]] + %[[C_CAST]]
-// CHECK-SAME:     {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp =  none
+// CHECK:       %[[MFMA:.+]] = amdgpu.mfma 32x32x8 %[[A_CAST]] * %[[B_CAST]] + %[[C_CAST]] blgp =  none
 // CHECK-SAME:     : vector<4xf16>, vector<4xf16>, vector<16xf32>
 // CHECK:       %[[R_CAST:.+]] = vector.shape_cast %[[MFMA]] : vector<16xf32> to vector<4x1x4x1xf32>
 // CHECK:       %[[B_OUT:.*]] = vector.broadcast %[[R_CAST]] : vector<4x1x4x1xf32> to vector<1x1x4x1x4x1xf32>
@@ -154,8 +153,7 @@ builtin.module attributes { transform.with_named_sequence } {
 //       CHECK:   %[[A_CAST:.+]] = vector.shape_cast %[[A_VEC]] : vector<1x1x1x4xf16> to vector<4xf16>
 //       CHECK:   %[[B_CAST:.+]] = vector.shape_cast %[[B_VEC]] : vector<1x1x4x1xf16> to vector<4xf16>
 //       CHECK:   %[[C_CAST:.+]] = vector.shape_cast %[[C_VEC]] : vector<1x1x4x1xf32> to vector<4xf32>
-//       CHECK:   %[[MFMA:.+]] = amdgpu.mfma %[[A_CAST]] * %[[B_CAST]] + %[[C_CAST]]
-//  CHECK-SAME:     {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp =  none
+//       CHECK:   %[[MFMA:.+]] = amdgpu.mfma 16x16x16 %[[A_CAST]] * %[[B_CAST]] + %[[C_CAST]] blgp =  none
 //  CHECK-SAME:     : vector<4xf16>, vector<4xf16>, vector<4xf32>
 
 //       CHECK:   %[[R_CAST:.+]] = vector.shape_cast %[[MFMA]]  : vector<4xf32> to vector<1x1x4x1xf32>
@@ -238,13 +236,13 @@ builtin.module attributes { transform.with_named_sequence } {
 //       CHECK:   %[[A_SLICE0:.+]] = vector.extract %[[A_SIMT]][0, 0] : vector<1x1x1x4xf16> from vector<2x1x1x1x1x4xf16>
 //       CHECK:   %[[A0_CAST:.+]] = vector.shape_cast %[[A_SLICE0]] : vector<1x1x1x4xf16> to vector<4xf16>
 //       CHECK:   %[[C0_CAST:.+]] = vector.shape_cast %[[C_SLICE0]] : vector<4x1x4x1xf32> to vector<16xf32>
-//       CHECK:   %[[MFMA0:.+]] = amdgpu.mfma %[[A0_CAST]] * %{{.+}} + %[[C0_CAST]]
+//       CHECK:   %[[MFMA0:.+]] = amdgpu.mfma 32x32x8 %[[A0_CAST]] * %{{.+}} + %[[C0_CAST]]
 //       CHECK:   %[[R0_CAST:.+]] = vector.shape_cast %[[MFMA0]] : vector<16xf32> to vector<4x1x4x1xf32>
 //       CHECK:   %[[C_SLICE1:.+]] = vector.extract %[[C_SIMT]][1, 0] : vector<4x1x4x1xf32> from vector<2x1x4x1x4x1xf32>
 //       CHECK:   %[[A_SLICE1:.+]] = vector.extract %[[A_SIMT]][1, 0] : vector<1x1x1x4xf16> from vector<2x1x1x1x1x4xf16>
 //       CHECK:   %[[A1_CAST:.+]] = vector.shape_cast %[[A_SLICE1]] : vector<1x1x1x4xf16> to vector<4xf16>
 //       CHECK:   %[[C1_CAST:.+]] = vector.shape_cast %[[C_SLICE1]] : vector<4x1x4x1xf32> to vector<16xf32>
-//       CHECK:   %[[MFMA1:.+]] = amdgpu.mfma %[[A1_CAST]] * %{{.+}} + %[[C1_CAST]]
+//       CHECK:   %[[MFMA1:.+]] = amdgpu.mfma 32x32x8 %[[A1_CAST]] * %{{.+}} + %[[C1_CAST]]
 //       CHECK:   %[[R1_CAST:.+]] = vector.shape_cast %[[MFMA1]] : vector<16xf32> to vector<4x1x4x1xf32>
 //       CHECK:   %[[R0:.+]]:16 = vector.to_elements %[[R0_CAST]] : vector<4x1x4x1xf32>
 //       CHECK:   %[[R1:.+]]:16 = vector.to_elements %[[R1_CAST]] : vector<4x1x4x1xf32>
@@ -329,12 +327,12 @@ builtin.module attributes { transform.with_named_sequence } {
 //       CHECK:   %[[B_SLICE0:.+]] = vector.extract %[[B_SIMT]][0, 0]
 //       CHECK:   %[[A0_CAST:.+]] = vector.shape_cast %[[A_SLICE0]]
 //       CHECK:   %[[B0_CAST:.+]] = vector.shape_cast %[[B_SLICE0]]
-//       CHECK:   %[[MFMA0:.+]] = amdgpu.mfma %[[A0_CAST]] * %[[B0_CAST]] + %{{.+}}
+//       CHECK:   %[[MFMA0:.+]] = amdgpu.mfma 32x32x8 %[[A0_CAST]] * %[[B0_CAST]] + %{{.+}}
 //       CHECK:   %[[A_SLICE1:.+]] = vector.extract %[[A_SIMT]][0, 1]
 //       CHECK:   %[[B_SLICE1:.+]] = vector.extract %[[B_SIMT]][1, 0]
 //       CHECK:   %[[A1_CAST:.+]] = vector.shape_cast %[[A_SLICE1]]
 //       CHECK:   %[[B1_CAST:.+]] = vector.shape_cast %[[B_SLICE1]]
-//       CHECK:   %[[MFMA1:.+]] = amdgpu.mfma %[[A1_CAST]] * %[[B1_CAST]] + %[[MFMA0]]
+//       CHECK:   %[[MFMA1:.+]] = amdgpu.mfma 32x32x8 %[[A1_CAST]] * %[[B1_CAST]] + %[[MFMA0]]
 //       CHECK:   %[[R_CAST:.+]] = vector.shape_cast %[[MFMA1]]
 
 // -----
@@ -584,7 +582,7 @@ builtin.module attributes { transform.with_named_sequence } {
 //       CHECK:   %[[A_CAST:.+]] = vector.shape_cast %[[A_VEC]] : vector<1x1x1x16xf16> to vector<16xf1
 //       CHECK:   %[[B_CAST:.+]] = vector.shape_cast %[[B_VEC]] : vector<1x1x16x1xf16> to vector<16xf1
 //       CHECK:   %[[C_CAST:.+]] = vector.shape_cast %[[C_VEC]] : vector<8x1x1x1xf32> to vector<8xf32>
-//       CHECK:   %[[WMMA:.+]] = amdgpu.wmma %[[A_CAST]] * %[[B_CAST]] + %[[C_CAST]]
+//       CHECK:   %[[WMMA:.+]] = amdgpu.wmma 16x16x16 %[[A_CAST]] * %[[B_CAST]] + %[[C_CAST]]
 //       CHECK:   %[[R_CAST:.+]] = vector.shape_cast %[[WMMA]] : vector<8xf32> to vector<8x1x1x1xf32>
 //       CHECK:   %[[B_OUT:.*]] = vector.broadcast %[[R_CAST]] : vector<8x1x1x1xf32> to vector<1x1x8x1x1x1xf32>
 //       CHECK:   %[[R_SIMD:.+]] = iree_vector_ext.to_simd %[[B_OUT]] : vector<1x1x8x1x1x1xf32> -> vector<16x16xf32>
@@ -670,7 +668,7 @@ builtin.module attributes { transform.with_named_sequence } {
 //       CHECK:   %[[A_CAST:.+]] = vector.shape_cast %[[A_VEC]] : vector<1x1x1x8xf16> to vector<8xf16>
 //       CHECK:   %[[B_CAST:.+]] = vector.shape_cast %[[B_VEC]] : vector<1x1x8x1xf16> to vector<8xf16>
 //       CHECK:   %[[C_CAST:.+]] = vector.shape_cast %[[C_VEC]] : vector<1x1x8x1xf32> to vector<8xf32>
-//       CHECK:   %[[WMMA:.+]] = amdgpu.wmma %[[A_CAST]] * %[[B_CAST]] + %[[C_CAST]]
+//       CHECK:   %[[WMMA:.+]] = amdgpu.wmma 16x16x16 %[[A_CAST]] * %[[B_CAST]] + %[[C_CAST]]
 //       CHECK:   %[[R_CAST:.+]] = vector.shape_cast %[[WMMA]] : vector<8xf32> to vector<1x1x8x1xf32>
 //       CHECK:   %[[B_OUT:.*]] = vector.broadcast %[[R_CAST]] : vector<1x1x8x1xf32> to vector<1x1x1x1x8x1xf32>
 //       CHECK:   %[[R_SIMD:.+]] = iree_vector_ext.to_simd %[[B_OUT]] : vector<1x1x1x1x8x1xf32> -> vector<16x16xf32>
@@ -756,13 +754,11 @@ builtin.module attributes { transform.with_named_sequence } {
 // CHECK:       %[[C_CAST:.+]] = vector.shape_cast %{{.+}} : vector<4x1x4x1xf32> to vector<16xf32>
 // CHECK:       %[[A_SLICE_0:.+]] = vector.extract_strided_slice %[[A_CAST]] {offsets = [0], sizes = [4], strides = [1]} : vector<8xf16> to vector<4xf16>
 // CHECK:       %[[B_SLICE_0:.+]] = vector.extract_strided_slice %[[B_CAST]] {offsets = [0], sizes = [4], strides = [1]} : vector<8xf16> to vector<4xf16>
-// CHECK:       %[[MFMA_0:.*]] = amdgpu.mfma %[[A_SLICE_0]] * %[[B_SLICE_0]] + %[[C_CAST]]
-// CHECK-SAME:     {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp =  none
+// CHECK:       %[[MFMA_0:.*]] = amdgpu.mfma 32x32x8 %[[A_SLICE_0]] * %[[B_SLICE_0]] + %[[C_CAST]] blgp =  none
 // CHECK-SAME:     : vector<4xf16>, vector<4xf16>, vector<16xf32>
 // CHECK:       %[[A_SLICE_1:.+]] = vector.extract_strided_slice %[[A_CAST]] {offsets = [4], sizes = [4], strides = [1]} : vector<8xf16> to vector<4xf16>
 // CHECK:       %[[B_SLICE_1:.+]] = vector.extract_strided_slice %[[B_CAST]] {offsets = [4], sizes = [4], strides = [1]} : vector<8xf16> to vector<4xf16>
-// CHECK:       %[[MFMA_1:.+]] = amdgpu.mfma %[[A_SLICE_1]] * %[[B_SLICE_1]] + %[[MFMA_0]]
-// CHECK-SAME:     {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp =  none
+// CHECK:       %[[MFMA_1:.+]] = amdgpu.mfma 32x32x8 %[[A_SLICE_1]] * %[[B_SLICE_1]] + %[[MFMA_0]] blgp =  none
 // CHECK-SAME:     : vector<4xf16>, vector<4xf16>, vector<16xf32>
 // CHECK:       %[[R_CAST:.+]] = vector.shape_cast %[[MFMA_1]] : vector<16xf32> to vector<4x1x4x1xf32>
 // CHECK:       %[[B_OUT:.*]] = vector.broadcast %[[R_CAST]] : vector<4x1x4x1xf32> to vector<1x1x4x1x4x1xf32>
@@ -831,20 +827,16 @@ builtin.module attributes { transform.with_named_sequence } {
 // CHECK:       %[[A_CAST:.+]] = vector.shape_cast %{{.+}} : vector<1x1x1x1x1x8xf8E4M3FNUZ> to vector<8xf8E4M3FNUZ>
 // CHECK:       %[[B_CAST:.+]] = vector.shape_cast %{{.+}} : vector<1x1x1x1x1x8xf8E4M3FNUZ> to vector<8xf8E4M3FNUZ>
 // CHECK:       %[[C_CAST:.+]] = vector.shape_cast %{{.+}} : vector<1x1x4x1xf32> to vector<4xf32>
-// CHECK:       %[[MFMA_0:.*]] = amdgpu.mfma %[[A_CAST]] * %[[B_CAST]] + %[[C_CAST]]
-// CHECK-SAME:     {blocks = 1 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32} blgp =  none
+// CHECK:       %[[MFMA_0:.*]] = amdgpu.mfma 16x16x32 %[[A_CAST]] * %[[B_CAST]] + %[[C_CAST]] blgp =  none
 // CHECK:       %[[A_CAST_1:.+]] = vector.shape_cast %{{.+}} : vector<1x1x1x1x1x8xf8E4M3FNUZ> to vector<8xf8E4M3FNUZ>
 // CHECK:       %[[B_CAST_1:.+]] = vector.shape_cast %{{.+}} : vector<1x1x1x1x1x8xf8E4M3FNUZ> to vector<8xf8E4M3FNUZ>
-// CHECK:       %[[MFMA_1:.*]] = amdgpu.mfma %[[A_CAST_1]] * %[[B_CAST_1]] + %[[MFMA_0]]
-// CHECK-SAME:     {blocks = 1 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32} blgp =  none
+// CHECK:       %[[MFMA_1:.*]] = amdgpu.mfma 16x16x32 %[[A_CAST_1]] * %[[B_CAST_1]] + %[[MFMA_0]] blgp =  none
 // CHECK:       %[[MFMA_1_CAST:.*]] = vector.shape_cast %[[MFMA_1]] : vector<4xf32> to vector<1x1x4x1xf32>
 // CHECK:       %[[B_CAST_2:.+]] = vector.shape_cast %{{.+}} : vector<1x1x1x1x1x8xf8E4M3FNUZ> to vector<8xf8E4M3FNUZ>
 // CHECK:       %[[C_CAST_1:.+]] = vector.shape_cast %{{.+}} : vector<1x1x4x1xf32> to vector<4xf32>
-// CHECK:       %[[MFMA_2:.*]] = amdgpu.mfma %[[A_CAST]] * %[[B_CAST_2]] + %[[C_CAST_1]]
-// CHECK-SAME:     {blocks = 1 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32} blgp =  none
+// CHECK:       %[[MFMA_2:.*]] = amdgpu.mfma 16x16x32 %[[A_CAST]] * %[[B_CAST_2]] + %[[C_CAST_1]] blgp =  none
 // CHECK:       %[[B_CAST_3:.+]] = vector.shape_cast %{{.+}} : vector<1x1x1x1x1x8xf8E4M3FNUZ> to vector<8xf8E4M3FNUZ>
-// CHECK:       %[[MFMA_3:.*]] = amdgpu.mfma %[[A_CAST_1]] * %[[B_CAST_3]] + %[[MFMA_2]]
-// CHECK-SAME:     {blocks = 1 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32} blgp =  none
+// CHECK:       %[[MFMA_3:.*]] = amdgpu.mfma 16x16x32 %[[A_CAST_1]] * %[[B_CAST_3]] + %[[MFMA_2]] blgp =  none
 // CHECK:       %[[R_CAST:.+]] = vector.shape_cast %[[MFMA_3]] : vector<4xf32> to vector<1x1x4x1xf32>
 // CHECK:       %[[R0:.+]]:4 = vector.to_elements %[[MFMA_1_CAST]] : vector<1x1x4x1xf32>
 // CHECK:       %[[R1:.+]]:4 = vector.to_elements %[[R_CAST]] : vector<1x1x4x1xf32>
 
@@ -671,7 +671,8 @@ static Value createMmaOp(OpBuilder &builder, Location loc,
         .getResult();
   }
   if (is_AMD_WMMA(intrinsic)) {
-    return amdgpu::WMMAOp::create(builder, loc, resultType, lhs, rhs, acc)
+    return amdgpu::WMMAOp::create(builder, loc, resultType, layout.mSize,
+                                  layout.nSize, layout.kSize, lhs, rhs, acc)
         .getResult();
   }
   return {};
 
@@ -29,8 +29,7 @@ module attributes { transform.with_named_sequence } {
 //  CHECK-SAME:   %[[LHS:[A-Za-z0-9]+]]: vector<4xf16>
 //  CHECK-SAME:   %[[RHS:[A-Za-z0-9]+]]: vector<4xf16>
 //  CHECK-SAME:   %[[ACC:[A-Za-z0-9]+]]: vector<4xf32>
-//       CHECK:   amdgpu.mfma %[[LHS]] * %[[RHS]] + %[[ACC]]
-//  CHECK-SAME:     blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32
+//       CHECK:   amdgpu.mfma 16x16x16 %[[LHS]] * %[[RHS]] + %[[ACC]]
 //  CHECK-SAME:     blgp =  none : vector<4xf16>, vector<4xf16>, vector<4xf32>
 
 // -----
@@ -64,8 +63,7 @@ module attributes { transform.with_named_sequence } {
 //  CHECK-SAME:   %[[LHS:[A-Za-z0-9]+]]: vector<4xf16>
 //  CHECK-SAME:   %[[RHS:[A-Za-z0-9]+]]: vector<4xf16>
 //  CHECK-SAME:   %[[ACC:[A-Za-z0-9]+]]: vector<16xf32>
-//       CHECK:   amdgpu.mfma %[[LHS]] * %[[RHS]] + %[[ACC]]
-//  CHECK-SAME:     blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32
+//       CHECK:   amdgpu.mfma 32x32x8 %[[LHS]] * %[[RHS]] + %[[ACC]]
 //  CHECK-SAME:     blgp =  none : vector<4xf16>, vector<4xf16>, vector<16xf32>
 
 // -----
@@ -99,8 +97,7 @@ module attributes { transform.with_named_sequence } {
 //  CHECK-SAME:   %[[LHS:[A-Za-z0-9]+]]: vector<4xf16>
 //  CHECK-SAME:   %[[RHS:[A-Za-z0-9]+]]: vector<4xf16>
 //  CHECK-SAME:   %[[ACC:[A-Za-z0-9]+]]: vector<16xf32>
-//       CHECK:   amdgpu.mfma %[[RHS]] * %[[LHS]] + %[[ACC]]
-//  CHECK-SAME:     blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32
+//       CHECK:   amdgpu.mfma 32x32x8 %[[RHS]] * %[[LHS]] + %[[ACC]]
 //  CHECK-SAME:     blgp =  none : vector<4xf16>, vector<4xf16>, vector<16xf32>
 
 // -----
@@ -137,12 +134,10 @@ module attributes { transform.with_named_sequence } {
 //  CHECK-SAME:   %[[ACC:[A-Za-z0-9]+]]: vector<4xf32>
 //  CHECK: %[[LHS0:.*]] = vector.extract_strided_slice %[[LHS]] {offsets = [0], sizes = [4], strides = [1]} : vector<8xf16> to vector<4xf16>
 //  CHECK: %[[RHS0:.*]] = vector.extract_strided_slice %[[RHS]] {offsets = [0], sizes = [4], strides = [1]} : vector<8xf16> to vector<4xf16>
-//  CHECK: %[[ACC0:.*]] = amdgpu.mfma %[[RHS0]] * %[[LHS0]] + %[[ACC]]
-//  CHECK-SAME: {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32}
+//  CHECK: %[[ACC0:.*]] = amdgpu.mfma 16x16x16 %[[RHS0]] * %[[LHS0]] + %[[ACC]]
 //  CHECK: %[[LHS1:.*]] = vector.extract_strided_slice %[[LHS]] {offsets = [4], sizes = [4], strides = [1]} : vector<8xf16> to vector<4xf16>
 //  CHECK: %[[RHS1:.*]] = vector.extract_strided_slice %[[RHS]] {offsets = [4], sizes = [4], strides = [1]} : vector<8xf16> to vector<4xf16>
-//  CHECK: %[[ACC1:.*]] = amdgpu.mfma %[[RHS1]] * %[[LHS1]] + %[[ACC0]]
-//  CHECK-SAME: {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32}
+//  CHECK: %[[ACC1:.*]] = amdgpu.mfma 16x16x16 %[[RHS1]] * %[[LHS1]] + %[[ACC0]]
 //  CHECK: return %[[ACC1]] : vector<4xf32>
 
 // -----
@@ -176,7 +171,7 @@ module attributes { transform.with_named_sequence } {
 //  CHECK-SAME:   %[[LHS:[A-Za-z0-9]+]]: vector<16xf16>
 //  CHECK-SAME:   %[[RHS:[A-Za-z0-9]+]]: vector<16xf16>
 //  CHECK-SAME:   %[[ACC:[A-Za-z0-9]+]]: vector<8xf32>
-//       CHECK:   amdgpu.wmma %[[LHS]] * %[[RHS]] + %[[ACC]]
+//       CHECK:   amdgpu.wmma 16x16x16 %[[LHS]] * %[[RHS]] + %[[ACC]]
 //  CHECK-SAME:     : vector<16xf16>, vector<16xf16>, vector<8xf32>
 
 // -----
@@ -210,7 +205,7 @@ module attributes { transform.with_named_sequence } {
 //  CHECK-SAME:   %[[LHS:[A-Za-z0-9]+]]: vector<8xf16>
 //  CHECK-SAME:   %[[RHS:[A-Za-z0-9]+]]: vector<8xf16>
 //  CHECK-SAME:   %[[ACC:[A-Za-z0-9]+]]: vector<8xf32>
-//       CHECK:   amdgpu.wmma %[[LHS]] * %[[RHS]] + %[[ACC]]
+//       CHECK:   amdgpu.wmma 16x16x16 %[[LHS]] * %[[RHS]] + %[[ACC]]
 //  CHECK-SAME:     : vector<8xf16>, vector<8xf16>, vector<8xf32>
 
 // -----
@@ -247,8 +242,7 @@ module attributes { transform.with_named_sequence } {
 //   CHECK-DAG:   %[[LHSCAST:.+]] = vector.shape_cast %[[LHS]] : vector<1x4xf16> to vector<4xf16>
 //   CHECK-DAG:   %[[RHSCAST:.+]] = vector.shape_cast %[[RHS]] : vector<4x1xf16> to vector<4xf16>
 //   CHECK-DAG:   %[[ACCCAST:.+]] = vector.shape_cast %[[ACC]] : vector<4x1xf32> to vector<4xf32>
-//       CHECK:   %[[MMA:.+]] = amdgpu.mfma %[[LHSCAST]] * %[[RHSCAST]] + %[[ACCCAST]]
-//  CHECK-SAME:     blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32
+//       CHECK:   %[[MMA:.+]] = amdgpu.mfma 16x16x16 %[[LHSCAST]] * %[[RHSCAST]] + %[[ACCCAST]]
 //  CHECK-SAME:     blgp =  none : vector<4xf16>, vector<4xf16>, vector<4xf32>
 //       CHECK:   vector.shape_cast %[[MMA]] : vector<4xf32> to vector<4x1xf32>
 
@@ -296,8 +290,7 @@ module attributes { transform.with_named_sequence } {
 //  CHECK: %[[LHS_SCALE_LONG:.+]] = vector.insert %[[LHS_SCALE_SCALAR]], %[[CST]] [0]
 //  CHECK: %[[RHS_SCALE_SCALAR:.+]] = vector.extract %[[RHS_SCALE]][0]
 //  CHECK: %[[RHS_SCALE_LONG:.+]] = vector.insert %[[RHS_SCALE_SCALAR]], %[[CST]] [0]
-//  CHECK: amdgpu.scaled_mfma(%[[LHS_SCALE_LONG]][0] * %[[LHS]]) * (%[[RHS_SCALE_LONG]][0] * %[[RHS]]) + %[[ACC]]
-//  CHECK-SAME: k = 128 : i32, m = 16 : i32, n = 16 : i32
+//  CHECK: amdgpu.scaled_mfma 16x16x128 (%[[LHS_SCALE_LONG]][0] * %[[LHS]]) * (%[[RHS_SCALE_LONG]][0] * %[[RHS]]) + %[[ACC]]
 //  CHECK-SAME: vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf8E4M3FN>, vector<4xf32>
 
 // -----
@@ -344,6 +337,5 @@ module attributes { transform.with_named_sequence } {
 //  CHECK: %[[LHS_SCALE_LONG:.+]] = vector.insert %[[LHS_SCALE_SCALAR]], %[[CST]] [0]
 //  CHECK: %[[RHS_SCALE_SCALAR:.+]] = vector.extract %[[RHS_SCALE]][0]
 //  CHECK: %[[RHS_SCALE_LONG:.+]] = vector.insert %[[RHS_SCALE_SCALAR]], %[[CST]] [0]
-//  CHECK: amdgpu.scaled_mfma(%[[LHS_SCALE_LONG]][0] * %[[LHS]]) * (%[[RHS_SCALE_LONG]][0] * %[[RHS]]) + %[[ACC]]
-//  CHECK-SAME: k = 64 : i32, m = 32 : i32, n = 32 : i32
+//  CHECK: amdgpu.scaled_mfma 32x32x64 (%[[LHS_SCALE_LONG]][0] * %[[LHS]]) * (%[[RHS_SCALE_LONG]][0] * %[[RHS]]) + %[[ACC]]
 //  CHECK-SAME: vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf8E4M3FN>, vector<16xf32>
@@ -77,6 +77,6 @@ hal.executable private @main {
 //          CHECK:           gpu.barrier
 //      CHECK-DAG:           %[[LHS_MM:.+]] = vector.transfer_read {{.*}} vector<4x4xf16>
 //      CHECK-DAG:           %[[RHS_MM:.+]] = vector.transfer_read {{.*}} vector<4x4xf16>
-//  CHECK-COUNT-4:           amdgpu.mfma {{.*}}blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32
+//  CHECK-COUNT-4:           amdgpu.mfma 16x16x16
 //          CHECK:     vector.transfer_write %{{.*}}, %[[BUF2]]
 //          CHECK:   } {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
Original file line number	Diff line number	Diff line change
`@@ -639,7 +639,6 @@ class ROCMTargetBackend final : public TargetBackend {`
`639`	`639`	`}`
`640`	`640`	`llvm::TargetOptions opt;`
`641`	`641`	`opt.AllowFPOpFusion = llvm::FPOpFusion::Fast;`
`642`		`- opt.UnsafeFPMath = false;`
`643`	`642`	`opt.NoInfsFPMath = false;`
`644`	`643`	`opt.NoNaNsFPMath = true;`
`645`	`644`	`// Be extra cautious while this is less tested, and prevent unknown`
Original file line number	Diff line number	Diff line change
`@@ -671,7 +671,8 @@ static Value createMmaOp(OpBuilder &builder, Location loc,`
`671`	`671`	`.getResult();`
`672`	`672`	`}`
`673`	`673`	`if (is_AMD_WMMA(intrinsic)) {`
`674`		`- return amdgpu::WMMAOp::create(builder, loc, resultType, lhs, rhs, acc)`
	`674`	`+ return amdgpu::WMMAOp::create(builder, loc, resultType, layout.mSize,`
	`675`	`+ layout.nSize, layout.kSize, lhs, rhs, acc)`
`675`	`676`	`.getResult();`
`676`	`677`	`}`
`677`	`678`	`return {};`