Commit 2e40437

[Codegen] Test Cleanup 6/8: LLVMGPU tests (#22749)
Result of a scan over all tests in Codegen to clean up common issues. A summary of the results and a preamble approximating the issues to look for can be found here: https://gist.github.com/qedawkins/40f9e604fd83745bf1ac20fd63a7a61f
1 parent: 222940b

43 files changed: +176, -275 lines
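
The diffs below follow a few recurring cleanup patterns: FileCheck captures that are never re-referenced (e.g. %[[ARG1:.+]]) are replaced with anonymous %{{.+}} matchers, arith.constant values that the test functions never use are deleted, and directive spellings are corrected. A minimal sketch of the capture cleanup, using a hypothetical test not taken from this commit:

// Before: %[[B:.+]] is captured but never used again, so the name only adds noise.
// CHECK-SAME: (%[[A:.+]]: memref<8xf16>, %[[B:.+]]: index)
// After: keep a capture only for the value a later CHECK line re-reads.
// CHECK-SAME: (%[[A:.+]]: memref<8xf16>, %{{.+}}: index)
// CHECK: vector.transfer_read %[[A]]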

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/buffer_instructions_optimization.mlir

Lines changed: 5 additions & 5 deletions
@@ -134,7 +134,7 @@ func.func @no_simplify_mask_no_fat_raw_buffer(%1 : memref<1x?x?x8xbf16>, %index1
 }

 // CHECK-LABEL: @no_simplify_mask_no_fat_raw_buffer
-// CHECK-SAME: (%[[ARG0:.+]]: memref<1x?x?x8xbf16>, %[[ARG1:.+]]: index, %[[ARG2:.+]]: index)
+// CHECK-SAME: (%[[ARG0:.+]]: memref<1x?x?x8xbf16>, %{{.+}}: index, %{{.+}}: index)
 // CHECK-DAG: %[[MASK:.+]] = vector.create_mask
 // CHECK: %[[READ:.+]] = vector.transfer_read %[[ARG0]]
 // CHECK-SAME: %[[MASK]]
@@ -153,7 +153,7 @@ func.func @no_simplify_mask_tensor(%1 : tensor<1x?x?x8xbf16>, %index1 : index, %
 }

 // CHECK-LABEL: @no_simplify_mask_tensor
-// CHECK-SAME: (%[[ARG0:.+]]: tensor<1x?x?x8xbf16>, %[[ARG1:.+]]: index, %[[ARG2:.+]]: index)
+// CHECK-SAME: (%[[ARG0:.+]]: tensor<1x?x?x8xbf16>, %{{.+}}: index, %{{.+}}: index)
 // CHECK-DAG: %[[MASK:.+]] = vector.create_mask
 // CHECK: %[[READ:.+]] = vector.transfer_read %[[ARG0]]
 // CHECK-SAME: %[[MASK]]
@@ -172,7 +172,7 @@ func.func @no_simplify_mask_outofbounds(%1 : memref<1x?x?x6xbf16, #amdgpu.addres
 }

 // CHECK-LABEL: @no_simplify_mask_outofbounds
-// CHECK-SAME: (%[[ARG0:.+]]: memref<1x?x?x6xbf16, #amdgpu.address_space<fat_raw_buffer>>, %[[ARG1:.+]]: index, %[[ARG2:.+]]: index)
+// CHECK-SAME: (%[[ARG0:.+]]: memref<1x?x?x6xbf16, #amdgpu.address_space<fat_raw_buffer>>, %{{.+}}: index, %{{.+}}: index)
 // CHECK-DAG: %[[MASK:.+]] = vector.create_mask
 // CHECK: %[[READ:.+]] = vector.transfer_read %[[ARG0]]
 // CHECK-SAME: %[[MASK]]
@@ -191,7 +191,7 @@ func.func @no_simplify_partial_mask(%1 : memref<1x?x?x8xbf16, #amdgpu.address_sp
 }

 // CHECK-LABEL: @no_simplify_partial_mask
-// CHECK-SAME: (%[[ARG0:.+]]: memref<1x?x?x8xbf16, #amdgpu.address_space<fat_raw_buffer>>, %[[ARG1:.+]]: index, %[[ARG2:.+]]: index)
+// CHECK-SAME: (%[[ARG0:.+]]: memref<1x?x?x8xbf16, #amdgpu.address_space<fat_raw_buffer>>, %{{.+}}: index, %{{.+}}: index)
 // CHECK-DAG: %[[MASK:.+]] = vector.create_mask
 // CHECK: %[[READ:.+]] = vector.transfer_read %[[ARG0]]
 // CHECK-SAME: %[[MASK]]
@@ -210,7 +210,7 @@ func.func @no_simplify_mask_nonunit(%1 : memref<1x?x?x8xbf16, #amdgpu.address_sp
 }

 // CHECK-LABEL: @no_simplify_mask_nonunit
-// CHECK-SAME: (%[[ARG0:.+]]: memref<1x?x?x8xbf16, #amdgpu.address_space<fat_raw_buffer>>, %[[ARG1:.+]]: index, %[[ARG2:.+]]: index)
+// CHECK-SAME: (%[[ARG0:.+]]: memref<1x?x?x8xbf16, #amdgpu.address_space<fat_raw_buffer>>, %{{.+}}: index, %{{.+}}: index)
 // CHECK-DAG: %[[MASK:.+]] = vector.create_mask
 // CHECK: %[[READ:.+]] = vector.transfer_read %[[ARG0]]
 // CHECK-SAME: %[[MASK]]

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir

Lines changed: 4 additions & 14 deletions
@@ -11,7 +11,6 @@
 #map1 = affine_map<(d0, d1, d2, d3, d4) -> (d1, d3, d4)>
 #map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>
 func.func @expanded_matmul_transpose_b(%lhs: tensor<2x64x2048xf16>, %rhs: tensor<10x64x2048xf16>) -> tensor<2x10x64x64xf32> {
-%c0 = arith.constant 0 : index
 %cst = arith.constant 0.000000e+00 : f32
 %5 = tensor.empty() : tensor<2x10x64x64xf32>
 %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x10x64x64xf32>) -> tensor<2x10x64x64xf32>
@@ -49,7 +48,6 @@ func.func @expanded_matmul_transpose_b(%lhs: tensor<2x64x2048xf16>, %rhs: tensor
 #map1 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d3, d4, d5)>
 #map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>
 func.func @multi_dim_mma_schedule(%lhs: tensor<10x32x128x16xf16>, %rhs: tensor<4x32x128x16xf16>) -> tensor<10x4x32x32xf32> {
-%c0 = arith.constant 0 : index
 %cst = arith.constant 0.000000e+00 : f32
 %5 = tensor.empty() : tensor<10x4x32x32xf32>
 %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<10x4x32x32xf32>) -> tensor<10x4x32x32xf32>
@@ -119,7 +117,6 @@ func.func @dynamic_multi_dim_mma_schedule(%lhs: tensor<?x6x16x?x16xf16>, %rhs: t

 func.func @mfma_matmul_1024x1024x1024(%lhs: tensor<1024x1024xf16>, %rhs: tensor<1024x1024xf16>) -> tensor<1024x1024xf32> {
 %cst = arith.constant 0.000000e+00 : f32
-%c0 = arith.constant 0 : index
 %5 = tensor.empty() : tensor<1024x1024xf32>
 %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
 %7 = linalg.matmul ins(%lhs, %rhs : tensor<1024x1024xf16>, tensor<1024x1024xf16>) outs(%6 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
@@ -212,7 +209,6 @@ func.func @mfma_matmul_m_aligned_intrinsic(%lhs: tensor<176x1024xi8>, %rhs: tens

 module {
 func.func @conv_nhwc(%3: tensor<2x258x514x768xf16>, %4: tensor<3x3x768x256xf16>) -> tensor<2x256x512x256xf32> {
-%c0 = arith.constant 0 : index
 %cst = arith.constant 0.000000e+00 : f32
 %5 = tensor.empty() : tensor<2x256x512x256xf32>
 %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x256x512x256xf32>) -> tensor<2x256x512x256xf32>
@@ -249,7 +245,6 @@ func.func @matmul_dynamic_M(%arg0: tensor<?x256xf32>, %arg1: tensor<256x256xf32>
 module {
 func.func @elementwise_dynamic_dim(%11: tensor<?x256xf16>, %12: tensor<?x256xf16>) -> tensor<?x256xf16> {
 %c0 = arith.constant 0 : index
-%cst = arith.constant 0.000000e+00 : f32
 %8 = tensor.dim %11, %c0 : tensor<?x256xf16>
 %13 = tensor.empty(%8) : tensor<?x256xf16>
 %15 = linalg.add ins(%11, %12 : tensor<?x256xf16>, tensor<?x256xf16>) outs(%13 : tensor<?x256xf16>) -> tensor<?x256xf16>
@@ -266,7 +261,6 @@ module {
 // -----

 func.func @elementwise_unaligned(%11: tensor<180x180xf16>, %12: tensor<180x180xf16>) -> tensor<180x180xf16> {
-%cst = arith.constant 0.000000e+00 : f32
 %13 = tensor.empty() : tensor<180x180xf16>
 %15 = linalg.add ins(%11, %12 : tensor<180x180xf16>, tensor<180x180xf16>) outs(%13 : tensor<180x180xf16>) -> tensor<180x180xf16>
 return %15 : tensor<180x180xf16>
@@ -278,7 +272,6 @@ func.func @elementwise_unaligned(%11: tensor<180x180xf16>, %12: tensor<180x180xf
 // -----

 func.func @elementwise_large_rank(%11: tensor<3x5x7x11x13x17x19x23xf16>, %12: tensor<3x5x7x11x13x17x19x23xf16>) -> tensor<3x5x7x11x13x17x19x23xf16> {
-%cst = arith.constant 0.000000e+00 : f32
 %13 = tensor.empty() : tensor<3x5x7x11x13x17x19x23xf16>
 %15 = linalg.add ins(%11, %12 : tensor<3x5x7x11x13x17x19x23xf16>, tensor<3x5x7x11x13x17x19x23xf16>) outs(%13 : tensor<3x5x7x11x13x17x19x23xf16>) -> tensor<3x5x7x11x13x17x19x23xf16>
 return %15 : tensor<3x5x7x11x13x17x19x23xf16>
@@ -293,9 +286,6 @@ func.func @elementwise_large_rank(%11: tensor<3x5x7x11x13x17x19x23xf16>, %12: te

 func.func @multi_mma_data_tiled_unrolled_MFMA_F32_16x16x4_F32(
 %3: tensor<1x8x8x4x16x4xf32>, %4: tensor<1x8x4x2x4x16x4xf32>, %5: tensor<1x1x4x8x2x4x16x4xf32>) -> tensor<1x1x4x8x2x4x16x4xf32> {
-%c0 = arith.constant 0 : index
-%c65536 = arith.constant 65536 : index
-%c131072 = arith.constant 131072 : index
 %6 = iree_codegen.inner_tiled ins(%3, %4) outs(%5) {
 indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>,
 affine_map<(d0, d1, d2) -> (d1, d2)>,
@@ -323,9 +313,9 @@ func.func @multi_mma_data_tiled_unrolled_MFMA_F32_16x16x4_F32(
 // -----

 func.func @unaligned_to_intrinsic_batched_matmul(%lhs : tensor<12x8x577xf32>, %rhs : tensor<12x577x577xf32>) -> tensor<12x8x577xf32> {
-%c0 = arith.constant 0.0 : f32
+%cst = arith.constant 0.0 : f32
 %empty = tensor.empty() : tensor<12x8x577xf32>
-%fill = linalg.fill ins(%c0 : f32) outs(%empty : tensor<12x8x577xf32>) -> tensor<12x8x577xf32>
+%fill = linalg.fill ins(%cst : f32) outs(%empty : tensor<12x8x577xf32>) -> tensor<12x8x577xf32>
 %mm = linalg.batch_matmul ins(%lhs, %rhs : tensor<12x8x577xf32>, tensor<12x577x577xf32>) outs(%fill : tensor<12x8x577xf32>) -> tensor<12x8x577xf32>
 return %mm : tensor<12x8x577xf32>
 }
@@ -413,9 +403,9 @@ func.func @unaligned_dynamic_matmul_with_two_reduce_dim(%arg0: tensor<196x?x4xf3
 // -----

 func.func @unaligned_to_intrinsic_batched_matmul_tiling_check(%lhs : tensor<12x577x577xf32>, %rhs : tensor<12x577x1024xf32>) -> tensor<12x577x1024xf32> {
-%c0 = arith.constant 0.0 : f32
+%cst = arith.constant 0.0 : f32
 %empty = tensor.empty() : tensor<12x577x1024xf32>
-%fill = linalg.fill ins(%c0 : f32) outs(%empty : tensor<12x577x1024xf32>) -> tensor<12x577x1024xf32>
+%fill = linalg.fill ins(%cst : f32) outs(%empty : tensor<12x577x1024xf32>) -> tensor<12x577x1024xf32>
 %mm = linalg.batch_matmul ins(%lhs, %rhs : tensor<12x577x577xf32>, tensor<12x577x1024xf32>) outs(%fill : tensor<12x577x1024xf32>) -> tensor<12x577x1024xf32>
 return %mm : tensor<12x577x1024xf32>
 }

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute_gfx1100.mlir

Lines changed: 1 addition & 1 deletion
@@ -34,5 +34,5 @@ func.func @wmma_matmul_1024x1024x1024() {
 // WMMA: linalg.matmul {{.*}}lowering_config = #iree_gpu.lowering_config
 // WMMA-SAME: mma_kind = #iree_gpu.mma_layout<WMMAR3_F32_16x16x16_F16>
 // WMMA-SAME: reduction = [0, 0, 64]
-// WMMA-LITERAL-SAME: subgroup_basis = [[2, 2, 1], [0, 1, 2]]
+// WMMA-SAME{LITERAL}: subgroup_basis = [[2, 2, 1], [0, 1, 2]]
 // WMMA-SAME: workgroup = [64, 128, 0]
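
A note on the directive fix above: in FileCheck, LITERAL is a modifier on the check suffix, so it is written as <prefix>-SAME{LITERAL}:. The modifier disables [[...]] and {{...}} pattern syntax, which is what lets the nested brackets in subgroup_basis match literally; a spelling like WMMA-LITERAL-SAME: is not a recognized directive form and would typically be skipped rather than checked. Minimal sketch with a hypothetical CHECK prefix:

// CHECK{LITERAL}: subgroup_basis = [[2, 2, 1], [0, 1, 2]]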

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute_reduction_gfx942.mlir

Lines changed: 0 additions & 4 deletions
@@ -61,10 +61,6 @@ func.func @attention_20x1x64x4096x64() {
 func.func @reduction_with_no_consumer() {
 %c0 = arith.constant 0 : index
 %cst = arith.constant 0.000000e+00 : f32
-%cst_0 = arith.constant 4.096000e+04 : f32
-%cst_1 = arith.constant 9.99999974E-6 : f32
-%c69524992 = arith.constant 69524992 : index
-%c74767872 = arith.constant 74767872 : index
 %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2x32x10x4096xf16>>
 %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2x32xf32>>
 %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 4096], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2x32x10x4096xf16>> -> tensor<2x32x10x4096xf16>

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute_reduction_gfx950.mlir

Lines changed: 1 addition & 1 deletion
@@ -35,7 +35,7 @@ func.func @skinny_scaled_matmul() {
 %13 = linalg.generic {
 indexing_maps = [#lhs_map, #rhs_map, #scale_m, #scale_n, #out_map],
 iterator_types = ["parallel", "parallel", "reduction", "reduction"]
-} ins(%6, %8, %9, %10 : tensor<4x512x32xf4E2M1FN>, tensor<1024x512x32xf4E2M1FN>, tensor<4x512xf8E8M0FNU>, tensor<1024x512xf8E8M0FNU>) outs(%11 : tensor<4x1024xf32>) {
+} ins(%6, %8, %9, %10 : tensor<4x512x32xf4E2M1FN>, tensor<1024x512x32xf4E2M1FN>, tensor<4x512xf8E8M0FNU>, tensor<1024x512xf8E8M0FNU>) outs(%12 : tensor<4x1024xf32>) {
 ^bb0(%a: f4E2M1FN, %b: f4E2M1FN, %a_scale: f8E8M0FNU, %b_scale: f8E8M0FNU, %out: f32):
 %14 = arith.scaling_extf %a, %a_scale : f4E2M1FN, f8E8M0FNU to f32
 %15 = arith.scaling_extf %b, %b_scale : f4E2M1FN, f8E8M0FNU to f32

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_direct_conv_tile_and_fuse.mlir

Lines changed: 7 additions & 7 deletions
@@ -66,17 +66,17 @@ hal.executable private @main {
 // CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
 // CHECK-DAG: %[[C36:.+]] = arith.constant 36 : index
 // CHECK: scf.forall ({{.*}}) in (16, 48, 9) {
-// CHECK: %[[LOOP1:.+]] = scf.for %[[IV1:.+]] = %[[C0]] to %[[C3]] step %[[C1]] {{.*}} -> (vector<1x1x1x1x4x1xf32>)
-// CHECK: %[[LOOP2:.+]] = scf.for %[[IV2:.+]] = %[[C0]] to %[[C3]] step %[[C1]] {{.*}} -> (vector<1x1x1x1x4x1xf32>)
-// CHECK: %[[LOOP3:.+]] = scf.for %[[IV3:.+]] = %[[C0]] to %[[C36]] step %[[C4]] {{.*}} -> (vector<1x1x1x1x4x1xf32>)
+// CHECK: scf.for {{.+}} = %[[C0]] to %[[C3]] step %[[C1]] {{.*}} -> (vector<1x1x1x1x4x1xf32>)
+// CHECK: scf.for {{.+}} = %[[C0]] to %[[C3]] step %[[C1]] {{.*}} -> (vector<1x1x1x1x4x1xf32>)
+// CHECK: scf.for {{.+}} = %[[C0]] to %[[C36]] step %[[C4]] {{.*}} -> (vector<1x1x1x1x4x1xf32>)
 // CHECK: gpu.barrier
-// CHECK-DAG: %[[LHS_RD:.+]] = vector.transfer_read %[[BUF0]]{{.*}} vector<4xf16>
+// CHECK-DAG: %[[LHS_RD:.+]] = vector.transfer_read {{.+}} : {{.*}}vector<4xf16>
 // CHECK-DAG: vector.transfer_write %[[LHS_RD]]
-// CHECK-DAG: %[[RHS_RD:.+]] = vector.transfer_read %[[BUF1]]{{.*}} vector<8xf16>
+// CHECK-DAG: %[[RHS_RD:.+]] = vector.transfer_read {{.+}} : {{.*}}vector<8xf16>
 // CHECK-DAG: vector.transfer_write %[[RHS_RD]]
 // CHECK: gpu.barrier
-// CHECK-DAG: %[[LHS_MM:.+]] = vector.transfer_read {{.*}} vector<4x4xf16>
-// CHECK-DAG: %[[RHS_MM:.+]] = vector.transfer_read {{.*}} vector<4x4xf16>
+// CHECK-DAG: vector.transfer_read {{.*}} vector<4x4xf16>
+// CHECK-DAG: vector.transfer_read {{.*}} vector<4x4xf16>
 // CHECK-COUNT-4: amdgpu.mfma 16x16x16
 // CHECK: vector.transfer_write %{{.*}}, %[[BUF2]]
 // CHECK: } {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_elementwise_f8fnuz.mlir

Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@ hal.executable @ext_fp8_dispatch {

 // ERRORS: F8E5M2FNUZ and F8E4M3FNUZ types are not supported on non-gfx942 (MI-300) chipsets; try F8E5M2 or F8E4M3FN instead.

-// CDNA3-LABEL: hal.executable public @ext_fp8_dispatch
+// CDNA3-LABEL: hal.executable public @ext_fp8_dispatch {
 // CDNA3: hal.executable.variant public @rocm
 // CDNA3-COUNT-8: rocdl.cvt.pk.f32.fp8 %{{.*}} : vector<2xf32>
 // CDNA3-COUNT-8: rocdl.cvt.pk.f32.bf8 %{{.*}} : vector<2xf32>

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_elementwise_f8ocp.mlir

Lines changed: 1 addition & 1 deletion
@@ -44,7 +44,7 @@ hal.executable @ext_fp8_dispatch {

 // ERRORS: F8E5M2 and F8E4M3FN types are not supported on gfx942 (MI-300) or older chipsets; try F8E5M2FNUZ or F8E4M3FNUZ instead.

-// OCP-LABEL: hal.executable public @ext_fp8_dispatch
+// OCP-LABEL: hal.executable public @ext_fp8_dispatch {
 // OCP: hal.executable.variant public @rocm
 // OCP-COUNT-8: rocdl.cvt.pk.f32.fp8 %{{.*}} : vector<2xf32>
 // OCP-COUNT-8: rocdl.cvt.pk.f32.bf8 %{{.*}} : vector<2xf32>
