Commit 1f322ce

[Codegen] Test Cleanup 2/8: Common GPU tests (#22745)
Result of a scan over all tests in Codegen to clean up common issues. A summary of the results and a preamble approximating the issues to look for can be found here: https://gist.github.com/qedawkins/40f9e604fd83745bf1ac20fd63a7a61f
1 parent: acefc23
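One recurring issue from that list is FileCheck capture hygiene: FileCheck allows a variable such as %[[SELECT0]] to be redefined, and later uses resolve to the most recent binding, so accidentally reusing a capture name can silently check the wrong value. A minimal sketch of the pitfall, mirroring the fix applied in gpu_apply_padding_partial_reduction.mlir below:

// Pitfall: both CHECK-DAG lines bind the name SELECT0; the second one
// redefines the first, and neither binding is ever used afterwards.
// CHECK-DAG: %[[SELECT0:.+]] = arith.select {{.*}} %[[NAN0]] : f32
// CHECK-DAG: %[[SELECT0:.+]] = arith.select {{.*}} %[[NAN1]] : f32

// Fix: since the selected values are never referenced again, drop the
// captures and match the operands directly.
// CHECK-DAG: {{.+}} = arith.select {{.+}}, {{.+}}, %[[NAN0]] : f32
// CHECK-DAG: {{.+}} = arith.select {{.+}}, {{.+}}, %[[NAN1]] : f32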

43 files changed: 320 additions, 428 deletions


compiler/src/iree/compiler/Codegen/Common/GPU/test/decompose_horizontally_fused_gemms.mlir

Lines changed: 9 additions & 15 deletions
@@ -4,11 +4,10 @@ func.func @fused_contraction_1(%arg0: tensor<2x4096x640xf16>,
     %arg1: tensor<10x64x640xf16>, %arg2: tensor<10x64x640xf16>,
     %arg3: tensor<10x64x640xf16>)
     -> (tensor<2x10x4096x64xf32>, tensor<2x10x4096x64xf32>, tensor<2x10x4096x64xf32>) {
-  %0 = tensor.empty() : tensor<2x10x4096x64xf16>
-  %1 = tensor.empty() : tensor<2x10x4096x64xf32>
+  %0 = tensor.empty() : tensor<2x10x4096x64xf32>
   %cst = arith.constant 0.000000e+00 : f32
-  %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<2x10x4096x64xf32>) -> tensor<2x10x4096x64xf32>
-  %3:3 = linalg.generic {
+  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<2x10x4096x64xf32>) -> tensor<2x10x4096x64xf32>
+  %2:3 = linalg.generic {
     indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d2, d4)>,
                      affine_map<(d0, d1, d2, d3, d4) -> (d1, d3, d4)>,
                      affine_map<(d0, d1, d2, d3, d4) -> (d1, d3, d4)>,
@@ -18,7 +17,7 @@ func.func @fused_contraction_1(%arg0: tensor<2x4096x640xf16>,
                      affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>],
     iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]}
     ins(%arg0, %arg1, %arg2, %arg3 : tensor<2x4096x640xf16>, tensor<10x64x640xf16>, tensor<10x64x640xf16>, tensor<10x64x640xf16>)
-    outs(%2, %2, %2 : tensor<2x10x4096x64xf32>, tensor<2x10x4096x64xf32>, tensor<2x10x4096x64xf32>)
+    outs(%1, %1, %1 : tensor<2x10x4096x64xf32>, tensor<2x10x4096x64xf32>, tensor<2x10x4096x64xf32>)
     attrs = {
       lowering_config = #iree_gpu.lowering_config<{
         mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1, 2, 3],
@@ -36,7 +35,7 @@ func.func @fused_contraction_1(%arg0: tensor<2x4096x640xf16>,
     %16 = arith.addf %out_4, %15 : f32
     linalg.yield %10, %13, %16 : f32, f32, f32
   } -> (tensor<2x10x4096x64xf32>, tensor<2x10x4096x64xf32>, tensor<2x10x4096x64xf32>)
-  return %3#0, %3#1, %3#2 : tensor<2x10x4096x64xf32>, tensor<2x10x4096x64xf32>, tensor<2x10x4096x64xf32>
+  return %2#0, %2#1, %2#2 : tensor<2x10x4096x64xf32>, tensor<2x10x4096x64xf32>, tensor<2x10x4096x64xf32>
 }
 // CHECK-LABEL: func @fused_contraction_1
 // CHECK-SAME:    %[[ARG0:[a-zA-Z0-9]+]]: tensor<2x4096x640xf16>
@@ -106,10 +105,9 @@ func.func @fused_contraction_2(%arg0: tensor<4096x640xf32>,
     %arg1: tensor<640x640xf32>, %arg2: tensor<640x640xf32>, %arg3: tensor<640x640xf32>)
     -> (tensor<4096x640xf32>, tensor<4096x640xf32>, tensor<4096x640xf32>) {
   %0 = tensor.empty() : tensor<4096x640xf32>
-  %1 = tensor.empty() : tensor<4096x640xf32>
   %cst = arith.constant 0.000000e+00 : f32
-  %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<4096x640xf32>) -> tensor<4096x640xf32>
-  %3:3 = linalg.generic {
+  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<4096x640xf32>) -> tensor<4096x640xf32>
+  %2:3 = linalg.generic {
     indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>,
                      affine_map<(d0, d1, d2) -> (d2, d1)>,
                      affine_map<(d0, d1, d2) -> (d2, d1)>,
@@ -119,7 +117,7 @@ func.func @fused_contraction_2(%arg0: tensor<4096x640xf32>,
                      affine_map<(d0, d1, d2) -> (d0, d1)>],
     iterator_types = ["parallel", "parallel", "reduction"]}
     ins(%arg0, %arg1, %arg2, %arg3 : tensor<4096x640xf32>, tensor<640x640xf32>, tensor<640x640xf32>, tensor<640x640xf32>)
-    outs(%2, %2, %2 : tensor<4096x640xf32>, tensor<4096x640xf32>, tensor<4096x640xf32>)
+    outs(%1, %1, %1 : tensor<4096x640xf32>, tensor<4096x640xf32>, tensor<4096x640xf32>)
     attrs = {
       lowering_config = #iree_gpu.lowering_config<{
         mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>, promote_operands = [0, 1, 2, 3],
@@ -133,13 +131,9 @@ func.func @fused_contraction_2(%arg0: tensor<4096x640xf32>,
     %9 = arith.addf %out_4, %8 : f32
     linalg.yield %5, %7, %9 : f32, f32, f32
   } -> (tensor<4096x640xf32>, tensor<4096x640xf32>, tensor<4096x640xf32>)
-  return %3#0, %3#1, %3#2 : tensor<4096x640xf32>, tensor<4096x640xf32>, tensor<4096x640xf32>
+  return %2#0, %2#1, %2#2 : tensor<4096x640xf32>, tensor<4096x640xf32>, tensor<4096x640xf32>
 }
 // CHECK-LABEL: func @fused_contraction_2
-// CHECK-SAME:    %[[ARG0:[a-zA-Z0-9]+]]: tensor<4096x640xf32>
-// CHECK-SAME:    %[[ARG1:[a-zA-Z0-9]+]]: tensor<640x640xf32>
-// CHECK-SAME:    %[[ARG2:[a-zA-Z0-9]+]]: tensor<640x640xf32>
-// CHECK-SAME:    %[[ARG3:[a-zA-Z0-9]+]]: tensor<640x640xf32>
 // CHECK:         %[[FILL:.+]] = linalg.fill
 // CHECK:         %[[GENERIC0:.+]] = linalg.generic
 // CHECK:         ^bb0(%[[B0_0:[a-zA-Z0-9_]+]]: f32, %[[B1_0:[a-zA-Z0-9_]+]]: f32, %[[B2_0:[a-zA-Z0-9_]+]]: f32

compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_alloc_private_memory_for_dps_ops.mlir

Lines changed: 2 additions & 3 deletions
@@ -19,7 +19,6 @@ func.func @unused_result_copied(%arg0: !iree_tensor_ext.dispatch.tensor<readonly
 // CHECK-LABEL: func.func @big_result_not_copied
 // CHECK-NOT:     bufferization.alloc_tensor()
 func.func @big_result_not_copied(%arg0: !iree_tensor_ext.dispatch.tensor<readonly:tensor<1x33xf32>>, %arg1: tensor<1x33xi64>) -> tensor<1x33xi64> {
-  %cst = arith.constant dense<1.000000e+00> : tensor<1x33xf32>
   %2 = iree_tensor_ext.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [1, 33], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<1x33xf32>> -> tensor<1x33xf32>
   %3:2 = iree_linalg_ext.sort dimension(1) outs(%2, %arg1 : tensor<1x33xf32>, tensor<1x33xi64>) {
   ^bb0(%arg4: f32, %arg5: f32, %arg6: i64, %arg7: i64):
@@ -45,9 +44,9 @@ func.func @used_result_not_copied(%arg0: !iree_tensor_ext.dispatch.tensor<readon

 // -----

-// CHECK-LABEL: func @memref_semantics(
+// CHECK-LABEL: func.func @memref_semantics(
 // CHECK-SAME:    %[[DEST:.+]]: memref<?x?xf32>
-// CHECK:         linalg.fill {{.*}} outs(%[[DEST]]
+// CHECK:         linalg.fill ins(%{{.+}}) outs(%[[DEST]]
 func.func @memref_semantics(%dest: memref<?x?xf32>) {
   %cst = arith.constant 0.000000e+00 : f32
   linalg.fill ins(%cst : f32) outs(%dest : memref<?x?xf32>)
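A note on the CHECK-LABEL change above, which recurs throughout this commit: a CHECK-LABEL pattern is meant to uniquely identify one block of the input, and a bare name can also appear in comments, call sites, or longer symbol names. Anchoring on the declaration syntax avoids accidental matches; a minimal illustrative sketch:

// Loose: matches any line containing the substring "memref_semantics".
// CHECK-LABEL: memref_semantics
// Anchored: matches only the function declaration itself.
// CHECK-LABEL: func.func @memref_semantics(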

compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_apply_derived_thread_config.mlir

Lines changed: 19 additions & 25 deletions
@@ -5,12 +5,12 @@
 module {
   func.func @inferred_add_tensor(%3: tensor<64x256xf32>, %4: tensor<64x256xf32>, %5: tensor<64x256xf32>) -> tensor<64x256xf32>
     attributes {
-      translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [16, 32, 1] subgroup_size = 64, {}>
+      translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [16, 32, 1] subgroup_size = 64>
   } {
     %6 = linalg.generic {
       indexing_maps = [#map, #map, #map],
       iterator_types = ["parallel", "parallel"]
-    } ins(%3, %4 : tensor<64x256xf32>, tensor<64x256xf32>) outs(%5 : tensor<64x256xf32>) attrs = {lowering_config = #config} {
+    } ins(%3, %4 : tensor<64x256xf32>, tensor<64x256xf32>) outs(%5 : tensor<64x256xf32>) attrs = {lowering_config = #config} {
     ^bb0(%in: f32, %in_0: f32, %out: f32):
       %7 = arith.addf %in, %in_0 : f32
       linalg.yield %7 : f32
@@ -32,12 +32,12 @@ module {
 module {
   func.func @inferred_dynamic(%3: tensor<?x?xf32>, %4: tensor<?x?xf32>, %5: tensor<?x?xf32>) -> tensor<?x?xf32>
     attributes {
-      translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [16, 32, 1] subgroup_size = 64, {}>
+      translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [16, 32, 1] subgroup_size = 64>
   } {
     %6 = linalg.generic {
       indexing_maps = [#map, #map, #map],
       iterator_types = ["parallel", "parallel"]
-    } ins(%3, %4 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%5 : tensor<?x?xf32>) attrs = {lowering_config = #config} {
+    } ins(%3, %4 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%5 : tensor<?x?xf32>) attrs = {lowering_config = #config} {
     ^bb0(%in: f32, %in_0: f32, %out: f32):
       %7 = arith.addf %in, %in_0 : f32
       linalg.yield %7 : f32
@@ -62,12 +62,12 @@ module {
 module {
   func.func @inferred_small_inner_dim(%3: tensor<8x2xf32>, %4: tensor<8x2xf32>, %5: tensor<8x2xf32>) -> tensor<8x2xf32>
     attributes {
-      translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [16, 32, 1] subgroup_size = 64, {}>
+      translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [16, 32, 1] subgroup_size = 64>
   } {
     %6 = linalg.generic {
      indexing_maps = [#map, #map, #map],
      iterator_types = ["parallel", "parallel"]
-    } ins(%3, %4 : tensor<8x2xf32>, tensor<8x2xf32>) outs(%5 : tensor<8x2xf32>) attrs = {lowering_config = #config} {
+    } ins(%3, %4 : tensor<8x2xf32>, tensor<8x2xf32>) outs(%5 : tensor<8x2xf32>) attrs = {lowering_config = #config} {
     ^bb0(%in: f32, %in_0: f32, %out: f32):
       %7 = arith.addf %in, %in_0 : f32
       linalg.yield %7 : f32
@@ -84,11 +84,10 @@ module {

 // -----

-#map = affine_map<(d0, d1) -> (d0, d1)>
 module {
   func.func @inferred_small_inner_dim_fill_vector_sizes(%0: tensor<4x16x8x4x16x2x4xf16>, %1: tensor<4x16x8x4x16x2x4xf16>) -> tensor<4x16x8x4x16x2x4xf16>
     attributes {
-      translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {}>
+      translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64>
   } {
     %2 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config}
       ins(%0 : tensor<4x16x8x4x16x2x4xf16>)
@@ -105,12 +104,11 @@ module {

 // -----

-#map = affine_map<(d0, d1) -> (d0, d1)>
 module {
   func.func @inferred_small_inner_dim_dont_fill_non_contiguous(
       %0: tensor<4x16x4x4xf16>, %1: tensor<4x16x4x4xf16>) -> tensor<4x16x4x4xf16>
     attributes {
-      translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64, {}>
+      translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>
   } {
     %2 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config}
       ins(%0 : tensor<4x16x4x4xf16>)
@@ -127,11 +125,10 @@ module {

 // -----

-#map = affine_map<(d0, d1) -> (d0, d1)>
 module {
   func.func @inferred_unaligned(%0: tensor<70xf16>, %1: tensor<70xf16>) -> tensor<70xf16>
     attributes {
-      translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64, {}>
+      translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>
   } {
     %2 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config}
       ins(%0 : tensor<70xf16>)
@@ -148,11 +145,10 @@ module {

 // -----

-#map = affine_map<(d0, d1) -> (d0, d1)>
 module {
   func.func @inferred_smaller_load(%0: tensor<128xf16>, %1: tensor<128xf16>) -> tensor<128xf16>
     attributes {
-      translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64, {}>
+      translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>
   } {
     %2 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config}
       ins(%0 : tensor<128xf16>)
@@ -173,7 +169,7 @@ module {
 module {
   func.func @inferred_im2col(%2: tensor<2x34x34x128xf16>, %3: tensor<2x128x8xf16>) -> tensor<2x128x8xf16>
     attributes {
-      translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [16, 32, 1] subgroup_size = 64, {}>
+      translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [16, 32, 1] subgroup_size = 64>
   } {
     %4 = iree_linalg_ext.im2col {lowering_config = #config}
       strides = [1, 1] dilations = [1, 1] kernel_size = [3, 3]
@@ -198,7 +194,7 @@ module {
 module {
   func.func @inferred_im2col_batch_last(%2: tensor<16x26x18x32xbf16>, %3: tensor<32x1x1x32xbf16>) -> tensor<32x1x1x32xbf16>
     attributes {
-      translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {}>
+      translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64>
   } {
     %4 = iree_linalg_ext.im2col {lowering_config = #config}
       strides = [1, 1] dilations = [1, 1] kernel_size = [24, 16]
@@ -220,31 +216,29 @@ module {
 // -----

 #config = #iree_gpu.derived_thread_config
-func.func @scatter(%arg0: tensor<3x32x16xf32>, %arg1: tensor<3x1xi32>) -> tensor<3x32x16xf32>
+func.func @scatter(%arg0: tensor<3x32x16xf32>, %arg1: tensor<3x1xi32>, %arg2: tensor<3x32x16xf32>) -> tensor<3x32x16xf32>
   attributes {
-    translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64, {}>
+    translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>
 } {
-  %cst = arith.constant 0.000000e+00 : f32
-  %0 = tensor.empty() : tensor<3x32x16xf32>
   %1 = iree_linalg_ext.scatter {lowering_config = #config} dimension_map = [0] unique_indices(true)
-    ins(%arg0, %arg1 : tensor<3x32x16xf32>, tensor<3x1xi32>) outs(%0 : tensor<3x32x16xf32>) {
-  ^bb0(%arg2: f32, %arg3: f32):
-    iree_linalg_ext.yield %arg2 : f32
+    ins(%arg0, %arg1 : tensor<3x32x16xf32>, tensor<3x1xi32>) outs(%arg2 : tensor<3x32x16xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    iree_linalg_ext.yield %in : f32
   } -> tensor<3x32x16xf32>
   return %1 : tensor<3x32x16xf32>
 }

 // CHECK-LABEL: func.func @scatter
 // CHECK:         scf.forall ({{.*}}) = (0, 0, 0) to (3, 32, 16) step (1, 1, 4)
-// CHECK:           linalg_ext.scatter
+// CHECK:           iree_linalg_ext.scatter
 // CHECK:         scf.forall.in_parallel

 // -----

 #config = #iree_gpu.derived_thread_config
 func.func @map_scatter(%arg0: tensor<2x32xf32>, %arg1: tensor<64x256xf32>) -> tensor<64x256xf32>
   attributes {
-    translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [16, 32] subgroup_size = 64, {}>
+    translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [16, 32] subgroup_size = 64>
 } {
   %true = arith.constant true
   %1 = iree_linalg_ext.map_scatter {lowering_config = #config} %arg0 into %arg1 {
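The recurring translation_info edit in this file drops a trailing ", {}". Reading the attribute syntax, that brace pair is an empty per-pipeline configuration dictionary, which appears to be the default when omitted, so both spellings below should denote the same attribute (a sketch of the equivalence, not verified against the parser):

// With an explicit empty configuration dictionary:
translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64, {}>
// Equivalent, relying on the default:
translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>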

compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_apply_padding_online_attention.mlir

Lines changed: 2 additions & 2 deletions
@@ -69,8 +69,8 @@ func.func @online_attention_tile_then_pad(%query: tensor<192x1024x64xf32>, %key:

 // CHECK:         arith.constant 0xFF800000 : f32
 // CHECK-COUNT-3: tensor.pad
-// CHECK:         iree_linalg_ext.online_attention {{.*}} ins(%{{[0-9a-z_]*}}, %{{[0-9a-z_]*}}, %{{[0-9a-z_]*}}, %{{[0-9a-z_]*}}, %{{[0-9a-z_]*}}
-// CHECK-SAME:      : tensor<192x1024x64xf32>, tensor<192x32x64xf32>, tensor<192x32x64xf32>, f32, tensor<192x1024x32xf32>
+// CHECK:         iree_linalg_ext.online_attention {{.*}} ins(%{{[A-Za-z0-9_]+}}, %{{[A-Za-z0-9_]+}}, %{{[A-Za-z0-9_]+}}, %{{[A-Za-z0-9_]+}}, %{{[A-Za-z0-9_]+}}
+// CHECK-SAME:      : tensor<192x1024x64xf32>, tensor<192x32x64xf32>, tensor<192x32x64xf32>, f32, tensor<192x1024x32xf32>)
 %out:3 = iree_linalg_ext.online_attention
   {
     indexing_maps = [#mapQ, #mapK, #mapV, #mapS, #mapM, #mapO, #mapR, #mapR],

compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_apply_padding_partial_reduction.mlir

Lines changed: 11 additions & 11 deletions
@@ -58,7 +58,7 @@ func.func @sum_exp_sub_reduction(%arg0: tensor<1x?xf32>, %arg1: tensor<1xf32>, %
 // So we check that the selected value in the padded region is one of the
 // NaN values.

-// CHECK-LABEL: max_reduction
+// CHECK-LABEL: func.func @max_reduction
 // CHECK-DAG:     %[[NANVAL:.+]] = arith.constant 0xFFC00000 : f32
 // CHECK-DAG:     %[[C1:.+]] = arith.constant 1 : index
 // CHECK-DAG:     %[[DIMARG0:.+]] = tensor.dim %arg0, %[[C1]] : tensor<1x?xf32>
@@ -86,7 +86,7 @@ func.func @max_reduction(%arg0: tensor<1x?xf32>, %arg1: tensor<1xf32>) -> tensor

 // -----

-// CHECK-LABEL: min_reduction
+// CHECK-LABEL: func.func @min_reduction
 // CHECK-DAG:     %[[NANVAL:.+]] = arith.constant 0x7FC00000 : f32
 // CHECK-DAG:     %[[C1:.+]] = arith.constant 1 : index
 // CHECK-DAG:     %[[DIMARG0:.+]] = tensor.dim %arg0, %[[C1]] : tensor<1x?xf32>
@@ -117,7 +117,7 @@ func.func @min_reduction(%arg0: tensor<1x?xf32>, %arg1: tensor<1xf32>) -> tensor

 // This reduction corresponds to a standard inner product.

-// CHECK-LABEL: standard_inner_product
+// CHECK-LABEL: func.func @standard_inner_product
 // CHECK-DAG:     %[[ZERO:.+]] = arith.constant 0.000000e+00 : f16
 // CHECK-DAG:     %[[C1:.+]] = arith.constant 1 : index
 // CHECK-DAG:     %[[DIMARG0:.+]] = tensor.dim %arg0, %[[C1]] : tensor<1x?xf16>
@@ -150,7 +150,7 @@ func.func @standard_inner_product(%arg0 : tensor<1x?xf16>, %arg1 : tensor<1x?xf1
 // Inner product where the accumulation (add) is in f16 but the multiplication is in f32
 // Check for an f16 zero as the reduction identity.

-// CHECK-LABEL: standard_inner_product_with_trunc
+// CHECK-LABEL: func.func @standard_inner_product_with_trunc
 // CHECK-DAG:     %[[ZERO:.+]] = arith.constant 0.000000e+00 : f16
 // CHECK-DAG:     %[[C1:.+]] = arith.constant 1 : index
 // CHECK-DAG:     %[[DIMARG0:.+]] = tensor.dim %arg0, %[[C1]] : tensor<1x?xf32>
@@ -189,7 +189,7 @@ func.func @standard_inner_product_with_trunc(%arg0 : tensor<1x?xf32>, %arg1 : te
 // In this example, the reduction type is multiplicative, so we check that
 // the value selected in the padded part of the iteration space is 1, the multiplicative identity.

-// CHECK-LABEL: product_of_sum_reduction
+// CHECK-LABEL: func.func @product_of_sum_reduction
 // CHECK:         %[[ONE:.+]] = arith.constant 1.000000e+00 : f16
 // CHECK:         %[[CMP:.+]] = arith.cmpi
 // CHECK:         %[[ADD:.+]] = arith.addf
@@ -215,7 +215,7 @@ func.func @product_of_sum_reduction(%arg0 : tensor<1x?xf16>, %arg1 : tensor<1x?x
 // Reductions in multiple dimensions have a 2-D region to check for padding.
 // Check for 2 compare ops, and an 'and' to combine them

-// CHECK-LABEL: multi_dim_reduction
+// CHECK-LABEL: func.func @multi_dim_reduction
 // CHECK-SAME:    (%[[ARG0:[0-9a-zA-Z]+]]: tensor<?x?xf16>, %
 // CHECK-DAG:     %[[ZEROF16:.+]] = arith.constant 0.000000e+00 : f16
 // CHECK-DAG:     %[[C0:.+]] = arith.constant 0 : index
@@ -253,11 +253,11 @@ func.func @multi_dim_reduction(%arg0 : tensor<?x?xf16>, %arg1 : tensor<?x?xf16>,

 // Multiple reductions in parallel in a linalg.generic op.

-// CHECK-LABEL: minmax_reduction
+// CHECK-LABEL: func.func @minmax_reduction
 // CHECK-DAG:     %[[NAN0:.+]] = arith.constant 0xFFC00000 : f32
 // CHECK-DAG:     %[[NAN1:.+]] = arith.constant 0x7FC00000 : f32
-// CHECK-DAG:     %[[SELECT0:.+]] = arith.select {{.*}} %[[NAN0]] : f32
-// CHECK-DAG:     %[[SELECT0:.+]] = arith.select {{.*}} %[[NAN1]] : f32
+// CHECK-DAG:     {{.+}} = arith.select {{.+}}, {{.+}}, %[[NAN0]] : f32
+// CHECK-DAG:     {{.+}} = arith.select {{.+}}, {{.+}}, %[[NAN1]] : f32
 #map = affine_map<(d0, d1) -> (d0, d1)>
 #map1 = affine_map<(d0, d1) -> (d0)>
 func.func @minmax_reduction(%arg0: tensor<1x?xf32>, %arg1: tensor<1xf32>, %arg2 : tensor<1xf32>) -> (tensor<1xf32>, tensor<1xf32>) {
@@ -280,7 +280,7 @@ func.func @minmax_reduction(%arg0: tensor<1x?xf32>, %arg1: tensor<1xf32>, %arg2
 // The reduction dimension is perfectly tiled by the partial reduction (1024 % 128 == 0).
 // We confirm that we directly optimize this case, where the arith.select condition is always true.

-// CHECK-LABEL: reduction_static_complete_tile
+// CHECK-LABEL: func.func @reduction_static_complete_tile
 // CHECK-NOT:     arith.select
 #map = affine_map<(d0, d1) -> (d0, d1)>
 #map1 = affine_map<(d0, d1) -> (d0)>
@@ -301,7 +301,7 @@ func.func @reduction_static_complete_tile(%arg0: tensor<1x1024xf32>, %arg1: tens

 // The reduction dimension is not perfectly tiled by the partial reduction (1024 % 100 != 0).

-// CHECK-LABEL: reduction_static_incomplete_tile
+// CHECK-LABEL: func.func @reduction_static_incomplete_tile
 // CHECK:         arith.select
 #map = affine_map<(d0, d1) -> (d0, d1)>
 #map1 = affine_map<(d0, d1) -> (d0)>
