iree-org
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/2d_scalable_to_1d_scalable.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/2d_scalable_to_1d_scalable.mlir
Lines changed: 4 additions & 4 deletions
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/aarch64_vector_lowering.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/aarch64_vector_lowering.mlir
Lines changed: 7 additions & 10 deletions
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/check_ir_before_llvm_conversion.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/check_ir_before_llvm_conversion.mlir
Lines changed: 2 additions & 2 deletions
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/convert_to_llvm.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/convert_to_llvm.mlir
Lines changed: 4 additions & 7 deletions
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/expand_f16_op_to_f32.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/expand_f16_op_to_f32.mlir
Lines changed: 3 additions & 3 deletions
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/hal_interface_bindings.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/hal_interface_bindings.mlir
Lines changed: 5 additions & 5 deletions
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/illegal_configuration.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/illegal_configuration.mlir
Lines changed: 3 additions & 3 deletions
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/peel.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/peel.mlir
Lines changed: 2 additions & 2 deletions
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_pad_conv_tests.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_pad_conv_tests.mlir
Lines changed: 11 additions & 29 deletions
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_pad_tests.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_pad_tests.mlir
Lines changed: 0 additions & 1 deletion
@@ -91,12 +91,12 @@ func.func @scalable_2d_matmul_and_generic(%arg0: tensor<32400x32xf32>, %arg1: te
 
 #lowering_config_parallel_only =  #iree_cpu.lowering_config<distribution = [0, 0], vector_common_parallel =  [[4], [4]]>
 
-// CHECK: #[[GENERIC_CONFIG:.*]] = #iree_cpu.lowering_config<distribution = [0, 0], vector_common_parallel = [4, [4]]>
-///
-//      CHECK: func.func @should_not_crash
+// CHECK: #[[$GENERIC_CONFIG:.*]] = #iree_cpu.lowering_config<distribution = [0, 0], vector_common_parallel = [4, [4]]>
+//
+// CHECK-LABEL: func.func @should_not_crash
 //      CHECK:   scf.for
 //      CHECK:         linalg.generic
-// CHECK-SAME:           lowering_config = #[[GENERIC_CONFIG]]
+// CHECK-SAME:           lowering_config = #[[$GENERIC_CONFIG]]
 func.func @should_not_crash(%a: tensor<?x?xf32>, %b: tensor<?xf32>, %c: tensor<?x?xf32>) -> tensor<?x?xf32> {
   %0 = linalg.generic {
     indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>],
 
@@ -56,9 +56,7 @@ func.func @dot_384x512x128_dispatch_0() {
   }
   return
 }
-//      CHECK: #[[MAP:.+]] = affine_map<()[s0] -> (s0 * 64)>
 //      CHECK: func.func @dot_384x512x128_dispatch_0() {
-//  CHECK-DAG: %[[CST:.+]] = arith.constant 0.000000e+00 : f32
 //  CHECK-DAG: %[[CST_VECTOR:.+]] = arith.constant dense<0.000000e+00> : vector<16x16xf32>
 //  CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
 //  CHECK-DAG: %[[C384:.+]] = arith.constant 384 : index
@@ -69,25 +67,24 @@ func.func @dot_384x512x128_dispatch_0() {
 //  CHECK-DAG: %[[C64:.+]] = arith.constant 64 : index
 //      CHECK: %[[LHS:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(0) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<384x512xf32>>
 //      CHECK: %[[RHS:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(1) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<512x128xf32>>
-//      CHECK: %[[DST:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(2) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<384x128xf32>>
 //      CHECK: %[[DST_TILE_INIT:.+]] = tensor.empty()
-//      CHECK: scf.for %[[I_IDX:.+]] = {{.*}} to %[[C384]] step %{{[0-9]*}} {
+//      CHECK: scf.for {{.+}} = {{.*}} to %[[C384]] step %{{[0-9]*}} {
 //      CHECK:   %[[LHS_TILE:.+]] = iree_tensor_ext.dispatch.tensor.load %[[LHS]], {{.*}} -> tensor<64x512xf32>
-//      CHECK:   scf.for %[[J_IDX:.+]] = {{.*}} to %[[C128]] step %{{[0-9]*}} {
+//      CHECK:   scf.for {{.+}} = {{.*}} to %[[C128]] step %{{[0-9]*}} {
 //      CHECK:     %[[RHS_TILE:.+]] = iree_tensor_ext.dispatch.tensor.load %[[RHS]], {{.*}} -> tensor<512x64xf32>
 //      CHECK:     {{.*}} = scf.for %[[L1_I:.+]] = %[[C0]] to %[[C64]] step %[[C16]]
 // CHECK-SAME:       iter_args(%[[ITER0:.+]] = %[[DST_TILE_INIT]]) -> (tensor<64x64xf32>)
 //      CHECK:       {{.*}} = scf.for %[[L1_J:.+]] = %[[C0]] to %[[C64]] step %[[C16]]
 // CHECK-SAME:         iter_args(%[[ITER1:.+]] = %[[ITER0]]) -> (tensor<64x64xf32>)
-//      CHECK:         %[[MATMUL_RES:.+]] = scf.for %[[L1_K:.+]] = %[[C0]] to %[[C512]] step %[[C32]]
-// CHECK-SAME:           iter_args(%[[ITER2:.+]] = %[[CST_VECTOR]]) -> (vector<16x16xf32>)
+//      CHECK:         %[[MATMUL_RES:.+]] = scf.for {{.+}} = %[[C0]] to %[[C512]] step %[[C32]]
+// CHECK-SAME:           iter_args({{.+}} = %[[CST_VECTOR]]) -> (vector<16x16xf32>)
 //  CHECK-DAG:           {{.*}} = tensor.extract %[[LHS_TILE]]
-//  CHECK-DAD:           {{.*}} = vector.transfer_read %[[RHS_TILE]]
+//  CHECK-DAG:           {{.*}} = vector.transfer_read %[[RHS_TILE]]
 // CHECK-COUNT-32:       vector.fma
 //      CHECK:           scf.yield %{{.*}} : vector<16x16xf32>
 //      CHECK:         %[[EXP:.+]] = math.exp %[[MATMUL_RES]] : vector<16x16xf32>
-//      CHECK:         %[[RES:.+]] = vector.transfer_write %[[EXP]], %[[ITER1]][%[[L1_I]], %[[L1_J]]] {{.*}} : vector<16x16xf32>, tensor<64x64xf32>
-//      CHECK:         scf.yield %[[RES]]
+//      CHECK:         {{.+}} = vector.transfer_write %[[EXP]], %[[ITER1]][%[[L1_I]], %[[L1_J]]] {{.*}} : vector<16x16xf32>, tensor<64x64xf32>
+//      CHECK:         scf.yield {{.+}}
 
 // -----
 
 
@@ -9,7 +9,7 @@ func.func @dynamic_allocas(%arg0: index) {
 // -----
 
 // expected-error @+1 {{exceeded stack allocation limit of 32768 bytes for function. Got 65536 bytes}}
-func.func @static_big_allocas(%arg0: index) {
+func.func @static_big_allocas() {
   %0 = memref.alloca() : memref<16384xi32>
   return
 }
@@ -36,7 +36,7 @@ func.func @mix_static_and_dynamic_allocas(%arg0: index) {
 
 // -----
 
-func.func @non_entry_bb_allocas(%arg0: index) {
+func.func @non_entry_bb_allocas() {
   cf.br ^bb1
  ^bb1() :
   // expected-error @+1 {{all stack allocations need to be hoisted to the entry block of the function}}
 
@@ -8,9 +8,9 @@ builtin.module {
 }
 //      CHECK: llvm.func @extern_public()
 //      CHECK: llvm.func @entry_point(
-// CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef},
-// CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef},
-// CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32
+// CHECK-SAME: %{{[A-Za-z0-9_]+}}: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef},
+// CHECK-SAME: %{{[A-Za-z0-9_]+}}: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef},
+// CHECK-SAME: %{{[A-Za-z0-9_]+}}: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32
 //      CHECK:     llvm.return %{{.+}} : i32
 
 // -----
@@ -39,7 +39,7 @@ module {
 //  CHECK-DAG:   %[[PROCESSOR_INFO:.+]] = llvm.load %arg2
 //      CHECK:   %[[PROCESSOR_ID:.+]] = llvm.extractvalue %[[PROCESSOR_INFO]][4]
 //      CHECK: %[[VAL:.+]] = llvm.call @default_cconv_with_extra_fields
-// CHECK-SAME:     (%[[ALLOCA]], %[[Ci32]], %[[Cf64]], %[[DATA]], %[[PROCESSOR_ID]])
+// CHECK-SAME: (%[[ALLOCA]], %[[Ci32]], %[[Cf64]], %[[DATA]], %[[PROCESSOR_ID]])
 
 // -----
 
@@ -51,9 +51,6 @@ func.func @interleave_and_bitcast_lowering() {
   %cst = arith.constant dense<4> : vector<4x2xi8>
   %cst_0 = arith.constant dense<0> : vector<4x4xi4>
   %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %c2 = arith.constant 2 : index
-  %c3 = arith.constant 3 : index
   %c4096 = arith.constant 4096 : index
   %c8192 = arith.constant 8192 : index
   %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c4096) flags(ReadOnly) : memref<128xi8, strided<[1], offset: ?>>
 
@@ -20,6 +20,6 @@ func.func @maximumf(%arg0: tensor<4xf16>, %arg1: tensor<4xf16>, %arg2: tensor<4x
 // CHECK:           %[[LHS:.*]] = arith.extf %{{.+}} : f16 to f32
 // CHECK:           %[[RHS:.*]] = arith.extf %{{.+}} : f16 to f32
 // CHECK:           %[[MAX:.*]] = arith.maximumf %[[LHS]], %[[RHS]] : f32
-// CHECK:           %[[TRUNC:.*]] = arith.truncf %[[MAX]] : f32 to f16
-// CHECK:           linalg.yield %[[TRUNC:.*]] : f16
-// CHECK:         return %[[GEN:.*]] : tensor<4xf16>
+// CHECK:           %[[TRUNC:.+]] = arith.truncf %[[MAX]] : f32 to f16
+// CHECK:           linalg.yield %[[TRUNC]] : f16
+// CHECK:         return %[[GEN]] : tensor<4xf16>
@@ -5,7 +5,7 @@
   #hal.pipeline.binding<storage_buffer>
 ]>
 
-// CHECK-LABEL: llvm.func @binding_ptrs(
+// CHECK-LABEL: llvm.func @binding_ptrs
 func.func @binding_ptrs() {
   // CHECK-DAG: %[[C2:.+]] = llvm.mlir.constant(2
   // CHECK-DAG: %[[C5:.+]] = llvm.mlir.constant(5
@@ -43,7 +43,7 @@ llvm.func @sink(%arg0: f32) {
   #hal.pipeline.binding<storage_buffer>
 ]>
 
-// CHECK-LABEL: llvm.func @binding_ptrs_dynamic(
+// CHECK-LABEL: llvm.func @binding_ptrs_dynamic
 func.func @binding_ptrs_dynamic() {
   // CHECK-DAG: %[[C1:.+]] = llvm.mlir.constant(1 :
   // CHECK-DAG: %[[C8:.+]] = llvm.mlir.constant(8 :
@@ -68,7 +68,7 @@ func.func @binding_ptrs_dynamic() {
   // CHECK: %[[DIM2_PTR:.+]] = llvm.getelementptr %[[CONSTANT_BASEPTR0]][3]
   // CHECK: %[[DIM2:.+]] = llvm.load %[[DIM2_PTR]]
   // CHECK: %[[DIM2_ZEXT:.+]] = llvm.zext %[[DIM2]]
-  %dim0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1): index
+  %dim0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index
   %dim1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index
   %dim2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : index
 
@@ -109,7 +109,7 @@ llvm.func @sink(%arg0: f32) {
   #hal.pipeline.binding<storage_buffer>
 ]>
 
-// CHECK-LABEL: llvm.func @binding_ptrs_sub_byte_dynamic(
+// CHECK-LABEL: llvm.func @binding_ptrs_sub_byte_dynamic
 func.func @binding_ptrs_sub_byte_dynamic() {
   // CHECK-DAG: %[[C8:.+]] = llvm.mlir.constant(8 :
   // CHECK-DAG: %[[C4:.+]] = llvm.mlir.constant(4 :
@@ -119,7 +119,7 @@ func.func @binding_ptrs_sub_byte_dynamic() {
   // CHECK: %[[OFFSET:.+]] = llvm.load %[[CONSTANT_BASEPTR]]
   // CHECK: %[[OFFSET_ZEXT:.+]] = llvm.zext %[[OFFSET]]
   %offset = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
-  %dim0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1): index
+  %dim0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index
 
   // CHECK: %[[STATE3:.+]] = llvm.load %arg1
   // CHECK: %[[BINDING_PTRS:.+]] = llvm.extractvalue %[[STATE3]][10]
 
@@ -1,6 +1,6 @@
 // RUN: iree-opt --pass-pipeline='builtin.module(iree-llvmcpu-select-lowering-strategy)' --verify-diagnostics --split-input-file %s
 
-#config = #iree_cpu.lowering_config<distribution = [64, 64], vector_common_parallel = [8, 32, 16], vector_reduction = [0, 0, 16], vector_inner_parallel = [0, 0, 0]>
+#config = #iree_cpu.lowering_config<distribution = [64, 64], vector_common_parallel = [8, 32, 16], vector_reduction = [0, 0, 16]>
 #translation = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert>
 func.func @illegal_parallel_tile_sizes_config(%0: memref<4x8xf32>, %1: memref<8x16xf32>, %2: memref<4x16xf32>) attributes {
   translation_info = #translation
@@ -12,7 +12,7 @@ func.func @illegal_parallel_tile_sizes_config(%0: memref<4x8xf32>, %1: memref<8x
 
 // -----
 
-#config = #iree_cpu.lowering_config<distribution = [64, 64], vector_common_parallel = [8, 0, 0], vector_reduction = [0, 16, 16], vector_inner_parallel = [0, 0, 0]>
+#config = #iree_cpu.lowering_config<distribution = [64, 64], vector_common_parallel = [8, 0, 0], vector_reduction = [0, 16, 16]>
 #translation = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert>
 func.func @illegal_reduction_tile_sizes_config(%0: memref<4x8xf32>, %1: memref<8x16xf32>, %2: memref<4x16xf32>) attributes {
   translation_info = #translation
@@ -24,7 +24,7 @@ func.func @illegal_reduction_tile_sizes_config(%0: memref<4x8xf32>, %1: memref<8
 
 // -----
 
-#config = #iree_cpu.lowering_config<distribution = {sizes = [4, 8], interchange = [1]}, vector_common_parallel = [8, 8, 0], vector_reduction = [0, 0, 8], vector_inner_parallel = [0, 0, 0]>
+#config = #iree_cpu.lowering_config<distribution = {sizes = [4, 8], interchange = [1]}, vector_common_parallel = [8, 8, 0], vector_reduction = [0, 0, 8]>
 #translation = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert>
 func.func @illegal_interchange(%0: memref<4x8xf32>, %1: memref<8x16xf32>, %2: memref<4x16xf32>) attributes {
   translation_info = #translation
 
@@ -66,11 +66,11 @@ func.func @peel_static_matmul(%arg0: tensor<128x49xf32>, %arg1: tensor<49x512xf3
 // CHECK:                 linalg.matmul {{.*}} outs(%[[T0]] : tensor<8x32xf32>) -> tensor<8x32xf32>
 // CHECK:               scf.for
 // CHECK:                 linalg.fill {{.*}} -> tensor<8x?xf32>
-// CHECK:                 %[[T1:.+]] = scf.for
+// CHECK:                 scf.for
 // CHECK:                   linalg.matmul {{.*}} tensor<8x?xf32>
 // CHECK:               scf.for
 // CHECK:                 linalg.fill {{.*}} -> tensor<?x?xf32>
-// CHECK:                 %[[T2:.+]] = scf.for
+// CHECK:                 scf.for
 // CHECK:                   linalg.matmul {{.*}} tensor<?x?xf32>
 
 // -----
 
@@ -1,45 +1,27 @@
 // RUN: iree-opt --pass-pipeline='builtin.module(iree-llvmcpu-select-lowering-strategy, func.func(iree-llvmcpu-lower-executable-target))' --split-input-file %s | FileCheck %s
 
-#pipeline_layout = #hal.pipeline.layout<constants = 5, bindings = [
+#pipeline_layout = #hal.pipeline.layout<bindings = [
   #hal.pipeline.binding<storage_buffer>,
   #hal.pipeline.binding<storage_buffer>,
   #hal.pipeline.binding<storage_buffer>
 ]>
 #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 32 : index, target_triple = "x86_64-none-elf"}>
 func.func @pad_conv_2d_nchw_fchw_1x320x64x64x320x3x3() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
   %cst = arith.constant 0.000000e+00 : f32
-  %c1 = arith.constant 1 : index
   %c0 = arith.constant 0 : index
-  %c5243520 = arith.constant 5243520 : index
-  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
-  %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
-  %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
-  %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32
-  %4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : i32
-  %5 = arith.index_castui %0 {stream.alignment = 128 : index, stream.values = [10486400 : index, 15729280 : index]} : i32 to index
-  %6 = arith.index_castui %1 {stream.alignment = 256 : index, stream.values = [1273222400 : index, 1280618240 : index]} : i32 to index
-  %7 = arith.index_castui %2 {stream.alignment = 256 : index, stream.values = [10507520 : index, 21488640 : index]} : i32 to index
-  %8 = arith.index_castui %3 {stream.alignment = 256 : index, stream.values = [10508800 : index, 21489920 : index]} : i32 to index
-  %9 = arith.index_castui %4 {stream.alignment = 128 : index, stream.values = [10486400 : index, 10487680 : index]} : i32 to index
-  %10 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c5243520) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<1x320x64x64xf32>>
-  %11 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<320x320x3x3xf32>>
-  %12 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%7) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<1x320xf32>>
-  %13 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%8) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<1x320xf32>>
-  %14 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%5) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<1x320xf32>>
-  %15 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%9) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<1x320x64x64xf32>>
-  %16 = iree_tensor_ext.dispatch.tensor.load %10, offsets = [0, 0, 0, 0], sizes = [1, 320, 64, 64], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<1x320x64x64xf32>> -> tensor<1x320x64x64xf32>
-  %17 = iree_tensor_ext.dispatch.tensor.load %11, offsets = [0, 0, 0, 0], sizes = [320, 320, 3, 3], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<320x320x3x3xf32>> -> tensor<320x320x3x3xf32>
-  %18 = iree_tensor_ext.dispatch.tensor.load %12, offsets = [0, 0], sizes = [1, 320], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<1x320xf32>> -> tensor<1x320xf32>
-  %19 = iree_tensor_ext.dispatch.tensor.load %13, offsets = [0, 0], sizes = [1, 320], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<1x320xf32>> -> tensor<1x320xf32>
-  %20 = iree_tensor_ext.dispatch.tensor.load %14, offsets = [0, 0], sizes = [1, 320], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<1x320xf32>> -> tensor<1x320xf32>
-  %21 = tensor.empty() : tensor<1x320x64x64xf32>
-  %22 = linalg.fill ins(%cst : f32) outs(%21 : tensor<1x320x64x64xf32>) -> tensor<1x320x64x64xf32>
-  %padded = tensor.pad %16 low[0, 0, 1, 1] high[0, 0, 1, 1] {
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<1x320x64x64xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<320x320x3x3xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<1x320x64x64xf32>>
+  %3 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 320, 64, 64], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<1x320x64x64xf32>> -> tensor<1x320x64x64xf32>
+  %4 = iree_tensor_ext.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [320, 320, 3, 3], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<320x320x3x3xf32>> -> tensor<320x320x3x3xf32>
+  %5 = tensor.empty() : tensor<1x320x64x64xf32>
+  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x320x64x64xf32>) -> tensor<1x320x64x64xf32>
+  %padded = tensor.pad %3 low[0, 0, 1, 1] high[0, 0, 1, 1] {
   ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
     tensor.yield %cst : f32
   } : tensor<1x320x64x64xf32> to tensor<1x320x66x66xf32>
-  %23 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%padded, %17 : tensor<1x320x66x66xf32>, tensor<320x320x3x3xf32>) outs(%22 : tensor<1x320x64x64xf32>) -> tensor<1x320x64x64xf32>
-  iree_tensor_ext.dispatch.tensor.store %23, %15, offsets = [0, 0, 0, 0], sizes = [1, 320, 64, 64], strides = [1, 1, 1, 1] : tensor<1x320x64x64xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<1x320x64x64xf32>>
+  %7 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%padded, %4 : tensor<1x320x66x66xf32>, tensor<320x320x3x3xf32>) outs(%6 : tensor<1x320x64x64xf32>) -> tensor<1x320x64x64xf32>
+  iree_tensor_ext.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 320, 64, 64], strides = [1, 1, 1, 1] : tensor<1x320x64x64xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<1x320x64x64xf32>>
   return
 }
 
 
@@ -55,7 +55,6 @@ func.func @pad_only_dispatch() attributes {hal.executable.target = #executable_t
 #map1 = affine_map<(d0, d1, d2, d3) -> (d3)>
 func.func @pad_with_producer_dispatch() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
   %c0 = arith.constant 0 : index
-  %cst = arith.constant 1.001000e-05 : f32
   %cst_0 = arith.constant 0.000000e+00 : f32
   %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<1x56x56x256xf32>>
   %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<1x1x256x128xf32>>