@@ -136,7 +136,9 @@ func.func @vectorize_nd_tensor_extract_transfer_read_basic(
136136// CHECK: %[[READ:.*]] = vector.transfer_read %[[ARG0]][%[[IDX1]], %[[IDX2]], %[[C0:.*]]], %[[CST_0]] {in_bounds = [true, true, true]} : tensor<3x3x3xf32>, vector<1x1x3xf32>
137137// CHECK: vector.transfer_write %[[READ]], %[[ARG1]][%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<1x1x3xf32>, tensor<1x1x3xf32>
138138
139- // Same as example above, but reading into a column tensor.
139+ // Same as example above, but reading into a column tensor. Note that after the
140+ // vectorization, the `TransferOpReduceRank` will replace
141+ // `vector.transfer_read` with `tensor.extract -> scalar`.
140142
141143// TODO: Currently this fails to vectorise when the indices are non-constant.
142144
@@ -160,10 +162,9 @@ func.func @vectorize_nd_tensor_extract_transfer_read_basic_column(
160162// CHECK-LABEL: func.func @vectorize_nd_tensor_extract_transfer_read_basic_column(
161163// CHECK-SAME: %[[INPUT:.*]]: tensor<3x3x3xf32>,
162164// CHECK-SAME: %[[OUTPUT:.*]]: tensor<3x1x1xf32>)
163- // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
164- // CHECK-DAG: %[[CST_0:.*]] = arith.constant 0.000000e+00 : f32
165- // CHECK: %[[READ:.*]] = vector.transfer_read %[[INPUT]]{{\[}}%[[C0]], %[[C0]], %[[C0]]], %[[CST_0]] : tensor<3x3x3xf32>, vector<f32>
166- // CHECK: %[[BCAST:.*]] = vector.broadcast %[[READ]] : vector<f32> to vector<3x1x1xf32>
165+ // CHECK: %[[C0:.*]] = arith.constant 0 : index
166+ // CHECK: %[[EXTRACT:.*]] = tensor.extract %[[INPUT]]{{\[}}%[[C0]], %[[C0]], %[[C0]]] : tensor<3x3x3xf32>
167+ // CHECK: %[[BCAST:.*]] = vector.broadcast %[[EXTRACT]] : f32 to vector<3x1x1xf32>
167168// CHECK: %[[RES:.*]] = vector.transfer_write %[[BCAST]], %[[OUTPUT]]{{\[}}%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<3x1x1xf32>, tensor<3x1x1xf32>
168169// CHECK: return %[[RES]] : tensor<3x1x1xf32>
169170
@@ -540,9 +541,8 @@ func.func @vectorize_nd_tensor_extract_with_tensor_extract(%input_1: tensor<1x20
540541// CHECK-SAME: %[[INPUT_2:.*]]: tensor<257x24xf32>,
541542// CHECK: %[[EXTRACTED_0_IDX_0:.*]] = arith.constant 0 : index
542543// CHECK: %[[EXTRACTED_0_IDX_1:.*]] = vector.extractelement %{{.*}}[%{{.*}} : i32] : vector<4xindex>
543- // First `vector.transfer_read` from the generic Op - loop invariant scalar load.
544- // CHECK: vector.transfer_read %[[INPUT_1]][%[[EXTRACTED_0_IDX_0]], %[[EXTRACTED_0_IDX_1]]]
545- // CHECK-SAME: tensor<1x20xi32>, vector<i32>
544+ // First `tensor.extract` from the generic Op - loop invariant scalar load.
545+ // CHECK: tensor.extract %[[INPUT_1]][%[[EXTRACTED_0_IDX_0]], %[[EXTRACTED_0_IDX_1]]] : tensor<1x20xi32>
546546// The following `tensor.extract` from the generic Op s a contiguous load (all Ops used
547547// for address calculation also satisfy the required conditions).
548548// CHECK: vector.transfer_read %[[INPUT_2]][%{{.*}}, %{{.*}}, %{{.*}} {in_bounds = [true, true]} : tensor<257x24xf32>, vector<1x4xf32>
@@ -745,8 +745,8 @@ func.func @vectorize_0d_tensor_extract(%arg0: tensor<f32>, %arg2: tensor<1x1x3xf
745745
746746// CHECK-LABEL: func.func @vectorize_0d_tensor_extract(
747747// CHECK-SAME: %[[ARG_0:.*]]: tensor<f32>
748- // CHECK: %[[EXTRACT:.*]] = vector.transfer_read %[[ARG_0]][], %{{.+}} : tensor<f32>
749- // CHECK: vector.broadcast %[[EXTRACT]] : vector< f32> to vector<1x1x3xf32>
748+ // CHECK: %[[EXTRACT:.*]] = tensor.extract %[[ARG_0]][] : tensor<f32>
749+ // CHECK: vector.broadcast %[[EXTRACT]] : f32 to vector<1x1x3xf32>
750750
751751module attributes {transform.with_named_sequence } {
752752 transform.named_sequence @__transform_main (%arg1: !transform.any_op {transform.readonly }) {
0 commit comments