@@ -9,13 +9,9 @@ func.func @memref_copy(%source: memref<2x2xf32>, %dest: memref<2x2xf32>) {
99// CHECK-SAME: %[[SOURCE:[A-Za-z0-9]+]]: memref<2x2xf32>
1010// CHECK-SAME: %[[DEST:[A-Za-z0-9]+]]: memref<2x2xf32>
1111// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
12- // CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
13- // CHECK: scf.for %[[ARG2:.+]] = %[[C0]] to %[[C2]] step %[[C2]]
14- // CHECK: scf.for %[[ARG3:.+]] = %[[C0]] to %[[C2]] step %[[C2]]
15- // CHECK: %[[SOURCE_SUBVIEW:.+]] = memref.subview %[[SOURCE]][%[[ARG2]], %[[ARG3]]] [2, 2] [1, 1]
16- // CHECK: %[[DEST_SUBVIEW:.+]] = memref.subview %[[DEST]][%[[ARG2]], %[[ARG3]]] [2, 2] [1, 1]
17- // CHECK: %[[RD:.+]] = vector.transfer_read %[[SOURCE_SUBVIEW]]
18- // CHECK: vector.transfer_write %[[RD]], %[[DEST_SUBVIEW]]
12+ // CHECK-DAG: %[[POISON:.+]] = ub.poison : f32
13+ // CHECK: %[[RD:.+]] = vector.transfer_read %[[SOURCE]][%[[C0]], %[[C0]]], %[[POISON]] {in_bounds = [true, true]} : memref<2x2xf32>, vector<2x2xf32>
14+ // CHECK: vector.transfer_write %[[RD]], %[[DEST]][%[[C0]], %[[C0]]] {in_bounds = [true, true]} : vector<2x2xf32>, memref<2x2xf32>
1915
2016// -----
2117
@@ -28,13 +24,9 @@ func.func @linalg_copy(%source: memref<2x2xf32>, %dest: memref<2x2xf32>) {
2824// CHECK-SAME: %[[SOURCE:[A-Za-z0-9]+]]: memref<2x2xf32>
2925// CHECK-SAME: %[[DEST:[A-Za-z0-9]+]]: memref<2x2xf32>
3026// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
31- // CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
32- // CHECK: scf.for %[[ARG2:.+]] = %[[C0]] to %[[C2]] step %[[C2]]
33- // CHECK: scf.for %[[ARG3:.+]] = %[[C0]] to %[[C2]] step %[[C2]]
34- // CHECK: %[[SOURCE_SUBVIEW:.+]] = memref.subview %[[SOURCE]][%[[ARG2]], %[[ARG3]]] [2, 2] [1, 1]
35- // CHECK: %[[DEST_SUBVIEW:.+]] = memref.subview %[[DEST]][%[[ARG2]], %[[ARG3]]] [2, 2] [1, 1]
36- // CHECK: %[[RD:.+]] = vector.transfer_read %[[SOURCE_SUBVIEW]]
37- // CHECK: vector.transfer_write %[[RD]], %[[DEST_SUBVIEW]]
27+ // CHECK-DAG: %[[POISON:.+]] = ub.poison : f32
28+ // CHECK: %[[RD:.+]] = vector.transfer_read %[[SOURCE]][%[[C0]], %[[C0]]], %[[POISON]] {in_bounds = [true, true]} : memref<2x2xf32>, vector<2x2xf32>
29+ // CHECK: vector.transfer_write %[[RD]], %[[DEST]][%[[C0]], %[[C0]]] {in_bounds = [true, true]} : vector<2x2xf32>, memref<2x2xf32>
3830
3931// -----
4032
@@ -44,6 +36,7 @@ func.func @memref_copy_not_multiple_of_preferred(%source: memref<2x6xf32>, %dest
4436 memref.copy %source , %dest : memref <2 x6 xf32 > to memref <2 x6 xf32 >
4537 return
4638}
39+
4740// CHECK-LABEL: func.func @memref_copy_not_multiple_of_preferred
4841// CHECK-SAME: %[[SOURCE:[A-Za-z0-9]+]]: memref<2x6xf32>
4942// CHECK-SAME: %[[DEST:[A-Za-z0-9]+]]: memref<2x6xf32>
@@ -74,11 +67,10 @@ func.func @memref_copy_not_multiple_on_penultimate_dim(%source: memref<3x2xf32>,
7467// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
7568// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index
7669// CHECK: scf.for %[[ARG2:.+]] = %[[C0]] to %[[C3]] step %[[C2]]
77- // CHECK: scf.for %[[ARG3:.+]] = %[[C0]] to %[[C2]] step %[[C2]]
78- // CHECK: %[[MIN:.+]] = affine.min affine_map<(d0) -> (-d0 + 3, 2)>(%[[ARG2]])
79- // CHECK: %[[SOURCE_SUBVIEW:.+]] = memref.subview %[[SOURCE]][%[[ARG2]], %[[ARG3]]] [%[[MIN]], 2] [1, 1]
80- // CHECK: %[[DEST_SUBVIEW:.+]] = memref.subview %[[DEST]][%[[ARG2]], %[[ARG3]]] [%[[MIN]], 2] [1, 1]
81- // CHECK: memref.copy %[[SOURCE_SUBVIEW]], %[[DEST_SUBVIEW]]
70+ // CHECK: %[[MIN:.+]] = affine.min affine_map<(d0) -> (-d0 + 3, 2)>(%[[ARG2]])
71+ // CHECK: %[[SOURCE_SUBVIEW:.+]] = memref.subview %[[SOURCE]][%[[ARG2]], 0] [%[[MIN]], 2] [1, 1]
72+ // CHECK: %[[DEST_SUBVIEW:.+]] = memref.subview %[[DEST]][%[[ARG2]], 0] [%[[MIN]], 2] [1, 1]
73+ // CHECK: memref.copy %[[SOURCE_SUBVIEW]], %[[DEST_SUBVIEW]]
8274
8375// -----
8476
@@ -91,14 +83,12 @@ func.func @memref_copy_dynamic(%source: memref<?x4xf32>, %dest: memref<?x4xf32>)
9183// CHECK-SAME: %[[DEST:[A-Za-z0-9]+]]: memref<?x4xf32>
9284// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
9385// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
94- // CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
9586// CHECK-DAG: %[[DIM:.+]] = memref.dim %[[SOURCE]], %[[C0]] : memref<?x4xf32>
9687// CHECK: scf.for %[[ARG2:.+]] = %[[C0]] to %[[DIM]] step %[[C1]]
97- // CHECK: scf.for %[[ARG3:.+]] = %[[C0]] to %[[C4]] step %[[C4]]
98- // CHECK: %[[SOURCE_SUBVIEW:.+]] = memref.subview %[[SOURCE]][%[[ARG2]], %[[ARG3]]] [1, 4] [1, 1]
99- // CHECK: %[[DEST_SUBVIEW:.+]] = memref.subview %[[DEST]][%[[ARG2]], %[[ARG3]]] [1, 4] [1, 1]
100- // CHECK: %[[RD:.+]] = vector.transfer_read %[[SOURCE_SUBVIEW]]
101- // CHECK: vector.transfer_write %[[RD]], %[[DEST_SUBVIEW]]
88+ // CHECK: %[[SOURCE_SUBVIEW:.+]] = memref.subview %[[SOURCE]][%[[ARG2]], 0] [1, 4] [1, 1]
89+ // CHECK: %[[DEST_SUBVIEW:.+]] = memref.subview %[[DEST]][%[[ARG2]], 0] [1, 4] [1, 1]
90+ // CHECK: %[[RD:.+]] = vector.transfer_read %[[SOURCE_SUBVIEW]]
91+ // CHECK: vector.transfer_write %[[RD]], %[[DEST_SUBVIEW]]
10292
10393// -----
10494
@@ -119,3 +109,68 @@ func.func @memref_copy_dynamic_inner_dim(%source: memref<4x?xf32>, %dest: memref
119109// CHECK: %[[SOURCE_SUBVIEW:.+]] = memref.subview %[[SOURCE]][%[[ARG2]], %[[ARG3]]] [1, %[[MIN]]] [1, 1]
120110// CHECK: %[[DEST_SUBVIEW:.+]] = memref.subview %[[DEST]][%[[ARG2]], %[[ARG3]]] [1, %[[MIN]]] [1, 1]
121111// CHECK: memref.copy %[[SOURCE_SUBVIEW]], %[[DEST_SUBVIEW]]
112+
113+ // -----
114+
115+ // Test that the single-iteration loops are removed and the subview ops are canonicalized
116+ // (`memref<1x?xbf16, strided<[4, 1]>>` instead of `memref<1x?xbf16, strided<[4, 1], offset: ?>>`).
117+
// Test input: inside an `scf.forall` with 3 iterations, a `memref.copy` moves a
// dynamically sized subview of %source into an equally sized subview of %dest.
// Both subview extents are produced by the affine.min/affine.max chains below.
118+ func.func @memref_copy_fully_dynamic (%source: memref <1 x4 xbf16 >, %dest: memref <32 x?xbf16 , strided <[40 , 1 ], offset : ?>>, %dim: index ) {
119+ %c0 = arith.constant 0 : index
120+ scf.forall (%arg0 ) in (3 ) {
121+ %0 = affine.min affine_map <(d0 ) -> (d0 * -16 + 40 , 16 )>(%arg0 )
// %3 and %4 are the offsets used for the %dest subview (dim 1 and dim 0
// respectively); they are rebuilt from pieces of the delinearized %dim.
122+ %1:2 = affine.delinearize_index %dim into (2 , 64 ) : index , index
123+ %2:3 = affine.delinearize_index %1#1 into (4 , 16 ) : index , index , index
124+ %3 = affine.linearize_index disjoint [%2 #1 , %c0 ] by (4 , 4 ) : index
125+ %4 = affine.linearize_index disjoint [%1 #0 , %2 #2 ] by (2 , 16 ) : index
// %6 = min(1, max(32 - %4, 0)): extent along dim 0, clamped to [0, 1].
126+ %5 = affine.max affine_map <()[s0 ] -> (-s0 + 32 , 0 )>()[%4 ]
127+ %6 = affine.min affine_map <()[s0 ] -> (1 , s0 )>()[%5 ]
// %8 = min(4, max(0, %0 - %3)): extent along dim 1, clamped to [0, 4].
128+ %7 = affine.max affine_map <(d0 )[s0 ] -> (0 , d0 - s0 )>(%0 )[%3 ]
129+ %8 = affine.min affine_map <(d0 ) -> (4 , d0 )>(%7 )
// Source and destination subviews share the sizes [%6, %8]; only the
// destination uses non-zero (dynamic) offsets.
130+ %subview_0 = memref.subview %source [0 , 0 ] [%6 , %8 ] [1 , 1 ] : memref <1 x4 xbf16 > to memref <?x?xbf16 , strided <[4 , 1 ]>>
131+ %subview_1 = memref.subview %dest [%4 , %3 ] [%6 , %8 ] [1 , 1 ] : memref <32 x?xbf16 , strided <[40 , 1 ], offset : ?>> to memref <?x?xbf16 , strided <[40 , 1 ], offset : ?>>
132+ memref.copy %subview_0 , %subview_1 : memref <?x?xbf16 , strided <[4 , 1 ]>> to memref <?x?xbf16 , strided <[40 , 1 ], offset : ?>>
133+ }
134+ return
135+ }
136+ // CHECK-LABEL: func.func @memref_copy_fully_dynamic
137+ // CHECK-SAME: %[[SOURCE:[A-Za-z0-9]+]]: memref<1x4xbf16>
138+ // CHECK-SAME: %[[DEST:[A-Za-z0-9]+]]: memref<32x?xbf16, strided<[40, 1], offset: ?>>
139+ // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
140+ // CHECK-DAG: %[[LIN_0:.+]] = affine.linearize_index disjoint [%{{.+}}, %{{.+}}] by (4, 4) : index
141+ // CHECK-DAG: %[[LIN_1:.+]] = affine.linearize_index disjoint [%{{.+}}, %{{.+}}] by (2, 16) : index
142+ // CHECK-DAG: %[[MIN_0:.+]] = affine.min affine_map<()[s0] -> (1, s0)>()[%{{.+}}]
143+ // CHECK-DAG: %[[MIN_1:.+]] = affine.min affine_map<(d0) -> (4, d0)>(%{{.+}})
144+ // CHECK-DAG: %[[SUBVIEW_0:.+]] = memref.subview %[[SOURCE]][0, 0] [%[[MIN_0]], %[[MIN_1]]] [1, 1]
145+ // CHECK-SAME: memref<1x4xbf16> to memref<?x?xbf16, strided<[4, 1]>>
146+ // CHECK-DAG: %[[SUBVIEW_1:.+]] = memref.subview %[[DEST]][%[[LIN_1]], %[[LIN_0]]] [%[[MIN_0]], %[[MIN_1]]] [1, 1]
147+ // CHECK-SAME: memref<32x?xbf16, strided<[40, 1], offset: ?>> to memref<?x?xbf16, strided<[40, 1], offset: ?>>
148+ // CHECK-DAG: %[[CMP_0:.+]] = arith.cmpi sgt, %[[MIN_0]], %[[C0]] : index
149+ // CHECK: scf.if %[[CMP_0]] {
150+ // CHECK: %[[CMP_1:.+]] = arith.cmpi sgt, %[[MIN_1]], %[[C0]] : index
151+ // CHECK: scf.if %[[CMP_1]] {
152+ // CHECK: %[[MIN_2:.+]] = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 8)>(%[[C0]])[%[[MIN_1]]]
153+ // CHECK: %[[SUBVIEW_2:.+]] = memref.subview %[[SUBVIEW_0]][0, 0] [1, %[[MIN_2]]] [1, 1]
154+ // CHECK-SAME: memref<?x?xbf16, strided<[4, 1]>> to memref<1x?xbf16, strided<[4, 1]>>
155+ // CHECK: %[[SUBVIEW_3:.+]] = memref.subview %[[SUBVIEW_1]][0, 0] [1, %[[MIN_2]]] [1, 1]
156+ // CHECK-SAME: memref<?x?xbf16, strided<[40, 1], offset: ?>> to memref<1x?xbf16, strided<[40, 1], offset: ?>>
157+ // CHECK: memref.copy %[[SUBVIEW_2]], %[[SUBVIEW_3]]
158+
159+ // -----
160+
161+ // Test that scf.for operations with `_is_tiled` attribute are simplified. The `memref.copy` should still be vectorized as well.
162+
// Test input: a `memref.copy` of 4x1 subviews wrapped in an `scf.for` that
// carries the `_is_tiled` attribute.
163+ func.func @for_with_tiled_attr (%source: memref <4 x?xf32 >, %dest: memref <4 x?xf32 >) {
164+ %c0 = arith.constant 0 : index
165+ %c1 = arith.constant 1 : index
// Bounds are %c0 to %c1 with step %c1, so the loop body executes exactly once.
166+ scf.for %arg0 = %c0 to %c1 step %c1 {
167+ %subview_0 = memref.subview %source [%arg0 , 0 ] [4 , 1 ] [1 , 1 ] : memref <4 x?xf32 > to memref <4 x1 xf32 , strided <[?, 1 ], offset : ?>>
168+ %subview_1 = memref.subview %dest [%arg0 , 0 ] [4 , 1 ] [1 , 1 ] : memref <4 x?xf32 > to memref <4 x1 xf32 , strided <[?, 1 ], offset : ?>>
169+ memref.copy %subview_0 , %subview_1 : memref <4 x1 xf32 , strided <[?, 1 ], offset : ?>> to memref <4 x1 xf32 , strided <[?, 1 ], offset : ?>>
// `_is_tiled` is attached to the scf.for op itself (trailing attribute dict).
170+ } {_is_tiled }
171+ return
172+ }
173+ // CHECK-LABEL: func.func @for_with_tiled_attr
174+ // CHECK-NOT: scf.for
175+ // CHECK: vector.transfer_read
176+ // CHECK: vector.transfer_write
0 commit comments