@@ -14,14 +14,14 @@ func.func @matmul_static(%arg0 : tensor<1024x2048xi32>, %arg1 : tensor<2048x512x
   %c0 = arith.constant 0 : index
   %5 = tensor.empty() : tensor<1024x512xi32>
   %6 = tensor.empty() : tensor<16x32x64x64xi32>
-  %pack = tensor.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %6 : tensor<1024x2048xi32> -> tensor<16x32x64x64xi32>
+  %pack = linalg.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %6 : tensor<1024x2048xi32> -> tensor<16x32x64x64xi32>
   %7 = tensor.empty() : tensor<32x8x64x64xi32>
-  %pack_0 = tensor.pack %arg1 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %7 : tensor<2048x512xi32> -> tensor<32x8x64x64xi32>
+  %pack_0 = linalg.pack %arg1 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %7 : tensor<2048x512xi32> -> tensor<32x8x64x64xi32>
   %8 = tensor.empty() : tensor<16x8x64x64xi32>
   %9 = tensor.empty() : tensor<16x32x16x8x4x8xi32>
-  %pack_1 = tensor.pack %pack inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %9 : tensor<16x32x64x64xi32> -> tensor<16x32x16x8x4x8xi32>
+  %pack_1 = linalg.pack %pack inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %9 : tensor<16x32x64x64xi32> -> tensor<16x32x16x8x4x8xi32>
   %10 = tensor.empty() : tensor<32x8x8x8x8x8xi32>
-  %pack_2 = tensor.pack %pack_0 inner_dims_pos = [3, 2] inner_tiles = [8, 8] into %10 : tensor<32x8x64x64xi32> -> tensor<32x8x8x8x8x8xi32>
+  %pack_2 = linalg.pack %pack_0 inner_dims_pos = [3, 2] inner_tiles = [8, 8] into %10 : tensor<32x8x64x64xi32> -> tensor<32x8x8x8x8x8xi32>
   %11 = tensor.empty() : tensor<16x8x16x8x4x8xi32>
   %12 = linalg.fill ins(%c0_i32 : i32) outs(%11 : tensor<16x8x16x8x4x8xi32>) -> tensor<16x8x16x8x4x8xi32>
   %13 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_1, %pack_2 : tensor<16x32x16x8x4x8xi32>, tensor<32x8x8x8x8x8xi32>) outs(%12 : tensor<16x8x16x8x4x8xi32>) {
@@ -30,63 +30,63 @@ func.func @matmul_static(%arg0 : tensor<1024x2048xi32>, %arg1 : tensor<2048x512x
     %15 = arith.addi %out, %14 : i32
     linalg.yield %15 : i32
   } -> tensor<16x8x16x8x4x8xi32>
-  %unpack = tensor.unpack %13 inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %8 : tensor<16x8x16x8x4x8xi32> -> tensor<16x8x64x64xi32>
-  %unpack_3 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %5 : tensor<16x8x64x64xi32> -> tensor<1024x512xi32>
+  %unpack = linalg.unpack %13 inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %8 : tensor<16x8x16x8x4x8xi32> -> tensor<16x8x64x64xi32>
+  %unpack_3 = linalg.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %5 : tensor<16x8x64x64xi32> -> tensor<1024x512xi32>
   return %unpack_3 : tensor<1024x512xi32>
 }
 
 // LINALG-INPUT-OUTPUT-NOT: memref.alloc
-// LINALG-INPUT-OUTPUT: tensor.pack
+// LINALG-INPUT-OUTPUT: linalg.pack
 // LINALG-INPUT-OUTPUT-NOT: memref.alloc
-// LINALG-INPUT-OUTPUT: tensor.pack
+// LINALG-INPUT-OUTPUT: linalg.pack
 // LINALG-INPUT-OUTPUT: memref.alloc() : memref<16x32x16x8x4x8xi32, 2 : i32>
 // LINALG-INPUT-OUTPUT: bufferization.to_tensor
-// LINALG-INPUT-OUTPUT: tensor.pack
+// LINALG-INPUT-OUTPUT: linalg.pack
 // LINALG-INPUT-OUTPUT: memref.alloc() : memref<32x8x8x8x8x8xi32, 2 : i32>
 // LINALG-INPUT-OUTPUT: bufferization.to_tensor
-// LINALG-INPUT-OUTPUT: tensor.pack
+// LINALG-INPUT-OUTPUT: linalg.pack
 // LINALG-INPUT-OUTPUT: memref.alloc() : memref<16x8x16x8x4x8xi32, 2 : i32>
 // LINALG-INPUT-OUTPUT: bufferization.to_tensor
 // LINALG-INPUT-OUTPUT: linalg.fill
 // LINALG-INPUT-OUTPUT: linalg.generic
 
 // LINALG-INPUT-NOT: memref.alloc
-// LINALG-INPUT: tensor.pack
+// LINALG-INPUT: linalg.pack
 // LINALG-INPUT-NOT: memref.alloc
-// LINALG-INPUT: tensor.pack
+// LINALG-INPUT: linalg.pack
 // LINALG-INPUT: memref.alloc() : memref<16x32x16x8x4x8xi32, 2 : i32>
 // LINALG-INPUT: bufferization.to_tensor
-// LINALG-INPUT: tensor.pack
+// LINALG-INPUT: linalg.pack
 // LINALG-INPUT: memref.alloc() : memref<32x8x8x8x8x8xi32, 2 : i32>
 // LINALG-INPUT: bufferization.to_tensor
-// LINALG-INPUT: tensor.pack
+// LINALG-INPUT: linalg.pack
 // LINALG-INPUT-NOT: memref.alloc
 // LINALG-INPUT: linalg.fill
 // LINALG-INPUT: linalg.generic
 
 // LINALG-OUTPUT-NOT: memref.alloc
-// LINALG-OUTPUT: tensor.pack
+// LINALG-OUTPUT: linalg.pack
 // LINALG-OUTPUT-NOT: memref.alloc
-// LINALG-OUTPUT: tensor.pack
+// LINALG-OUTPUT: linalg.pack
 // LINALG-OUTPUT-NOT: memref.alloc
-// LINALG-OUTPUT: tensor.pack
+// LINALG-OUTPUT: linalg.pack
 // LINALG-OUTPUT-NOT: memref.alloc
-// LINALG-OUTPUT: tensor.pack
+// LINALG-OUTPUT: linalg.pack
 // LINALG-OUTPUT: memref.alloc() : memref<16x8x16x8x4x8xi32, 2 : i32>
 // LINALG-OUTPUT: bufferization.to_tensor
 // LINALG-OUTPUT: linalg.fill
 // LINALG-OUTPUT: linalg.generic
 
 // PACK-INPUT: memref.alloc() : memref<16x32x64x64xi32, 1 : i32>
 // PACK-INPUT: bufferization.to_tensor
-// PACK-INPUT: tensor.pack
+// PACK-INPUT: linalg.pack
 // PACK-INPUT: memref.alloc() : memref<32x8x64x64xi32, 1 : i32>
 // PACK-INPUT: bufferization.to_tensor
-// PACK-INPUT: tensor.pack
+// PACK-INPUT: linalg.pack
 // PACK-INPUT-NOT: memref.alloc
-// PACK-INPUT: tensor.pack
+// PACK-INPUT: linalg.pack
 // PACK-INPUT-NOT: memref.alloc
-// PACK-INPUT: tensor.pack
+// PACK-INPUT: linalg.pack
 // PACK-INPUT-NOT: memref.alloc
 // PACK-INPUT: linalg.fill
 // PACK-INPUT: linalg.generic
@@ -105,14 +105,14 @@ func.func @matmul_elementwise(%arg0: tensor<1024x512xi8>, %arg1: tensor<512x1024
     %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg4] [512, 64] [1, 1] : tensor<512x1024xi8> to tensor<512x64xi8>
     %extracted_slice_1 = tensor.extract_slice %0[%arg3, %arg4] [64, 64] [1, 1] : tensor<1024x1024xi32> to tensor<64x64xi32>
     %2 = tensor.empty() : tensor<1x16x64x32xi8>
-    %pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %2 : tensor<64x512xi8> -> tensor<1x16x64x32xi8>
+    %pack = linalg.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %2 : tensor<64x512xi8> -> tensor<1x16x64x32xi8>
     %3 = tensor.empty() : tensor<16x1x32x64xi8>
-    %pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %3 : tensor<512x64xi8> -> tensor<16x1x32x64xi8>
+    %pack_2 = linalg.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %3 : tensor<512x64xi8> -> tensor<16x1x32x64xi8>
     %4 = tensor.empty() : tensor<1x1x64x64xi32>
     %5 = tensor.empty() : tensor<1x16x4x16x4x8xi8>
-    %pack_3 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %5 : tensor<1x16x64x32xi8> -> tensor<1x16x4x16x4x8xi8>
+    %pack_3 = linalg.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %5 : tensor<1x16x64x32xi8> -> tensor<1x16x4x16x4x8xi8>
     %6 = tensor.empty() : tensor<16x1x8x4x8x8xi8>
-    %pack_4 = tensor.pack %pack_2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 8] into %6 : tensor<16x1x32x64xi8> -> tensor<16x1x8x4x8x8xi8>
+    %pack_4 = linalg.pack %pack_2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 8] into %6 : tensor<16x1x32x64xi8> -> tensor<16x1x8x4x8x8xi8>
     %7 = tensor.empty() : tensor<1x1x8x16x4x8xi32>
     %8 = linalg.fill ins(%c0_i32 : i32) outs(%7 : tensor<1x1x8x16x4x8xi32>) -> tensor<1x1x8x16x4x8xi32>
     %9 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_3, %pack_4 : tensor<1x16x4x16x4x8xi8>, tensor<16x1x8x4x8x8xi8>) outs(%8 : tensor<1x1x8x16x4x8xi32>) {
@@ -125,49 +125,49 @@ func.func @matmul_elementwise(%arg0: tensor<1024x512xi8>, %arg1: tensor<512x1024
     } -> tensor<1x1x8x16x4x8xi32>
     %extracted_slice_5 = tensor.extract_slice %arg2[%arg3, %arg4] [64, 64] [1, 1] : tensor<1024x1024xi32> to tensor<64x64xi32>
     %extracted_slice_6 = tensor.extract_slice %arg5[%arg3, %arg4] [64, 64] [1, 1] : tensor<1024x1024xi32> to tensor<64x64xi32>
-    %pack_7 = tensor.pack %extracted_slice_6 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %4 : tensor<64x64xi32> -> tensor<1x1x64x64xi32>
-    %pack_8 = tensor.pack %extracted_slice_5 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %4 : tensor<64x64xi32> -> tensor<1x1x64x64xi32>
-    %pack_9 = tensor.pack %pack_7 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %7 : tensor<1x1x64x64xi32> -> tensor<1x1x8x16x4x8xi32>
-    %pack_10 = tensor.pack %pack_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %7 : tensor<1x1x64x64xi32> -> tensor<1x1x8x16x4x8xi32>
+    %pack_7 = linalg.pack %extracted_slice_6 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %4 : tensor<64x64xi32> -> tensor<1x1x64x64xi32>
+    %pack_8 = linalg.pack %extracted_slice_5 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %4 : tensor<64x64xi32> -> tensor<1x1x64x64xi32>
+    %pack_9 = linalg.pack %pack_7 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %7 : tensor<1x1x64x64xi32> -> tensor<1x1x8x16x4x8xi32>
+    %pack_10 = linalg.pack %pack_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %7 : tensor<1x1x64x64xi32> -> tensor<1x1x8x16x4x8xi32>
     %10 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%9, %pack_10 : tensor<1x1x8x16x4x8xi32>, tensor<1x1x8x16x4x8xi32>) outs(%pack_9 : tensor<1x1x8x16x4x8xi32>) {
     ^bb0(%in: i32, %in_12: i32, %out: i32):
       %11 = arith.addi %in, %in_12 : i32
       linalg.yield %11 : i32
     } -> tensor<1x1x8x16x4x8xi32>
-    %unpack = tensor.unpack %10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %4 : tensor<1x1x8x16x4x8xi32> -> tensor<1x1x64x64xi32>
-    %unpack_11 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
+    %unpack = linalg.unpack %10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %4 : tensor<1x1x8x16x4x8xi32> -> tensor<1x1x64x64xi32>
+    %unpack_11 = linalg.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
     scf.forall.in_parallel {
       tensor.parallel_insert_slice %unpack_11 into %arg5[%arg3, %arg4] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<1024x1024xi32>
     }
   } {mapping = [#gpu.block<y>, #gpu.block<x>]}
   return %1 : tensor<1024x1024xi32>
 }
 
-// ELEMENTWISE-INPUT-COUNT-4: tensor.pack
+// ELEMENTWISE-INPUT-COUNT-4: linalg.pack
 // ELEMENTWISE-INPUT: linalg.fill
 // ELEMENTWISE-INPUT: linalg.generic
 // ELEMENTWISE-INPUT-NOT: memref.alloc
-// ELEMENTWISE-INPUT: tensor.pack
+// ELEMENTWISE-INPUT: linalg.pack
 // ELEMENTWISE-INPUT-NOT: memref.alloc
-// ELEMENTWISE-INPUT: tensor.pack
+// ELEMENTWISE-INPUT: linalg.pack
 // ELEMENTWISE-INPUT-NOT: memref.alloc
-// ELEMENTWISE-INPUT: tensor.pack
+// ELEMENTWISE-INPUT: linalg.pack
 // ELEMENTWISE-INPUT: memref.alloc() : memref<1x1x8x16x4x8xi32, 2 : i32>
 // ELEMENTWISE-INPUT: bufferization.to_tensor
-// ELEMENTWISE-INPUT: tensor.pack
+// ELEMENTWISE-INPUT: linalg.pack
 // ELEMENTWISE-INPUT: linalg.generic
 
-// ELEMENTWISE-INPUT-OUTPUT-COUNT-4: tensor.pack
+// ELEMENTWISE-INPUT-OUTPUT-COUNT-4: linalg.pack
 // ELEMENTWISE-INPUT-OUTPUT: linalg.fill
 // ELEMENTWISE-INPUT-OUTPUT: linalg.generic
 // ELEMENTWISE-INPUT-OUTPUT-NOT: memref.alloc
-// ELEMENTWISE-INPUT-OUTPUT: tensor.pack
+// ELEMENTWISE-INPUT-OUTPUT: linalg.pack
 // ELEMENTWISE-INPUT-OUTPUT-NOT: memref.alloc
-// ELEMENTWISE-INPUT-OUTPUT: tensor.pack
+// ELEMENTWISE-INPUT-OUTPUT: linalg.pack
 // ELEMENTWISE-INPUT-OUTPUT: memref.alloc() : memref<1x1x8x16x4x8xi32, 2 : i32>
 // ELEMENTWISE-INPUT-OUTPUT: bufferization.to_tensor
-// ELEMENTWISE-INPUT-OUTPUT: tensor.pack
+// ELEMENTWISE-INPUT-OUTPUT: linalg.pack
 // ELEMENTWISE-INPUT-OUTPUT: memref.alloc() : memref<1x1x8x16x4x8xi32, 2 : i32>
 // ELEMENTWISE-INPUT-OUTPUT: bufferization.to_tensor
-// ELEMENTWISE-INPUT-OUTPUT: tensor.pack
+// ELEMENTWISE-INPUT-OUTPUT: linalg.pack
 // ELEMENTWISE-INPUT-OUTPUT: linalg.generic
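For context: this is a pure renaming change. The pack and unpack ops were moved upstream from the tensor dialect into linalg, so every `tensor.pack`/`tensor.unpack` in the IR and in the FileCheck lines becomes `linalg.pack`/`linalg.unpack` with identical operands, attributes, and semantics. A minimal standalone sketch of what the op computes (the function name and shapes below are illustrative, not taken from this test):

func.func @pack_sketch(%src: tensor<128x256xf32>) -> tensor<4x8x32x32xf32> {
  // Destination in the blocked layout: 128/32 x 256/32 outer dims, 32x32 inner tiles.
  %dest = tensor.empty() : tensor<4x8x32x32xf32>
  // Relayout into contiguous 32x32 blocks: element (i, j) of %src lands at
  // (i floordiv 32, j floordiv 32, i mod 32, j mod 32) of the result.
  %packed = linalg.pack %src inner_dims_pos = [0, 1] inner_tiles = [32, 32]
      into %dest : tensor<128x256xf32> -> tensor<4x8x32x32xf32>
  return %packed : tensor<4x8x32x32xf32>
}

linalg.unpack with the same attributes inverts the layout, which is why the unpack lines above are renamed identically.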