@@ -54,18 +54,20 @@ func.func @subview_folding_offset(%offset_i: index, %offset_j: index) {
// CHECK: func @test_expand_shape
// CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
func.func @test_expand_shape(%offset_i: index, %offset_j: index) {
- // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3>
+ // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<4096xf16, 3>
// CHECK: %[[MEM:.*]] = memref.alloc() : memref<8192xf16>
// CHECK: %[[C0:.*]] = arith.constant 0 : index
- // CHECK: %[[IDX:.*]] = affine.linearize_index [%[[ARG0]], %[[ARG1]]] by (64, 128) : index
- // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[IDX]]], %[[LOCAL]][%[[C0]], %[[C0]]]
- // CHECK-SAME: vector<8xf16>, memref<8192xf16>, memref<64x64xf16, 3>
+ // CHECK: %[[IDXM:.*]] = affine.linearize_index [%[[ARG0]], %[[ARG1]]] by (64, 128) : index
+ // CHECK: %[[IDXL:.*]] = affine.linearize_index [%[[C0]], %[[C0]]] by (64, 64) : index
+ // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[IDXM]]], %[[LOCAL]][%[[IDXL]]]
+ // CHECK-SAME: vector<8xf16>, memref<8192xf16>, memref<4096xf16, 3>

- %alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace>
+ %alloc = memref.alloc() : memref<4096xf16, #gpu_lds_addrspace>
%mem = memref.alloc() : memref<8192xf16>
- %expand = memref.expand_shape %mem [[0, 1]] output_shape [64, 128] : memref<8192xf16> into memref<64x128xf16>
+ %expand_mem = memref.expand_shape %mem [[0, 1]] output_shape [64, 128] : memref<8192xf16> into memref<64x128xf16>
+ %expand_alloc = memref.expand_shape %alloc [[0, 1]] output_shape [64, 64] : memref<4096xf16, #gpu_lds_addrspace> into memref<64x64xf16, #gpu_lds_addrspace>
%c0 = arith.constant 0 : index
- amdgpu.gather_to_lds %expand[%offset_i, %offset_j], %alloc[%c0, %c0]
+ amdgpu.gather_to_lds %expand_mem[%offset_i, %offset_j], %expand_alloc[%c0, %c0]
: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, #gpu_lds_addrspace>
func.return
}
@@ -80,15 +82,82 @@ func.func @test_collapse_shape(%offset_i: index, %offset_j: index) {
// CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3>
// CHECK: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16>
// CHECK: %[[C0:.*]] = arith.constant 0 : index
- // CHECK: %[[INDICES:.*]]:2 = affine.delinearize_index %[[ARG0]] into (64, 128) : index, index
- // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[INDICES]]#0, %[[INDICES]]#1], %[[LOCAL]][%[[C0]], %[[C0]]]
+ // CHECK: %[[INDICES_MEM:.*]]:2 = affine.delinearize_index %[[ARG0]] into (64, 128) : index, index
+ // CHECK: %[[INDICES_LDS:.*]]:2 = affine.delinearize_index %[[ARG1]] into (64, 64) : index, index
+ // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[INDICES_MEM]]#0, %[[INDICES_MEM]]#1], %[[LOCAL]][%[[INDICES_LDS]]#0, %[[INDICES_LDS]]#1]
// CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, 3>

%alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace>
+ %collapse_alloc = memref.collapse_shape %alloc [[0, 1]] : memref<64x64xf16, #gpu_lds_addrspace> into memref<4096xf16, #gpu_lds_addrspace>
%mem = memref.alloc() : memref<64x128xf16>
- %collapse = memref.collapse_shape %mem [[0, 1]] : memref<64x128xf16> into memref<8192xf16>
+ %collapse_mem = memref.collapse_shape %mem [[0, 1]] : memref<64x128xf16> into memref<8192xf16>
%c0 = arith.constant 0 : index
- amdgpu.gather_to_lds %collapse[%offset_i], %alloc[%c0, %c0]
+ amdgpu.gather_to_lds %collapse_mem[%offset_i], %collapse_alloc[%offset_j]
+ : vector<8xf16>, memref<8192xf16>, memref<4096xf16, #gpu_lds_addrspace>
+ func.return
+ }
+
+
+ // -----
+
+ #gpu_lds_addrspace = 3
+
+
+ // CHECK: func @test_expand_shape_src_raw_buffer
+ // CHECK-SAME: %[[ARG0:.*]]: memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, %[[ARG1:.*]]: index, %[[ARG2:.*]]: index
+ func.func @test_expand_shape_src_raw_buffer(%mem : memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, %offset_i: index, %offset_j: index) {
+ // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<4096xf16, 3>
+ // CHECK: %[[C0:.*]] = arith.constant 0 : index
+ // CHECK: %[[IDXM:.*]] = affine.linearize_index [%[[ARG1]], %[[ARG2]]] by (64, 128) : index
+ // CHECK: amdgpu.gather_to_lds %[[ARG0]][%[[IDXM]]], %[[LOCAL]][%[[C0]]]
+ // CHECK-SAME: vector<8xf16>, memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, memref<4096xf16, 3>
+
+ %alloc = memref.alloc() : memref<4096xf16, #gpu_lds_addrspace>
+ %expand_mem = memref.expand_shape %mem [[0, 1]] output_shape [64, 128] : memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>> into memref<64x128xf16, #amdgpu.address_space<fat_raw_buffer>>
+
+ %c0 = arith.constant 0 : index
+ amdgpu.gather_to_lds %expand_mem[%offset_i, %offset_j], %alloc[%c0]
+ : vector<8xf16>, memref<64x128xf16, #amdgpu.address_space<fat_raw_buffer>>, memref<4096xf16, #gpu_lds_addrspace>
+ func.return
+ }
+
+ // -----
+
+ #gpu_lds_addrspace = 3
+
+ // CHECK: func @test_expand_shape_dst_only
+ // CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
+ func.func @test_expand_shape_dst_only(%offset_i: index, %offset_j: index) {
+ // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<4096xf16, 3>
+ // CHECK: %[[MEM:.*]] = memref.alloc() : memref<8192xf16>
+ // CHECK: %[[C0:.*]] = arith.constant 0 : index
+ // CHECK: %[[IDX_LDS:.*]] = affine.linearize_index [%[[ARG1]], %[[C0]]] by (64, 64) : index
+ // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[ARG0]]], %[[LOCAL]][%[[IDX_LDS]]]
+ // CHECK-SAME: vector<8xf16>, memref<8192xf16>, memref<4096xf16, 3>
+
+ %alloc = memref.alloc() : memref<4096xf16, #gpu_lds_addrspace>
+ %mem = memref.alloc() : memref<8192xf16>
+ %expand_alloc = memref.expand_shape %alloc [[0, 1]] output_shape [64, 64] : memref<4096xf16, #gpu_lds_addrspace> into memref<64x64xf16, #gpu_lds_addrspace>
+
+ %c0 = arith.constant 0 : index
+ amdgpu.gather_to_lds %mem[%offset_i], %expand_alloc[%offset_j, %c0]
: vector<8xf16>, memref<8192xf16>, memref<64x64xf16, #gpu_lds_addrspace>
func.return
}
+
+ // -----
+
+ #gpu_lds_addrspace = 3
+
+ // CHECK: func @test_nop
+ // CHECK-SAME: %[[ARG0:.*]]: memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, %[[ARG1:.*]]: index, %[[ARG2:.*]]: index
+ func.func @test_nop(%mem : memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, %offset_i: index, %offset_j: index) {
+ // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<4096xf16, 3>
+ // CHECK: amdgpu.gather_to_lds %[[ARG0]][%[[ARG1]]], %[[LOCAL]][%[[ARG2]]]
+ // CHECK-SAME: vector<8xf16>, memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, memref<4096xf16, 3>
+
+ %alloc = memref.alloc() : memref<4096xf16, #gpu_lds_addrspace>
+ amdgpu.gather_to_lds %mem[%offset_i], %alloc[%offset_j]
+ : vector<8xf16>, memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, memref<4096xf16, #gpu_lds_addrspace>
+ func.return
+ }