@@ -96,3 +96,68 @@ func.func @test_collapse_shape(%offset_i: index, %offset_j: index) {
9696 : vector <8 xf16 >, memref <8192 xf16 >, memref <4096 xf16 , #gpu_lds_addrspace >
9797 func.return
9898}


// -----

#gpu_lds_addrspace = 3

// Verifies that a memref.expand_shape feeding the *source* operand of
// amdgpu.gather_to_lds is folded away: the 2-D indices are relinearized
// with affine.linearize_index against the original flat fat_raw_buffer
// memref, so the gather reads directly from the 1-D source.
// CHECK: func @test_expand_shape_src_raw_buffer
// CHECK-SAME: %[[ARG0:.*]]: memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, %[[ARG1:.*]]: index, %[[ARG2:.*]]: index
func.func @test_expand_shape_src_raw_buffer(%mem : memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, %offset_i: index, %offset_j: index) {
  // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<4096xf16, 3>
  // CHECK: %[[C0:.*]] = arith.constant 0 : index
  // CHECK: %[[IDXM:.*]] = affine.linearize_index [%[[ARG1]], %[[ARG2]]] by (64, 128) : index
  // CHECK: amdgpu.gather_to_lds %[[ARG0]][%[[IDXM]]], %[[LOCAL]][%[[C0]]]
  // CHECK-SAME: vector<8xf16>, memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, memref<4096xf16, 3>

  %alloc = memref.alloc() : memref<4096xf16, #gpu_lds_addrspace>
  %expand_mem = memref.expand_shape %mem [[0, 1]] output_shape [64, 128] : memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>> into memref<64x128xf16, #amdgpu.address_space<fat_raw_buffer>>

  %c0 = arith.constant 0 : index
  amdgpu.gather_to_lds %expand_mem[%offset_i, %offset_j], %alloc[%c0]
    : vector<8xf16>, memref<64x128xf16, #amdgpu.address_space<fat_raw_buffer>>, memref<4096xf16, #gpu_lds_addrspace>
  func.return
}

// -----

#gpu_lds_addrspace = 3

// Verifies that a memref.expand_shape feeding only the *destination* (LDS)
// operand of amdgpu.gather_to_lds is folded away: the 2-D LDS indices are
// relinearized with affine.linearize_index against the flat LDS alloc,
// while the source side is left untouched.
// CHECK: func @test_expand_shape_dst_only
// CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
func.func @test_expand_shape_dst_only(%offset_i: index, %offset_j: index) {
  // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<4096xf16, 3>
  // CHECK: %[[MEM:.*]] = memref.alloc() : memref<8192xf16>
  // CHECK: %[[C0:.*]] = arith.constant 0 : index
  // CHECK: %[[IDX_LDS:.*]] = affine.linearize_index [%[[ARG1]], %[[C0]]] by (64, 64) : index
  // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[ARG0]]], %[[LOCAL]][%[[IDX_LDS]]]
  // CHECK-SAME: vector<8xf16>, memref<8192xf16>, memref<4096xf16, 3>

  %alloc = memref.alloc() : memref<4096xf16, #gpu_lds_addrspace>
  %mem = memref.alloc() : memref<8192xf16>
  %expand_alloc = memref.expand_shape %alloc [[0, 1]] output_shape [64, 64] : memref<4096xf16, #gpu_lds_addrspace> into memref<64x64xf16, #gpu_lds_addrspace>

  %c0 = arith.constant 0 : index
  amdgpu.gather_to_lds %mem[%offset_i], %expand_alloc[%offset_j, %c0]
    : vector<8xf16>, memref<8192xf16>, memref<64x64xf16, #gpu_lds_addrspace>
  func.return
}

// -----

#gpu_lds_addrspace = 3

// Negative test: a gather_to_lds with no reshape on either operand must be
// left unchanged by the pass (no linearize_index is introduced).
// CHECK: func @test_nop
// CHECK-SAME: %[[ARG0:.*]]: memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, %[[ARG1:.*]]: index, %[[ARG2:.*]]: index
func.func @test_nop(%mem : memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, %offset_i: index, %offset_j: index) {
  // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<4096xf16, 3>
  // CHECK: amdgpu.gather_to_lds %[[ARG0]][%[[ARG1]]], %[[LOCAL]][%[[ARG2]]]
  // CHECK-SAME: vector<8xf16>, memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, memref<4096xf16, 3>

  %alloc = memref.alloc() : memref<4096xf16, #gpu_lds_addrspace>
  amdgpu.gather_to_lds %mem[%offset_i], %alloc[%offset_j]
    : vector<8xf16>, memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, memref<4096xf16, #gpu_lds_addrspace>
  func.return
}