@@ -198,6 +198,48 @@ define amdgpu_kernel void @i64_2d_load_store_subvec_3(ptr %out) {
198198 ret void
199199}
200200
; Loads one <3 x i64> row out of a [2 x [3 x i64]] addrspace(5) alloca through
; a GEP whose indices are i64 (the i32 selector is zext'ed to i64 first), then
; stores element 2 of the loaded vector to %out.
; The assertions below expect no alloca to remain: the two stored rows show up
; as a single <6 x i64> constant, and the loaded row is rebuilt from three
; extractelement / insertelement pairs at indices sel3 * 3 + {0, 1, 2}.
; NOTE(review): the selector can evaluate to 0, 1 or 2, while the outer array
; has only two rows; presumably this deliberately mirrors the sibling
; i64_2d_load_store_subvec_3 test -- confirm.
201+ define amdgpu_kernel void @i64_2d_load_store_subvec_3_i64_offset (ptr %out ) {
202+ ; CHECK-LABEL: define amdgpu_kernel void @i64_2d_load_store_subvec_3_i64_offset(
203+ ; CHECK-SAME: ptr [[OUT:%.*]]) {
204+ ; CHECK-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
205+ ; CHECK-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
206+ ; CHECK-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3
207+ ; CHECK-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3
208+ ; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
209+ ; CHECK-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
210+ ; CHECK-NEXT: [[SEL3:%.*]] = zext i32 [[SEL2]] to i64
211+ ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[SEL3]], 3
212+ ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <6 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5>, i64 [[TMP1]]
213+ ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <3 x i64> poison, i64 [[TMP2]], i64 0
214+ ; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP1]], 1
215+ ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <6 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5>, i64 [[TMP4]]
216+ ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <3 x i64> [[TMP3]], i64 [[TMP5]], i64 1
217+ ; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP1]], 2
218+ ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <6 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5>, i64 [[TMP7]]
219+ ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <3 x i64> [[TMP6]], i64 [[TMP8]], i64 2
220+ ; CHECK-NEXT: [[ELEM:%.*]] = extractelement <3 x i64> [[TMP9]], i32 2
221+ ; CHECK-NEXT: store i64 [[ELEM]], ptr [[OUT]], align 8
222+ ; CHECK-NEXT: ret void
223+ ;
; Row selector: 0 when y >= 3, otherwise 1 when x >= 3, otherwise 2.
224+ %x = tail call i32 @llvm.amdgcn.workitem.id.x ()
225+ %y = tail call i32 @llvm.amdgcn.workitem.id.y ()
226+ %c1 = icmp uge i32 %x , 3
227+ %c2 = icmp uge i32 %y , 3
228+ %sel1 = select i1 %c1 , i32 1 , i32 2
229+ %sel2 = select i1 %c2 , i32 0 , i32 %sel1
; Widen the selector so the dynamic GEP index below has type i64.
230+ %sel3 = zext i32 %sel2 to i64
231+ %alloca = alloca [2 x [3 x i64 ]], align 16 , addrspace (5 )
; Fill row 0 with <0, 1, 2> and row 1 with <3, 4, 5> using whole-row vector stores.
232+ %gep.00 = getelementptr inbounds [2 x [3 x i64 ]], ptr addrspace (5 ) %alloca , i32 0
233+ %gep.01 = getelementptr inbounds [2 x [3 x i64 ]], ptr addrspace (5 ) %alloca , i32 0 , i32 1 , i32 0
234+ store <3 x i64 > <i64 0 , i64 1 , i64 2 >, ptr addrspace (5 ) %gep.00
235+ store <3 x i64 > <i64 3 , i64 4 , i64 5 >, ptr addrspace (5 ) %gep.01
; Dynamically indexed row address; both GEP indices are i64 in this variant.
236+ %gep = getelementptr inbounds [2 x [3 x i64 ]], ptr addrspace (5 ) %alloca , i64 0 , i64 %sel3
237+ %load = load <3 x i64 >, ptr addrspace (5 ) %gep
; Only the last lane of the loaded row is written out.
238+ %elem = extractelement <3 x i64 > %load , i32 2
239+ store i64 %elem , ptr %out
240+ ret void
241+ }
242+
201243define amdgpu_kernel void @i64_2d_load_store_subvec_4 (ptr %out ) {
202244; CHECK-LABEL: define amdgpu_kernel void @i64_2d_load_store_subvec_4(
203245; CHECK-SAME: ptr [[OUT:%.*]]) {
0 commit comments