@@ -294,6 +294,56 @@ define amdgpu_kernel void @i64_2d_load_store_subvec_3_i64_offset(ptr %out) {
294
294
ret void
295
295
}
296
296
297
+ define amdgpu_kernel void @i64_2d_load_store_subvec_3_i64_offset_index (ptr %out ) { ; Loads a <3 x i64> row from a [2 x [3 x i64]] addrspace(5) alloca at a runtime-selected row index and stores its last element to %out.
298
+ ; CHECK-LABEL: define amdgpu_kernel void @i64_2d_load_store_subvec_3_i64_offset_index(
299
+ ; CHECK-SAME: ptr [[OUT:%.*]]) {
300
+ ; CHECK-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
301
+ ; CHECK-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
302
+ ; CHECK-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3
303
+ ; CHECK-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3
304
+ ; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
305
+ ; CHECK-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
306
+ ; CHECK-NEXT: [[SEL3:%.*]] = zext i32 [[SEL2]] to i64
307
+ ; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <6 x i64> poison
308
+ ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <6 x i64> [[ALLOCA]], i64 0, i32 0
309
+ ; CHECK-NEXT: [[TMP12:%.*]] = insertelement <6 x i64> [[TMP11]], i64 1, i32 1
310
+ ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <6 x i64> [[TMP12]], i64 2, i32 2
311
+ ; CHECK-NEXT: [[TMP14:%.*]] = insertelement <6 x i64> [[TMP13]], i64 3, i32 3
312
+ ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <6 x i64> [[TMP14]], i64 4, i32 4
313
+ ; CHECK-NEXT: [[TMP16:%.*]] = insertelement <6 x i64> [[TMP15]], i64 5, i32 5
314
+ ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[SEL3]], 3
315
+ ; CHECK-NEXT: [[TMP2:%.*]] = add i64 6, [[TMP1]]
316
+ ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <6 x i64> [[TMP16]], i64 [[TMP2]]
317
+ ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <3 x i64> poison, i64 [[TMP3]], i64 0
318
+ ; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP2]], 1
319
+ ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <6 x i64> [[TMP16]], i64 [[TMP5]]
320
+ ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <3 x i64> [[TMP4]], i64 [[TMP6]], i64 1
321
+ ; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP2]], 2
322
+ ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <6 x i64> [[TMP16]], i64 [[TMP8]]
323
+ ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <3 x i64> [[TMP7]], i64 [[TMP9]], i64 2
324
+ ; CHECK-NEXT: [[ELEM:%.*]] = extractelement <3 x i64> [[TMP10]], i32 2
325
+ ; CHECK-NEXT: store i64 [[ELEM]], ptr [[OUT]], align 8
326
+ ; CHECK-NEXT: ret void
327
+ ;
328
+ %x = tail call i32 @llvm.amdgcn.workitem.id.x ()
329
+ %y = tail call i32 @llvm.amdgcn.workitem.id.y ()
330
+ %c1 = icmp uge i32 %x , 3
331
+ %c2 = icmp uge i32 %y , 3
332
+ %sel1 = select i1 %c1 , i32 1 , i32 2 ; 1 when %x >= 3 (unsigned), otherwise 2
333
+ %sel2 = select i1 %c2 , i32 0 , i32 %sel1 ; 0 when %y >= 3 (unsigned), otherwise %sel1; result is 0, 1 or 2
334
+ %sel3 = zext i32 %sel2 to i64 ; widened so the dynamic GEP index below is i64
335
+ %alloca = alloca [2 x [3 x i64 ]], align 16 , addrspace (5 ) ; 2 rows of 3 x i64 = 6 elements in total
336
+ %gep.00 = getelementptr inbounds [2 x [3 x i64 ]], ptr addrspace (5 ) %alloca , i32 0 ; single index 0: same address as %alloca (start of row 0)
337
+ %gep.01 = getelementptr inbounds [2 x [3 x i64 ]], ptr addrspace (5 ) %alloca , i32 0 , i32 1 , i32 0 ; row 1, element 0
338
+ store <3 x i64 > <i64 0 , i64 1 , i64 2 >, ptr addrspace (5 ) %gep.00 ; fill row 0
339
+ store <3 x i64 > <i64 3 , i64 4 , i64 5 >, ptr addrspace (5 ) %gep.01 ; fill row 1
340
+ %gep = getelementptr inbounds [2 x [3 x i64 ]], ptr addrspace (5 ) %alloca , i64 1 , i64 %sel3 ; leading index 1 steps over one whole [2 x [3 x i64]] (6 elements) before %sel3 picks a row; the expected output computes the element index as 6 + 3 * %sel3
341
+ %load = load <3 x i64 >, ptr addrspace (5 ) %gep ; NOTE(review): with %sel3 in {0, 1, 2} the index is 6, 9 or 12, i.e. outside the 6-element alloca -- confirm this out-of-range access is intentional
342
+ %elem = extractelement <3 x i64 > %load , i32 2 ; last element of the loaded row
343
+ store i64 %elem , ptr %out
344
+ ret void
345
+ }
346
+
297
347
define amdgpu_kernel void @i64_2d_load_store_subvec_4 (ptr %out ) {
298
348
; CHECK-LABEL: define amdgpu_kernel void @i64_2d_load_store_subvec_4(
299
349
; CHECK-SAME: ptr [[OUT:%.*]]) {
0 commit comments