@@ -33,6 +33,7 @@ __gpu_kernel void foo() {
3333 __gpu_lane_id ();
3434 __gpu_lane_mask ();
3535 __gpu_read_first_lane_u32 (-1 , -1 );
36+ __gpu_read_first_lane_u64 (-1 , -1 );
3637 __gpu_ballot (-1 , 1 );
3738 __gpu_sync_threads ();
3839 __gpu_sync_lane (-1 );
@@ -64,12 +65,13 @@ __gpu_kernel void foo() {
6465// AMDGPU-NEXT: [[CALL17:%.*]] = call i32 @__gpu_lane_id() #[[ATTR7]]
6566// AMDGPU-NEXT: [[CALL18:%.*]] = call i64 @__gpu_lane_mask() #[[ATTR7]]
6667// AMDGPU-NEXT: [[CALL19:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef -1, i32 noundef -1) #[[ATTR7]]
67- // AMDGPU-NEXT: [[CALL20:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR7]]
68+ // AMDGPU-NEXT: [[CALL20:%.*]] = call i64 @__gpu_read_first_lane_u64(i64 noundef -1, i64 noundef -1) #[[ATTR7]]
69+ // AMDGPU-NEXT: [[CALL21:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR7]]
6870// AMDGPU-NEXT: call void @__gpu_sync_threads() #[[ATTR7]]
6971// AMDGPU-NEXT: call void @__gpu_sync_lane(i64 noundef -1) #[[ATTR7]]
70- // AMDGPU-NEXT: [[CALL21:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0) #[[ATTR7]]
71- // AMDGPU-NEXT: [[CALL22:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR7]]
72- // AMDGPU-NEXT: [[CALL23:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR7]]
72+ // AMDGPU-NEXT: [[CALL22:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0) #[[ATTR7]]
73+ // AMDGPU-NEXT: [[CALL23:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR7]]
74+ // AMDGPU-NEXT: [[CALL24:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR7]]
7375// AMDGPU-NEXT: call void @__gpu_exit() #[[ATTR8:[0-9]+]]
7476// AMDGPU-NEXT: unreachable
7577//
@@ -388,6 +390,43 @@ __gpu_kernel void foo() {
388390// AMDGPU-NEXT: ret i32 [[TMP1]]
389391//
390392//
393+ // AMDGPU-LABEL: define internal i64 @__gpu_read_first_lane_u64(
394+ // AMDGPU-SAME: i64 noundef [[__LANE_MASK:%.*]], i64 noundef [[__X:%.*]]) #[[ATTR0]] {
395+ // AMDGPU-NEXT: [[ENTRY:.*:]]
396+ // AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i64, align 8, addrspace(5)
397+ // AMDGPU-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
398+ // AMDGPU-NEXT: [[__X_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
399+ // AMDGPU-NEXT: [[__HI:%.*]] = alloca i32, align 4, addrspace(5)
400+ // AMDGPU-NEXT: [[__LO:%.*]] = alloca i32, align 4, addrspace(5)
401+ // AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
402+ // AMDGPU-NEXT: [[__LANE_MASK_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__LANE_MASK_ADDR]] to ptr
403+ // AMDGPU-NEXT: [[__X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__X_ADDR]] to ptr
404+ // AMDGPU-NEXT: [[__HI_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__HI]] to ptr
405+ // AMDGPU-NEXT: [[__LO_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__LO]] to ptr
406+ // AMDGPU-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR_ASCAST]], align 8
407+ // AMDGPU-NEXT: store i64 [[__X]], ptr [[__X_ADDR_ASCAST]], align 8
408+ // AMDGPU-NEXT: [[TMP0:%.*]] = load i64, ptr [[__X_ADDR_ASCAST]], align 8
409+ // AMDGPU-NEXT: [[SHR:%.*]] = lshr i64 [[TMP0]], 32
410+ // AMDGPU-NEXT: [[CONV:%.*]] = trunc i64 [[SHR]] to i32
411+ // AMDGPU-NEXT: store i32 [[CONV]], ptr [[__HI_ASCAST]], align 4
412+ // AMDGPU-NEXT: [[TMP1:%.*]] = load i64, ptr [[__X_ADDR_ASCAST]], align 8
413+ // AMDGPU-NEXT: [[AND:%.*]] = and i64 [[TMP1]], 4294967295
414+ // AMDGPU-NEXT: [[CONV1:%.*]] = trunc i64 [[AND]] to i32
415+ // AMDGPU-NEXT: store i32 [[CONV1]], ptr [[__LO_ASCAST]], align 4
416+ // AMDGPU-NEXT: [[TMP2:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_ASCAST]], align 8
417+ // AMDGPU-NEXT: [[TMP3:%.*]] = load i32, ptr [[__HI_ASCAST]], align 4
418+ // AMDGPU-NEXT: [[CALL:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef [[TMP2]], i32 noundef [[TMP3]]) #[[ATTR7]]
419+ // AMDGPU-NEXT: [[CONV2:%.*]] = zext i32 [[CALL]] to i64
420+ // AMDGPU-NEXT: [[SHL:%.*]] = shl i64 [[CONV2]], 32
421+ // AMDGPU-NEXT: [[TMP4:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_ASCAST]], align 8
422+ // AMDGPU-NEXT: [[TMP5:%.*]] = load i32, ptr [[__LO_ASCAST]], align 4
423+ // AMDGPU-NEXT: [[CALL3:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef [[TMP4]], i32 noundef [[TMP5]]) #[[ATTR7]]
424+ // AMDGPU-NEXT: [[CONV4:%.*]] = zext i32 [[CALL3]] to i64
425+ // AMDGPU-NEXT: [[AND5:%.*]] = and i64 [[CONV4]], 4294967295
426+ // AMDGPU-NEXT: [[OR:%.*]] = or i64 [[SHL]], [[AND5]]
427+ // AMDGPU-NEXT: ret i64 [[OR]]
428+ //
429+ //
391430// AMDGPU-LABEL: define internal i64 @__gpu_ballot(
392431// AMDGPU-SAME: i64 noundef [[__LANE_MASK:%.*]], i1 noundef zeroext [[__X:%.*]]) #[[ATTR0]] {
393432// AMDGPU-NEXT: [[ENTRY:.*:]]
@@ -525,12 +564,13 @@ __gpu_kernel void foo() {
525564// NVPTX-NEXT: [[CALL17:%.*]] = call i32 @__gpu_lane_id() #[[ATTR6]]
526565// NVPTX-NEXT: [[CALL18:%.*]] = call i64 @__gpu_lane_mask() #[[ATTR6]]
527566// NVPTX-NEXT: [[CALL19:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef -1, i32 noundef -1) #[[ATTR6]]
528- // NVPTX-NEXT: [[CALL20:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR6]]
567+ // NVPTX-NEXT: [[CALL20:%.*]] = call i64 @__gpu_read_first_lane_u64(i64 noundef -1, i64 noundef -1) #[[ATTR6]]
568+ // NVPTX-NEXT: [[CALL21:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR6]]
529569// NVPTX-NEXT: call void @__gpu_sync_threads() #[[ATTR6]]
530570// NVPTX-NEXT: call void @__gpu_sync_lane(i64 noundef -1) #[[ATTR6]]
531- // NVPTX-NEXT: [[CALL21:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0) #[[ATTR6]]
532- // NVPTX-NEXT: [[CALL22:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR6]]
533- // NVPTX-NEXT: [[CALL23:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR6]]
571+ // NVPTX-NEXT: [[CALL22:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0) #[[ATTR6]]
572+ // NVPTX-NEXT: [[CALL23:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR6]]
573+ // NVPTX-NEXT: [[CALL24:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR6]]
534574// NVPTX-NEXT: call void @__gpu_exit() #[[ATTR7:[0-9]+]]
535575// NVPTX-NEXT: unreachable
536576//
@@ -793,6 +833,37 @@ __gpu_kernel void foo() {
793833// NVPTX-NEXT: ret i32 [[TMP7]]
794834//
795835//
836+ // NVPTX-LABEL: define internal i64 @__gpu_read_first_lane_u64(
837+ // NVPTX-SAME: i64 noundef [[__LANE_MASK:%.*]], i64 noundef [[__X:%.*]]) #[[ATTR0]] {
838+ // NVPTX-NEXT: [[ENTRY:.*:]]
839+ // NVPTX-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8
840+ // NVPTX-NEXT: [[__X_ADDR:%.*]] = alloca i64, align 8
841+ // NVPTX-NEXT: [[__HI:%.*]] = alloca i32, align 4
842+ // NVPTX-NEXT: [[__LO:%.*]] = alloca i32, align 4
843+ // NVPTX-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8
844+ // NVPTX-NEXT: store i64 [[__X]], ptr [[__X_ADDR]], align 8
845+ // NVPTX-NEXT: [[TMP0:%.*]] = load i64, ptr [[__X_ADDR]], align 8
846+ // NVPTX-NEXT: [[SHR:%.*]] = lshr i64 [[TMP0]], 32
847+ // NVPTX-NEXT: [[CONV:%.*]] = trunc i64 [[SHR]] to i32
848+ // NVPTX-NEXT: store i32 [[CONV]], ptr [[__HI]], align 4
849+ // NVPTX-NEXT: [[TMP1:%.*]] = load i64, ptr [[__X_ADDR]], align 8
850+ // NVPTX-NEXT: [[AND:%.*]] = and i64 [[TMP1]], 4294967295
851+ // NVPTX-NEXT: [[CONV1:%.*]] = trunc i64 [[AND]] to i32
852+ // NVPTX-NEXT: store i32 [[CONV1]], ptr [[__LO]], align 4
853+ // NVPTX-NEXT: [[TMP2:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
854+ // NVPTX-NEXT: [[TMP3:%.*]] = load i32, ptr [[__HI]], align 4
855+ // NVPTX-NEXT: [[CALL:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef [[TMP2]], i32 noundef [[TMP3]]) #[[ATTR6]]
856+ // NVPTX-NEXT: [[CONV2:%.*]] = zext i32 [[CALL]] to i64
857+ // NVPTX-NEXT: [[SHL:%.*]] = shl i64 [[CONV2]], 32
858+ // NVPTX-NEXT: [[TMP4:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
859+ // NVPTX-NEXT: [[TMP5:%.*]] = load i32, ptr [[__LO]], align 4
860+ // NVPTX-NEXT: [[CALL3:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef [[TMP4]], i32 noundef [[TMP5]]) #[[ATTR6]]
861+ // NVPTX-NEXT: [[CONV4:%.*]] = zext i32 [[CALL3]] to i64
862+ // NVPTX-NEXT: [[AND5:%.*]] = and i64 [[CONV4]], 4294967295
863+ // NVPTX-NEXT: [[OR:%.*]] = or i64 [[SHL]], [[AND5]]
864+ // NVPTX-NEXT: ret i64 [[OR]]
865+ //
866+ //
796867// NVPTX-LABEL: define internal i64 @__gpu_ballot(
797868// NVPTX-SAME: i64 noundef [[__LANE_MASK:%.*]], i1 noundef zeroext [[__X:%.*]]) #[[ATTR0]] {
798869// NVPTX-NEXT: [[ENTRY:.*:]]
0 commit comments