Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 4 additions & 13 deletions llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,10 +57,11 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II,
const UniformityInfo &UI,
ValueMap<const Value *, bool> &Tracker) {
llvm::Intrinsic::ID IID = II.getIntrinsicID();

// We deliberately do not simplify readfirstlane with a uniform argument, so
// that frontends can use it to force a copy to SGPR and thereby prevent the
// backend from generating unwanted waterfall loops.
switch (IID) {
case Intrinsic::amdgcn_permlane64:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggest adding a comment here saying that we deliberately do not simplify readfirstlane with a uniform argument, so that frontends can use it to force a copy to SGPR and thereby prevent the backend from generating unwanted waterfall loops.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, will do that.

case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_readlane: {
Value *Src = II.getArgOperand(0);
if (isDivergentUseWithNew(II.getOperandUse(0), UI, Tracker))
Expand Down Expand Up @@ -107,7 +108,7 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II,
return Changed;
}
default:
llvm_unreachable("Unexpected intrinsic ID in optimizeUniformIntrinsic");
return false;
}
return false;
}
Expand All @@ -121,16 +122,6 @@ static bool runUniformIntrinsicCombine(Function &F, const UniformityInfo &UI) {
auto *II = dyn_cast<IntrinsicInst>(&I);
if (!II)
continue;

switch (II->getIntrinsicID()) {
case Intrinsic::amdgcn_permlane64:
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_readlane:
case Intrinsic::amdgcn_ballot:
break;
default:
continue;
}
IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker);
}
return IsChanged;
Expand Down
6 changes: 4 additions & 2 deletions llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,8 @@ define protected amdgpu_kernel void @trivial_uniform_waterfall(ptr addrspace(1)
; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 [[BALLOT]], 0
; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF:.*]]
; PASS-CHECK: [[IF]]:
; PASS-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 0, 0
; PASS-CHECK-NEXT: [[FIRST_ACTIVE_ID:%.*]] = tail call noundef i32 @llvm.amdgcn.readfirstlane.i32(i32 0)
; PASS-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 0, [[FIRST_ACTIVE_ID]]
; PASS-CHECK-NEXT: br i1 [[IS_FIRST_ACTIVE_ID]], label %[[WORK:.*]], label %[[TAIL]]
; PASS-CHECK: [[WORK]]:
; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
Expand Down Expand Up @@ -308,7 +309,8 @@ define protected amdgpu_kernel void @uniform_waterfall(ptr addrspace(1) %out, i3
; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 [[BALLOT]], 0
; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF:.*]]
; PASS-CHECK: [[IF]]:
; PASS-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 [[MYMASK]], [[MYMASK]]
; PASS-CHECK-NEXT: [[FIRST_ACTIVE_ID:%.*]] = tail call noundef i32 @llvm.amdgcn.readfirstlane.i32(i32 [[MYMASK]])
; PASS-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 [[MYMASK]], [[FIRST_ACTIVE_ID]]
; PASS-CHECK-NEXT: br i1 [[IS_FIRST_ACTIVE_ID]], label %[[WORK:.*]], label %[[TAIL]]
; PASS-CHECK: [[WORK]]:
; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
Expand Down
32 changes: 22 additions & 10 deletions llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
Original file line number Diff line number Diff line change
Expand Up @@ -248,12 +248,14 @@ define amdgpu_kernel void @readfirstlane_constant(ptr addrspace(1) %out) {
;
; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_constant(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; PASS-CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 7)
; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
;
; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_constant(
; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; DCE-CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 7)
; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
; DCE-CHECK-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readfirstlane(i32 7)
Expand All @@ -269,12 +271,14 @@ define amdgpu_kernel void @readfirstlane_with_argument(ptr addrspace(1) %out, i3
;
; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_argument(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]]) #[[ATTR0]] {
; PASS-CHECK-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[SRC0]])
; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
;
; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_argument(
; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]]) #[[ATTR0]] {
; DCE-CHECK-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[SRC0]])
; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
; DCE-CHECK-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readfirstlane(i32 %src0)
Expand Down Expand Up @@ -360,12 +364,16 @@ define amdgpu_kernel void @readfirstlane_with_readfirstlane(ptr addrspace(1) %ou
;
; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readfirstlane(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 5)
; PASS-CHECK-NEXT: [[V2:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[V1]])
; PASS-CHECK-NEXT: store i32 [[V2]], ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
;
; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readfirstlane(
; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; DCE-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
; DCE-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 5)
; DCE-CHECK-NEXT: [[V2:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[V1]])
; DCE-CHECK-NEXT: store i32 [[V2]], ptr addrspace(1) [[OUT]], align 4
; DCE-CHECK-NEXT: ret void
;
%v1 = call i32 @llvm.amdgcn.readfirstlane(i32 5)
Expand All @@ -388,15 +396,17 @@ define amdgpu_kernel void @readfirstlane_with_readlane(ptr addrspace(1) %out) {
; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; PASS-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
; PASS-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
; PASS-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: [[V2:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[V1]])
; PASS-CHECK-NEXT: store i32 [[V2]], ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
;
; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readlane(
; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; DCE-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; DCE-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
; DCE-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
; DCE-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
; DCE-CHECK-NEXT: [[V2:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[V1]])
; DCE-CHECK-NEXT: store i32 [[V2]], ptr addrspace(1) [[OUT]], align 4
; DCE-CHECK-NEXT: ret void
;
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
Expand Down Expand Up @@ -537,13 +547,15 @@ define amdgpu_kernel void @readfirstlane_random(ptr addrspace(1) %out) {
; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_random(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; PASS-CHECK-NEXT: [[RANDOM:%.*]] = xor i32 123, 456
; PASS-CHECK-NEXT: store i32 [[RANDOM]], ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[RANDOM]])
; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
;
; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_random(
; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; DCE-CHECK-NEXT: [[RANDOM:%.*]] = xor i32 123, 456
; DCE-CHECK-NEXT: store i32 [[RANDOM]], ptr addrspace(1) [[OUT]], align 4
; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[RANDOM]])
; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
; DCE-CHECK-NEXT: ret void
;
%random = xor i32 123, 456
Expand Down
18 changes: 14 additions & 4 deletions llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,24 @@
define amdgpu_gs i32 @main() {
; CHECK-LABEL: main:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_bitcmp1_b32 0, 0
; CHECK-NEXT: s_mov_b32 s0, 0
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; CHECK-NEXT: s_cselect_b32 s1, -1, 0
; CHECK-NEXT: s_or_saveexec_b32 s2, -1
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
; CHECK-NEXT: v_readfirstlane_b32 s1, v0
; CHECK-NEXT: s_mov_b32 exec_lo, s2
; CHECK-NEXT: s_or_b32 s0, s0, s1
; CHECK-NEXT: s_wait_alu 0xfffe
; CHECK-NEXT: s_bitcmp1_b32 s0, 0
; CHECK-NEXT: s_cselect_b32 s0, -1, 0
; CHECK-NEXT: s_wait_alu 0xfffe
; CHECK-NEXT: s_xor_b32 s0, s0, -1
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
; CHECK-NEXT: s_wait_alu 0xfffe
; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
; CHECK-NEXT: v_readfirstlane_b32 s0, v1
; CHECK-NEXT: s_wait_alu 0xf1ff
; CHECK-NEXT: ; return to shader part epilog
bb:
Expand Down
48 changes: 26 additions & 22 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
Original file line number Diff line number Diff line change
Expand Up @@ -396,7 +396,8 @@ define amdgpu_kernel void @test_readfirstlane_imm_f64(ptr addrspace(1) %out) {
;
; CHECK-GISEL-LABEL: test_readfirstlane_imm_f64:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_mov_b64 s[0:1], 0x4040000000000000
; CHECK-GISEL-NEXT: s_mov_b32 s0, 0
; CHECK-GISEL-NEXT: s_mov_b32 s1, 0x40400000
; CHECK-GISEL-NEXT: ;;#ASMSTART
; CHECK-GISEL-NEXT: ; use s[0:1]
; CHECK-GISEL-NEXT: ;;#ASMEND
Expand Down Expand Up @@ -455,13 +456,14 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out
; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_i64:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 32
; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, 32
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, 0
; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-GISEL-NEXT: s_endpgm
Expand All @@ -488,13 +490,15 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out
; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_f64:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CHECK-GISEL-NEXT: s_mov_b32 s2, 0
; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, 0
; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: s_mov_b32 s3, 0x40400000
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, 0x40400000
; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-GISEL-NEXT: s_endpgm
Expand Down Expand Up @@ -584,17 +588,17 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s3
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-SDAG-NEXT: s_endpgm
;
; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i64:
Expand Down Expand Up @@ -624,17 +628,17 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s3
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-SDAG-NEXT: s_endpgm
;
; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_f64:
Expand Down
23 changes: 14 additions & 9 deletions llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll
Original file line number Diff line number Diff line change
Expand Up @@ -20,33 +20,38 @@ define void @test() {
; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1
; CHECK-NEXT: .LBB0_3: ; %bb.3
; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1
; CHECK-NEXT: ; implicit-def: $sgpr4
; CHECK-NEXT: v_mov_b32_e32 v0, s4
; CHECK-NEXT: v_readfirstlane_b32 s6, v0
; CHECK-NEXT: s_mov_b64 s[4:5], -1
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: s_cmp_eq_u32 s6, s7
; CHECK-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane
; CHECK-NEXT: v_writelane_b32 v1, s4, 0
; CHECK-NEXT: v_writelane_b32 v1, s5, 1
; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: s_mov_b64 s[10:11], exec
; CHECK-NEXT: s_mov_b64 exec, -1
; CHECK-NEXT: v_accvgpr_write_b32 a0, v1 ; Reload Reuse
; CHECK-NEXT: s_mov_b64 exec, s[8:9]
; CHECK-NEXT: s_mov_b64 exec, s[10:11]
; CHECK-NEXT: s_cbranch_scc1 .LBB0_5
; CHECK-NEXT: ; %bb.4: ; %bb.4
; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1
; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1
; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1
; CHECK-NEXT: v_accvgpr_read_b32 v1, a0 ; Reload Reuse
; CHECK-NEXT: s_mov_b64 exec, s[8:9]
; CHECK-NEXT: s_mov_b64 exec, s[10:11]
; CHECK-NEXT: s_mov_b64 s[4:5], 0
; CHECK-NEXT: v_writelane_b32 v1, s4, 0
; CHECK-NEXT: v_writelane_b32 v1, s5, 1
; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1
; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: v_accvgpr_write_b32 a0, v1 ; Reload Reuse
; CHECK-NEXT: s_mov_b64 exec, s[8:9]
; CHECK-NEXT: s_mov_b64 exec, s[10:11]
; CHECK-NEXT: .LBB0_5: ; %Flow
; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1
; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1
; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: v_accvgpr_read_b32 v1, a0 ; Reload Reuse
; CHECK-NEXT: s_mov_b64 exec, s[8:9]
; CHECK-NEXT: s_mov_b64 exec, s[10:11]
; CHECK-NEXT: v_readlane_b32 s4, v1, 0
; CHECK-NEXT: v_readlane_b32 s5, v1, 1
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
Expand Down
Loading
Loading