-
Notifications
You must be signed in to change notification settings - Fork 15.3k
[AMDGPU] Stop optimising readfirstlane in pass AMDGPUUniformIntrinsicCombine #166955
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU] Stop optimising readfirstlane in pass AMDGPUUniformIntrinsicCombine #166955
Conversation
|
@llvm/pr-subscribers-backend-amdgpu Author: Pankaj Dwivedi (PankajDwivedi-25) ChangesI am not very clear about the cause of the issue #166665, but this could be a temporary fix instead of disabling this pass in the pipeline. Patch is 59.70 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/166955.diff 7 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
index 65e6ed9d1d428..9fe2ae9468639 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
@@ -60,7 +60,6 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II,
switch (IID) {
case Intrinsic::amdgcn_permlane64:
- case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_readlane: {
Value *Src = II.getArgOperand(0);
if (isDivergentUseWithNew(II.getOperandUse(0), UI, Tracker))
@@ -124,7 +123,6 @@ static bool runUniformIntrinsicCombine(Function &F, const UniformityInfo &UI) {
switch (II->getIntrinsicID()) {
case Intrinsic::amdgcn_permlane64:
- case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_readlane:
case Intrinsic::amdgcn_ballot:
break;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll
index c962c05d24ad0..5d79696572cf0 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll
@@ -239,7 +239,8 @@ define protected amdgpu_kernel void @trivial_uniform_waterfall(ptr addrspace(1)
; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 [[BALLOT]], 0
; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF:.*]]
; PASS-CHECK: [[IF]]:
-; PASS-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 0, 0
+; PASS-CHECK-NEXT: [[FIRST_ACTIVE_ID:%.*]] = tail call noundef i32 @llvm.amdgcn.readfirstlane.i32(i32 0)
+; PASS-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 0, [[FIRST_ACTIVE_ID]]
; PASS-CHECK-NEXT: br i1 [[IS_FIRST_ACTIVE_ID]], label %[[WORK:.*]], label %[[TAIL]]
; PASS-CHECK: [[WORK]]:
; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
@@ -308,7 +309,8 @@ define protected amdgpu_kernel void @uniform_waterfall(ptr addrspace(1) %out, i3
; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 [[BALLOT]], 0
; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF:.*]]
; PASS-CHECK: [[IF]]:
-; PASS-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 [[MYMASK]], [[MYMASK]]
+; PASS-CHECK-NEXT: [[FIRST_ACTIVE_ID:%.*]] = tail call noundef i32 @llvm.amdgcn.readfirstlane.i32(i32 [[MYMASK]])
+; PASS-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 [[MYMASK]], [[FIRST_ACTIVE_ID]]
; PASS-CHECK-NEXT: br i1 [[IS_FIRST_ACTIVE_ID]], label %[[WORK:.*]], label %[[TAIL]]
; PASS-CHECK: [[WORK]]:
; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
index a7e828c95d69f..402ccd91fed8d 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
@@ -248,12 +248,14 @@ define amdgpu_kernel void @readfirstlane_constant(ptr addrspace(1) %out) {
;
; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_constant(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; PASS-CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 7)
+; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
;
; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_constant(
; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; DCE-CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 7)
+; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
; DCE-CHECK-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readfirstlane(i32 7)
@@ -269,12 +271,14 @@ define amdgpu_kernel void @readfirstlane_with_argument(ptr addrspace(1) %out, i3
;
; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_argument(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]]) #[[ATTR0]] {
-; PASS-CHECK-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[SRC0]])
+; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
;
; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_argument(
; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]]) #[[ATTR0]] {
-; DCE-CHECK-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[SRC0]])
+; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
; DCE-CHECK-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readfirstlane(i32 %src0)
@@ -360,12 +364,16 @@ define amdgpu_kernel void @readfirstlane_with_readfirstlane(ptr addrspace(1) %ou
;
; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readfirstlane(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 5)
+; PASS-CHECK-NEXT: [[V2:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[V1]])
+; PASS-CHECK-NEXT: store i32 [[V2]], ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
;
; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readfirstlane(
; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; DCE-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 5)
+; DCE-CHECK-NEXT: [[V2:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[V1]])
+; DCE-CHECK-NEXT: store i32 [[V2]], ptr addrspace(1) [[OUT]], align 4
; DCE-CHECK-NEXT: ret void
;
%v1 = call i32 @llvm.amdgcn.readfirstlane(i32 5)
@@ -388,7 +396,8 @@ define amdgpu_kernel void @readfirstlane_with_readlane(ptr addrspace(1) %out) {
; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; PASS-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
; PASS-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
-; PASS-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: [[V2:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[V1]])
+; PASS-CHECK-NEXT: store i32 [[V2]], ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
;
; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readlane(
@@ -396,7 +405,8 @@ define amdgpu_kernel void @readfirstlane_with_readlane(ptr addrspace(1) %out) {
; DCE-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; DCE-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
; DCE-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
-; DCE-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: [[V2:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[V1]])
+; DCE-CHECK-NEXT: store i32 [[V2]], ptr addrspace(1) [[OUT]], align 4
; DCE-CHECK-NEXT: ret void
;
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -537,13 +547,15 @@ define amdgpu_kernel void @readfirstlane_random(ptr addrspace(1) %out) {
; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_random(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; PASS-CHECK-NEXT: [[RANDOM:%.*]] = xor i32 123, 456
-; PASS-CHECK-NEXT: store i32 [[RANDOM]], ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[RANDOM]])
+; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
;
; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_random(
; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; DCE-CHECK-NEXT: [[RANDOM:%.*]] = xor i32 123, 456
-; DCE-CHECK-NEXT: store i32 [[RANDOM]], ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[RANDOM]])
+; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
; DCE-CHECK-NEXT: ret void
;
%random = xor i32 123, 456
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll
index b8f084d5f82ad..db32135939a5d 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll
@@ -4,14 +4,24 @@
define amdgpu_gs i32 @main() {
; CHECK-LABEL: main:
; CHECK: ; %bb.0: ; %bb
+; CHECK-NEXT: s_bitcmp1_b32 0, 0
; CHECK-NEXT: s_mov_b32 s0, 0
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; CHECK-NEXT: s_cselect_b32 s1, -1, 0
+; CHECK-NEXT: s_or_saveexec_b32 s2, -1
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT: v_readfirstlane_b32 s1, v0
+; CHECK-NEXT: s_mov_b32 exec_lo, s2
+; CHECK-NEXT: s_or_b32 s0, s0, s1
+; CHECK-NEXT: s_wait_alu 0xfffe
; CHECK-NEXT: s_bitcmp1_b32 s0, 0
; CHECK-NEXT: s_cselect_b32 s0, -1, 0
+; CHECK-NEXT: s_wait_alu 0xfffe
; CHECK-NEXT: s_xor_b32 s0, s0, -1
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: s_wait_alu 0xfffe
+; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT: v_readfirstlane_b32 s0, v1
; CHECK-NEXT: s_wait_alu 0xf1ff
; CHECK-NEXT: ; return to shader part epilog
bb:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
index 02d29909c661c..d1ba892d7f7e1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -396,7 +396,8 @@ define amdgpu_kernel void @test_readfirstlane_imm_f64(ptr addrspace(1) %out) {
;
; CHECK-GISEL-LABEL: test_readfirstlane_imm_f64:
; CHECK-GISEL: ; %bb.0:
-; CHECK-GISEL-NEXT: s_mov_b64 s[0:1], 0x4040000000000000
+; CHECK-GISEL-NEXT: s_mov_b32 s0, 0
+; CHECK-GISEL-NEXT: s_mov_b32 s1, 0x40400000
; CHECK-GISEL-NEXT: ;;#ASMSTART
; CHECK-GISEL-NEXT: ; use s[0:1]
; CHECK-GISEL-NEXT: ;;#ASMEND
@@ -455,13 +456,14 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out
; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_i64:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 32
; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, 32
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-GISEL-NEXT: s_endpgm
@@ -488,13 +490,15 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out
; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_f64:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-GISEL-NEXT: s_mov_b32 s2, 0
; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-GISEL-NEXT: s_mov_b32 s3, 0x40400000
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, 0x40400000
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-GISEL-NEXT: s_endpgm
@@ -584,17 +588,17 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0
; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s3
-; CHECK-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-SDAG-NEXT: s_endpgm
;
; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i64:
@@ -624,17 +628,17 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0
; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s3
-; CHECK-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-SDAG-NEXT: s_endpgm
;
; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_f64:
diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll
index ef96944abef0e..586579fcaeb93 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll
@@ -20,33 +20,38 @@ define void @test() {
; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1
; CHECK-NEXT: .LBB0_3: ; %bb.3
; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT: ; implicit-def: $sgpr4
+; CHECK-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-NEXT: v_readfirstlane_b32 s6, v0
; CHECK-NEXT: s_mov_b64 s[4:5], -1
+; CHECK-NEXT: s_mov_b32 s7, 0
+; CHECK-NEXT: s_cmp_eq_u32 s6, s7
; CHECK-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane
; CHECK-NEXT: v_writelane_b32 v1, s4, 0
; CHECK-NEXT: v_writelane_b32 v1, s5, 1
-; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1
-; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_mov_b64 s[10:11], exec
+; CHECK-NEXT: s_mov_b64 exec, -1
; CHECK-NEXT: v_accvgpr_write_b32 a0, v1 ; Reload Reuse
-; CHECK-NEXT: s_mov_b64 exec, s[8:9]
+; CHECK-NEXT: s_mov_b64 exec, s[10:11]
; CHECK-NEXT: s_cbranch_scc1 .LBB0_5
; CHECK-NEXT: ; %bb.4: ; %bb.4
; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1
+; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1
; CHECK-NEXT: v_accvgpr_read_b32 v1, a0 ; Reload Reuse
-; CHECK-NEXT: s_mov_b64 exec, s[8:9]
+; CHECK-NEXT: s_mov_b64 exec, s[10:11]
; CHECK-NEXT: s_mov_b64 s[4:5], 0
; CHECK-NEXT: v_writelane_b32 v1, s4, 0
; CHECK-NEXT: v_writelane_b32 v1, s5, 1
-; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1
+; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: v_accvgpr_write_b32 a0, v1 ; Reload Reuse
-; CHECK-NEXT: s_mov_b64 exec, s[8:9]
+; CHECK-NEXT: s_mov_b64 exec, s[10:11]
; CHECK-NEXT: .LBB0_5: ; %Flow
; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1
+; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: v_accvgpr_read_b32 v1, a0 ; Reload Reuse
-; CHECK-NEXT: s_mov_b64 exec, s[8:9]
+; CHECK-NEXT: s_mov_b64 exec, s[10:11]
; CHECK-NEXT: v_readlane_b32 s4, v1, 0
; CHECK-NEXT: v_readlane_b32 s5, v1, 1
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
index 364598f7cf6c0..5aafb0f576fb4 100644
--- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
+++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
@@ -31,8 +31,8 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr10
; CHECK-NEXT: [[COPY14:%[0-9]+]]:sgpr_32 = COPY $sgpr8
; CHECK-NEXT: undef [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM [[COPY]], 232, 0 :: (invariant load (s64) from %ir.39, addrspace 4)
- ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %117:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: KILL undef %117:sgpr_128
+ ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %125:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32))
+ ; CHECK-NEXT: KILL undef %125:sgpr_128
; CHECK-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY5]], 4, implicit-def dead $scc
; CHECK-NEXT: [[S_LSHL_B32_1:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 4, implicit-def dead $scc
; CHECK-NEXT: [[S_LSHL_B32_2:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 4, implicit-def dead $scc
@@ -44,85 +44,87 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[S_SUB_I32_1:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM]], 30, implicit-def dead $scc
; CHECK-NEXT: undef [[S_ADD_U32_:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_2]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_]], 16, 0 :: (invariant load (s128) from %ir.71, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_]], 16, 0 :: (invariant load (s128) from %ir.81, addrspace 4)
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM undef %74:sreg_64, 0, 0 :: (invariant load (s128) from `ptr addrspace(4) poison`, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM2:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_]], 64, 0 :: (invariant load (s128) from %ir.88, addrspace 4)
; CHECK-NEXT: KILL undef %74:sreg_64
; CHECK-NEXT: KILL [[S_ADD_U32_]].sub0, [[S_ADD_U32_]].sub1
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_LOAD_DWORDX4_IMM]], 0, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = S_MOV_...
[truncated]
|
You can test this locally with the following command:git diff -U0 --pickaxe-regex -S '([^a-zA-Z0-9#_-]undef([^a-zA-Z0-9_-]|$)|UndefValue::get)' 'HEAD~1' HEAD llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.llThe following files introduce new uses of undef:
Undef is now deprecated and should only be used in the rare cases where no replacement is possible. For example, a load of uninitialized memory yields In tests, avoid using For example, this is considered a bad practice: define void @fn() {
...
br i1 undef, ...
}Please use the following instead: define void @fn(i1 %cond) {
...
br i1 %cond, ...
}Please refer to the Undefined Behavior Manual for more information. |
|
@raiseirql please could you test this? |
| switch (II->getIntrinsicID()) { | ||
| case Intrinsic::amdgcn_permlane64: | ||
| case Intrinsic::amdgcn_readfirstlane: | ||
| case Intrinsic::amdgcn_readlane: |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Pre-existing: it is silly to repeat this list of cases here. You should just change optimizeUniformIntrinsic so that it returns false for unhandled intrinsics.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The intention was to avoid unnecessarily calling the function; I will drop it.
@jayfoad @PankajDwivedi-25 I merged this change to our branch and yes, the codegen looks good and our use case runs without crashes/errors. |
|
Great, I will merge this PR. |
|
@jayfoad can you please approve this if there is nothing else. |
| llvm::Intrinsic::ID IID = II.getIntrinsicID(); | ||
|
|
||
| switch (IID) { | ||
| case Intrinsic::amdgcn_permlane64: |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Suggest adding a comment here saying that we deliberately do not simplify readfirstlane with a uniform argument, so that frontends can use it to force a copy to SGPR and thereby prevent the backend from generating unwanted waterfall loops.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sure, will do that.
jayfoad
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM with nit
I am not very clear about the cause of the issue #166665, but this could be a temporary fix instead of disabling this pass in the pipeline.