-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[AMDGPU][NFC] Pre-commit test for redundant s_cmp_lg_* sX, 0 removal #162351
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
Signed-off-by: John Lu <[email protected]>
@llvm/pr-subscribers-backend-amdgpu Author: None (LU-JOHN) Changes: Pre-commit test for redundant s_cmp_lg_* sX, 0 removal. Patch is 20.75 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/162351.diff — 1 file affected:
diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
new file mode 100644
index 0000000000000..8dc846c862200
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
@@ -0,0 +1,585 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck %s
+
+declare i32 @llvm.ctpop.i32(i32)
+declare i64 @llvm.ctpop.i64(i64)
+declare i32 @llvm.amdgcn.s.quadmask.i32(i32)
+declare i64 @llvm.amdgcn.s.quadmask.i64(i64)
+
+define amdgpu_ps i32 @shl32(i32 inreg %val0, i32 inreg %val1) {
+; CHECK-LABEL: shl32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_lshl_b32 s0, s0, s1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %result = shl i32 %val0, %val1
+ %cmp = icmp ne i32 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @shl64(i64 inreg %val0, i64 inreg %val1) {
+; CHECK-LABEL: shl64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %result = shl i64 %val0, %val1
+ %cmp = icmp ne i64 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @lshr32(i32 inreg %val0, i32 inreg %val1) {
+; CHECK-LABEL: lshr32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_lshr_b32 s0, s0, s1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %result = lshr i32 %val0, %val1
+ %cmp = icmp ne i32 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @lshr64(i64 inreg %val0, i64 inreg %val1) {
+; CHECK-LABEL: lshr64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %result = lshr i64 %val0, %val1
+ %cmp = icmp ne i64 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @and32(i32 inreg %val0, i32 inreg %val1) {
+; CHECK-LABEL: and32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_and_b32 s0, s0, s1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %result = and i32 %val0, %val1
+ %cmp = icmp ne i32 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @and64(i64 inreg %val0, i64 inreg %val1) {
+; CHECK-LABEL: and64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %result = and i64 %val0, %val1
+ %cmp = icmp ne i64 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @or32(i32 inreg %val0, i32 inreg %val1) {
+; CHECK-LABEL: or32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_or_b32 s0, s0, s1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %result = or i32 %val0, %val1
+ %cmp = icmp ne i32 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @or64(i64 inreg %val0, i64 inreg %val1) {
+; CHECK-LABEL: or64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %result = or i64 %val0, %val1
+ %cmp = icmp ne i64 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @xor32(i32 inreg %val0, i32 inreg %val1) {
+; CHECK-LABEL: xor32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_xor_b32 s0, s0, s1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %result = xor i32 %val0, %val1
+ %cmp = icmp ne i32 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @xor64(i64 inreg %val0, i64 inreg %val1) {
+; CHECK-LABEL: xor64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %result = xor i64 %val0, %val1
+ %cmp = icmp ne i64 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @nand32(i32 inreg %val0, i32 inreg %val1, ptr addrspace(1) %ptr) {
+; CHECK-LABEL: nand32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_nand_b32 s0, s0, s1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: global_store_dword v[0:1], v2, off
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
+ %result = and i32 %val0, %val1
+ %result2 = xor i32 %result, -1
+ store i32 %result2, ptr addrspace(1) %ptr
+ %cmp = icmp ne i32 %result2, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @nand64(i64 inreg %val0, i64 inreg %val1, ptr addrspace(1) %ptr) {
+; CHECK-LABEL: nand64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_nand_b64 s[0:1], s[0:1], s[2:3]
+; CHECK-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
+ %result = and i64 %val0, %val1
+ %result2 = xor i64 %result, -1
+ store i64 %result2, ptr addrspace(1) %ptr
+ %cmp = icmp ne i64 %result2, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @nor32(i32 inreg %val0, i32 inreg %val1, ptr addrspace(1) %ptr) {
+; CHECK-LABEL: nor32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_nor_b32 s0, s0, s1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: global_store_dword v[0:1], v2, off
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
+ %result = or i32 %val0, %val1
+ %result2 = xor i32 %result, -1
+ store i32 %result2, ptr addrspace(1) %ptr
+ %cmp = icmp ne i32 %result2, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @nor64(i64 inreg %val0, i64 inreg %val1, ptr addrspace(1) %ptr) {
+; CHECK-LABEL: nor64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_nor_b64 s[0:1], s[0:1], s[2:3]
+; CHECK-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
+ %result = or i64 %val0, %val1
+ %result2 = xor i64 %result, -1
+ store i64 %result2, ptr addrspace(1) %ptr
+ %cmp = icmp ne i64 %result2, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @xnor32(i32 inreg %val0, i32 inreg %val1, ptr addrspace(1) %ptr) {
+; CHECK-LABEL: xnor32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_xnor_b32 s0, s0, s1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: global_store_dword v[0:1], v2, off
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
+ %result = xor i32 %val0, %val1
+ %result2 = xor i32 %result, -1
+ store i32 %result2, ptr addrspace(1) %ptr
+ %cmp = icmp ne i32 %result2, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @xnor64(i64 inreg %val0, i64 inreg %val1, ptr addrspace(1) %ptr) {
+; CHECK-LABEL: xnor64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_xnor_b64 s[0:1], s[0:1], s[2:3]
+; CHECK-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
+ %result = xor i64 %val0, %val1
+ %result2 = xor i64 %result, -1
+ store i64 %result2, ptr addrspace(1) %ptr
+ %cmp = icmp ne i64 %result2, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @andn232(i32 inreg %val0, i32 inreg %val1) {
+; CHECK-LABEL: andn232:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_andn2_b32 s0, s0, s1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %nval1 = xor i32 %val1, -1
+ %result = and i32 %val0, %nval1
+ %cmp = icmp ne i32 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @nandn264(i64 inreg %val0, i64 inreg %val1) {
+; CHECK-LABEL: nandn264:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %nval1 = xor i64 %val1, -1
+ %result = and i64 %val0, %nval1
+ %cmp = icmp ne i64 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @orn232(i32 inreg %val0, i32 inreg %val1) {
+; CHECK-LABEL: orn232:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_orn2_b32 s0, s0, s1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %nval1 = xor i32 %val1, -1
+ %result = or i32 %val0, %nval1
+ %cmp = icmp ne i32 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @orn264(i64 inreg %val0, i64 inreg %val1) {
+; CHECK-LABEL: orn264:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_orn2_b64 s[0:1], s[0:1], s[2:3]
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %nval1 = xor i64 %val1, -1
+ %result = or i64 %val0, %nval1
+ %cmp = icmp ne i64 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @bfe_i32(i32 inreg %val0) {
+; CHECK-LABEL: bfe_i32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_bfe_i32 s0, s0, 0x80010
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %shl = shl i32 %val0, 8
+ %result = ashr i32 %shl, 24
+ %cmp = icmp ne i32 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @bfe_i64(i64 inreg %val0, ptr addrspace(1) %ptr) {
+; CHECK-LABEL: bfe_i64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_bfe_i64 s[2:3], s[0:1], 0x80000
+; CHECK-NEXT: s_and_b32 s0, s0, 0xff
+; CHECK-NEXT: s_mov_b32 s1, 0
+; CHECK-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT: v_mov_b32_e32 v3, s3
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
+ %shl = shl i64 %val0, 56
+ %result = ashr i64 %shl, 56
+ store i64 %result, ptr addrspace(1) %ptr
+ %cmp = icmp ne i64 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @bfe_u32(i32 inreg %val0) {
+; CHECK-LABEL: bfe_u32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_bfe_u32 s0, s0, 0x80010
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %shl = shl i32 %val0, 8
+ %result = lshr i32 %shl, 24
+ %cmp = icmp ne i32 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @bfe_u64(i64 inreg %val0, ptr addrspace(1) %ptr) {
+; CHECK-LABEL: bfe_u64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_and_b32 s0, s0, 0xff
+; CHECK-NEXT: s_mov_b32 s1, 0
+; CHECK-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
+ %shl = shl i64 %val0, 56
+ %result = lshr i64 %shl, 56
+ store i64 %result, ptr addrspace(1) %ptr
+ %cmp = icmp ne i64 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @bcnt032(i32 inreg %val0, ptr addrspace(1) %ptr) {
+; CHECK-LABEL: bcnt032:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_bcnt1_i32_b32 s0, s0
+; CHECK-NEXT: s_sub_i32 s0, 32, s0
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: global_store_dword v[0:1], v2, off
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
+ %result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
+ %result2 = sub i32 32, %result
+ store i32 %result2, ptr addrspace(1) %ptr
+ %cmp = icmp ne i32 %result2, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @bcnt064(i64 inreg %val0, ptr addrspace(1) %ptr) {
+; CHECK-LABEL: bcnt064:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; CHECK-NEXT: s_sub_u32 s0, 64, s0
+; CHECK-NEXT: s_subb_u32 s1, 0, 0
+; CHECK-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
+ %result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone
+ %result2 = sub i64 64, %result
+ store i64 %result2, ptr addrspace(1) %ptr
+ %cmp = icmp ne i64 %result2, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @bcnt132(i32 inreg %val0, ptr addrspace(1) %ptr) {
+; CHECK-LABEL: bcnt132:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_bcnt1_i32_b32 s0, s0
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: global_store_dword v[0:1], v2, off
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
+ %result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
+ store i32 %result, ptr addrspace(1) %ptr
+ %cmp = icmp ne i32 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @bcnt164(i64 inreg %val0, ptr addrspace(1) %ptr) {
+; CHECK-LABEL: bcnt164:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; CHECK-NEXT: s_mov_b32 s1, 0
+; CHECK-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
+ %result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone
+ store i64 %result, ptr addrspace(1) %ptr
+ %cmp = icmp ne i64 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @quadmask32(i32 inreg %val0, ptr addrspace(1) %ptr) {
+; CHECK-LABEL: quadmask32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_quadmask_b32 s0, s0
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: global_store_dword v[0:1], v2, off
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
+ %result = call i32 @llvm.amdgcn.s.quadmask.i32(i32 %val0) nounwind readnone
+ store i32 %result, ptr addrspace(1) %ptr
+ %cmp = icmp ne i32 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @quadmask64(i64 inreg %val0, ptr addrspace(1) %ptr) {
+; CHECK-LABEL: quadmask64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_quadmask_b64 s[0:1], s[0:1]
+; CHECK-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
+ %result = call i64 @llvm.amdgcn.s.quadmask.i64(i64 %val0) nounwind readnone
+ store i64 %result, ptr addrspace(1) %ptr
+ %cmp = icmp ne i64 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @not32(i32 inreg %val0, ptr addrspace(1) %ptr) {
+; CHECK-LABEL: not32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_not_b32 s0, s0
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: global_store_dword v[0:1], v2, off
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
+ %result = xor i32 %val0, -1
+ store i32 %result, ptr addrspace(1) %ptr
+ %cmp = icmp ne i32 %result, 0
+ %zext = zext...
[truncated]
|
Deleting redundant s_cmp_lg* instruction done in #162352. |
; CHECK-NEXT:    ; return to shader part epilog
  %result = and i32 %val0, %val1
  %result2 = xor i32 %result, -1
  store i32 %result2, ptr addrspace(1) %ptr
Review comment (arsenm): Using VALU stores for tests for scalar handling is suspicious — are these just for adding an extra use?
Author reply (LU-JOHN): The stores are done to ensure the compiler cannot optimize the operation into another instruction.
Signed-off-by: John Lu <[email protected]>
Pre-commit test for redundant s_cmp_lg_* sX, 0 removal.