Skip to content
11 changes: 10 additions & 1 deletion llvm/lib/Target/AMDGPU/SOPInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,9 @@ def S_BREV_B64 : SOP1_64 <"s_brev_b64",
} // End isReMaterializable = 1, isAsCheapAsAMove = 1

let Defs = [SCC] in {
def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32">;
def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32",
[(set i32:$sdst, (UniformBinFrag<sub> 32, (ctpop i32:$src0)))]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
[(set i32:$sdst, (UniformBinFrag<sub> 32, (ctpop i32:$src0)))]
[(set i32:$sdst, (UniformBinFrag<sub> 32, (ctpop_oneuse i32:$src0)))]

This will need to be added to AMDGPUInstructions.td:

def ctpop_oneuse : HasOneUseUnaryOp<ctpop>;

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

>;
def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64">;
def S_BCNT1_I32_B32 : SOP1_32 <"s_bcnt1_i32_b32",
[(set i32:$sdst, (UniformUnaryFrag<ctpop> i32:$src0))]
Expand Down Expand Up @@ -1885,6 +1887,13 @@ def : GCNPat <
(S_MOV_B32 (i32 0)), sub1))
>;

// Select S_BCNT0_I32_B64 for (UniformBinFrag<sub> 64, (ctpop i64:$src)),
// i.e. 64 minus the population count of a 64-bit source.
// The matched node is i64, so the result is assembled with REG_SEQUENCE:
// the S_BCNT0_I32_B64 value (copied to SReg_32) goes in sub0 and a zero
// produced by S_MOV_B32 goes in sub1.
def : GCNPat <
(i64 (UniformBinFrag<sub> 64, (ctpop i64:$src))),
(i64 (REG_SEQUENCE SReg_64,
(i32 (COPY_TO_REGCLASS (S_BCNT0_I32_B64 $src), SReg_32)), sub0,
(S_MOV_B32 (i32 0)), sub1))
>;

def : GCNPat <
(i32 (UniformBinFrag<smax> i32:$x, (i32 (ineg i32:$x)))),
(S_ABS_I32 SReg_32:$x)
Expand Down
85 changes: 85 additions & 0 deletions llvm/test/CodeGen/AMDGPU/s_bcnt0.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add a GlobalISel run line to check that the patterns work there too?

Copy link
Contributor Author

@linuxrocks123 linuxrocks123 Oct 31, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jayfoad, the negative tests crash GlobalISel, so I can't add a GlobalISel run line unless I break out the positive tests into a separate file. I'll do that if you like, but I think a better approach would be to file a JIRA issue to look into the crash.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm surprised anything here breaks globalisel. I'd expect this of all things to work better. What is the crash?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@arsenm

LLVM ERROR: unable to map instruction: %8:sreg_32 = COPY %7:vgpr(s32) (in function: bcnt032_not_for_vregs)

The crash only reproduces with the test as it was WITHOUT your suggested changes. I have since updated the tests per your comments, but please let me know if you'd like me to revert that commit to preserve the crashing testcase. (I can add an XFAIL line.)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is failing on the broken VGPR-to-SGPR copy for the asm input. I think this is the GlobalISel-flavored version of the case that the DAG path miscompiles by ignoring the SGPR constraint.


; Negative test for the 32-bit `32 - ctpop(x)` pattern: %val0 is a plain
; (non-inreg) argument, and the expected code uses v_bcnt_u32_b32 followed by
; a VALU compare against 32; s_bcnt0_i32_b32 must not appear.
define i32 @bcnt032_not_for_vregs(i32 %val0) {
; CHECK-LABEL: bcnt032_not_for_vregs:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_bcnt_u32_b32 v0, v0, 0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; CHECK-NEXT: s_setpc_b64 s[30:31]
%result = call i32 @llvm.ctpop.i32(i32 %val0)
%result2 = sub i32 32, %result
%cmp = icmp ne i32 %result2, 0
%zext = zext i1 %cmp to i32
ret i32 %zext
}

; Negative test for the 64-bit `64 - ctpop(x)` pattern: %val0 is a plain
; (non-inreg) i64 argument, and the expected code chains two v_bcnt_u32_b32
; instructions and compares against 64 with v_cmp_ne_u64; s_bcnt0_i32_b64
; must not appear.
define i32 @bcnt064_not_for_vregs(i64 %val0) {
; CHECK-LABEL: bcnt064_not_for_vregs:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_bcnt_u32_b32 v0, v0, 0
; CHECK-NEXT: v_bcnt_u32_b32 v0, v1, v0
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 64, v[0:1]
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; CHECK-NEXT: s_setpc_b64 s[30:31]
%result = call i64 @llvm.ctpop.i64(i64 %val0)
%result2 = sub i64 64, %result
%cmp = icmp ne i64 %result2, 0
%zext = zext i1 %cmp to i32
ret i32 %zext
}

define amdgpu_ps i32 @bcnt032_ctpop_multiple_uses(i32 inreg %val0) {
; CHECK-LABEL: bcnt032_ctpop_multiple_uses:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_bcnt1_i32_b32 s1, s0
; CHECK-NEXT: s_bcnt0_i32_b32 s0, s0
Comment on lines +39 to +40
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure we should do this in the multiple-use case. It's not worse, but it's only trading one instruction for an equivalently good one.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@arsenm I thought so, too, but they're actually not equivalent because of SCC. The multiple use transformation has the potential to get rid of an unnecessary compare instruction because s_bcnt0 will set SCC to 1 when the result is nonzero. Here's the master versus branch diff for bcnt032_ctpop_multiple_uses:

@@ -105,14 +105,13 @@
 	.type	bcnt032_ctpop_multiple_uses,@function
 bcnt032_ctpop_multiple_uses:            ; @bcnt032_ctpop_multiple_uses
 ; %bb.0:
-	s_bcnt1_i32_b32 s0, s0
-	s_sub_i32 s1, 32, s0
-	s_cmp_lg_u32 s1, 0
+	s_bcnt1_i32_b32 s1, s0
+	s_bcnt0_i32_b32 s0, s0
 	;;#ASMSTART
-	; use s0
+	; use s1
 	;;#ASMEND
 	;;#ASMSTART
-	; use s1
+	; use s0
 	;;#ASMEND
 	s_cselect_b64 s[0:1], -1, 0
 	v_cndmask_b32_e64 v0, 0, 1, s[0:1]

bcnt064_ctpop_multiple_uses does not demonstrate an improvement versus master in instruction count, but I think it is possible to save a compare instruction there as well sometimes.

; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s1
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s0
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
; CHECK-NEXT: ; return to shader part epilog
%result = call i32 @llvm.ctpop.i32(i32 %val0)
%result2 = sub i32 32, %result
call void asm "; use $0", "s"(i32 %result)
call void asm "; use $0", "s"(i32 %result2)
%cmp = icmp ne i32 %result2, 0
%zext = zext i1 %cmp to i32
ret i32 %zext
}

; Multiple-use test for the 64-bit pattern: both the ctpop result and
; `64 - ctpop` are consumed by inline asm with an "s" constraint. The expected
; code emits both s_bcnt1_i32_b64 and s_bcnt0_i32_b64, and still contains an
; explicit s_cmp_lg_u64 on the 64-bit result.
define amdgpu_ps i32 @bcnt064_ctpop_multiple_uses(i64 inreg %val0) {
; CHECK-LABEL: bcnt064_ctpop_multiple_uses:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_mov_b32 s3, 0
; CHECK-NEXT: s_bcnt1_i32_b64 s2, s[0:1]
; CHECK-NEXT: s_bcnt0_i32_b64 s0, s[0:1]
; CHECK-NEXT: s_mov_b32 s1, s3
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:1]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[2:3]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ; return to shader part epilog
%result = call i64 @llvm.ctpop.i64(i64 %val0)
%result2 = sub i64 64, %result
call void asm "; use $0", "s"(i64 %result)
call void asm "; use $0", "s"(i64 %result2)
%cmp = icmp ne i64 %result2, 0
%zext = zext i1 %cmp to i32
ret i32 %zext
}
9 changes: 3 additions & 6 deletions llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
Original file line number Diff line number Diff line change
Expand Up @@ -459,9 +459,7 @@ define amdgpu_ps i32 @bfe_u64(i64 inreg %val0) {
define amdgpu_ps i32 @bcnt032(i32 inreg %val0) {
; CHECK-LABEL: bcnt032:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_bcnt1_i32_b32 s0, s0
; CHECK-NEXT: s_sub_i32 s0, 32, s0
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_bcnt0_i32_b32 s0, s0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s0
; CHECK-NEXT: ;;#ASMEND
Expand All @@ -480,9 +478,8 @@ define amdgpu_ps i32 @bcnt032(i32 inreg %val0) {
define amdgpu_ps i32 @bcnt064(i64 inreg %val0) {
; CHECK-LABEL: bcnt064:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
; CHECK-NEXT: s_sub_u32 s0, 64, s0
; CHECK-NEXT: s_subb_u32 s1, 0, 0
; CHECK-NEXT: s_bcnt0_i32_b64 s0, s[0:1]
; CHECK-NEXT: s_mov_b32 s1, 0
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:1]
Expand Down