From 96825875e1c8bad9a9b3b28fe943a5469f94c575 Mon Sep 17 00:00:00 2001 From: Patrick Simmons Date: Thu, 23 Oct 2025 11:50:32 -0500 Subject: [PATCH 01/13] Initial work --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 3 ++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 8 ++++ .../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 43 +++++++++++++++++++ llvm/lib/Target/AMDGPU/SOPInstructions.td | 8 +++- 4 files changed, 60 insertions(+), 2 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 36cb527a9c806..053f2f1c7d19e 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -63,6 +63,9 @@ BUILTIN(__builtin_amdgcn_grid_size_z, "Ui", "nc") BUILTIN(__builtin_amdgcn_mbcnt_hi, "UiUiUi", "nc") BUILTIN(__builtin_amdgcn_mbcnt_lo, "UiUiUi", "nc") +BUILTIN(__builtin_amdgcn_bcnt032_lo, "UiUi", "nc") +BUILTIN(__builtin_amdgcn_bcnt064_lo, "UiWUi", "nc") + TARGET_BUILTIN(__builtin_amdgcn_s_memtime, "WUi", "n", "s-memtime-inst") //===----------------------------------------------------------------------===// diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 8e35109061792..a6ae156b601b1 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2359,6 +2359,14 @@ def int_amdgcn_mbcnt_hi : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; +def int_amdgcn_bcnt032_lo : + ClangBuiltin<"__builtin_amdgcn_bcnt032_lo">, + DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; + +def int_amdgcn_bcnt064_lo : + ClangBuiltin<"__builtin_amdgcn_bcnt064_lo">, + DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>; + // llvm.amdgcn.ds.swizzle src offset def int_amdgcn_ds_swizzle : ClangBuiltin<"__builtin_amdgcn_ds_swizzle">, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 8e35ba77d69aa..39b558694edf8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -26,6 +26,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/ValueHandle.h" @@ -35,6 +36,7 @@ #include "llvm/Support/KnownFPClass.h" #include "llvm/Transforms/Utils/IntegerDivision.h" #include "llvm/Transforms/Utils/Local.h" +#include #define DEBUG_TYPE "amdgpu-codegenprepare" @@ -93,6 +95,13 @@ static cl::opt DisableFDivExpand( cl::ReallyHidden, cl::init(false)); +// Disable processing of fdiv so we can better test the backend implementations. +static cl::opt + DisableBcnt0("amdgpu-codegenprepare-disable-bcnt0", + cl::desc("Prevent transforming bitsin(typeof(x)) - " + "popcount(x) to bcnt0(x) in AMDGPUCodeGenPrepare"), + cl::ReallyHidden, cl::init(false)); + class AMDGPUCodeGenPrepareImpl : public InstVisitor { public: @@ -258,6 +267,7 @@ class AMDGPUCodeGenPrepareImpl bool visitAddrSpaceCastInst(AddrSpaceCastInst &I); bool visitIntrinsicInst(IntrinsicInst &I); + bool visitCtpop(IntrinsicInst &I); bool visitFMinLike(IntrinsicInst &I); bool visitSqrt(IntrinsicInst &I); bool run(); @@ -1910,6 +1920,8 @@ bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) { return visitFMinLike(I); case Intrinsic::sqrt: return visitSqrt(I); + case Intrinsic::ctpop: + return visitCtpop(I); default: return false; } @@ -1977,6 +1989,37 @@ Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder, return insertValues(Builder, FractArg->getType(), ResultVals); } +bool AMDGPUCodeGenPrepareImpl::visitCtpop(IntrinsicInst &I) { + uint32_t BitWidth, DestinationWidth, IntrinsicWidth; + if (!I.hasOneUse() || + !ST.hasBCNT(BitWidth = I.getType()->getIntegerBitWidth())) + return false; + + BinaryOperator *MustBeSub = dyn_cast(I.user_back()); + if (!MustBeSub || MustBeSub->getOpcode() != BinaryOperator::Sub) + return false; + + ConstantInt *FirstOperand = dyn_cast(MustBeSub->getOperand(0)); + if (!FirstOperand || FirstOperand->getZExtValue() != BitWidth) + return false; + + IRBuilder<> Builder(MustBeSub); + Instruction *TransformedIns = + Builder.CreateIntrinsic(BitWidth > 32 ? Intrinsic::amdgcn_bcnt064_lo + : Intrinsic::amdgcn_bcnt032_lo, + {}, {I.getArgOperand(0)}); + + if ((DestinationWidth = MustBeSub->getType()->getIntegerBitWidth()) != + (IntrinsicWidth = TransformedIns->getType()->getIntegerBitWidth())) + TransformedIns = cast(Builder.CreateZExtOrTrunc( + TransformedIns, Type::getIntNTy(I.getContext(), DestinationWidth))); + + MustBeSub->replaceAllUsesWith(TransformedIns); + TransformedIns->takeName(MustBeSub); + MustBeSub->eraseFromParent(); + return true; +} + bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) { Value *FractArg = matchFractPat(I); if (!FractArg) diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 1931e0be15152..4c23f3723d019 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -264,8 +264,12 @@ def S_BREV_B64 : SOP1_64 <"s_brev_b64", } // End isReMaterializable = 1, isAsCheapAsAMove = 1 let Defs = [SCC] in { -def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32">; -def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64">; +def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32", + [(set i32:$sdst, (UniformUnaryFrag i32:$src0))] +>; +def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64", + [(set i32:$sdst, (UniformUnaryFrag i64:$src0))] +>; def S_BCNT1_I32_B32 : SOP1_32 <"s_bcnt1_i32_b32", [(set i32:$sdst, (UniformUnaryFrag i32:$src0))] >; From 5d475c6d9671811d4eacd6ab7f5e9d4eeb8ba49c Mon Sep 17 00:00:00 2001 From: Patrick Simmons Date: Thu, 23 Oct 2025 14:20:42 -0500 Subject: [PATCH 02/13] Update testcases --- llvm/test/CodeGen/AMDGPU/s_cmp_0.ll | 38 +++++++++++++---------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll index 0166d7ac7ddc2..7669eae972e49 100644 --- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll +++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll @@ -459,16 +459,14 @@ define amdgpu_ps i32 @bfe_u64(i64 inreg %val0) { define amdgpu_ps i32 @bcnt032(i32 inreg %val0) { ; CHECK-LABEL: bcnt032: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_bcnt1_i32_b32 s0, s0 -; CHECK-NEXT: s_sub_i32 s0, 32, s0 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s0 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; CHECK-NEXT: v_readfirstlane_b32 s0, v0 -; CHECK-NEXT: ; return to shader part epilog +; CHECK-NEXT: s_bcnt0_i32_b32 s0, s0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s0 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog %result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone %result2 = sub i32 32, %result call void asm "; use $0", "s"(i32 %result2) @@ -480,17 +478,15 @@ define amdgpu_ps i32 @bcnt032(i32 inreg %val0) { define amdgpu_ps i32 @bcnt064(i64 inreg %val0) { ; CHECK-LABEL: bcnt064: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; CHECK-NEXT: s_sub_u32 s0, 64, s0 -; CHECK-NEXT: s_subb_u32 s1, 0, 0 -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:1] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; CHECK-NEXT: v_readfirstlane_b32 s0, v0 -; CHECK-NEXT: ; return to shader part epilog +; CHECK-NEXT: s_bcnt0_i32_b64 s0, s[0:1] +; CHECK-NEXT: s_mov_b32 s1, 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:1] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog %result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone %result2 = sub i64 64, %result call void asm "; use $0", "s"(i64 %result2) From 1396fb7b53ec02af954fab64635efa41e0576e76 Mon Sep 17 00:00:00 2001 From: Patrick Simmons Date: Thu, 23 Oct 2025 17:31:31 -0500 Subject: [PATCH 03/13] Don't perform optimization on vector types --- llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 39b558694edf8..8f13fa79d3637 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -1991,7 +1991,7 @@ Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder, bool AMDGPUCodeGenPrepareImpl::visitCtpop(IntrinsicInst &I) { uint32_t BitWidth, DestinationWidth, IntrinsicWidth; - if (!I.hasOneUse() || + if (!I.hasOneUse() || !I.getType()->isIntegerTy() || !ST.hasBCNT(BitWidth = I.getType()->getIntegerBitWidth())) return false; From 11918fd0b9e6e2fc42283e8c1eba95e79b8c68f0 Mon Sep 17 00:00:00 2001 From: Patrick Simmons Date: Thu, 23 Oct 2025 18:29:17 -0500 Subject: [PATCH 04/13] Review changes --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 4 ++-- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 8 ++++---- llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 6 +++--- llvm/lib/Target/AMDGPU/SOPInstructions.td | 4 ++-- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 053f2f1c7d19e..ce53c6e7b0537 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -63,8 +63,8 @@ BUILTIN(__builtin_amdgcn_grid_size_z, "Ui", "nc") BUILTIN(__builtin_amdgcn_mbcnt_hi, "UiUiUi", "nc") BUILTIN(__builtin_amdgcn_mbcnt_lo, "UiUiUi", "nc") -BUILTIN(__builtin_amdgcn_bcnt032_lo, "UiUi", "nc") -BUILTIN(__builtin_amdgcn_bcnt064_lo, "UiWUi", "nc") +BUILTIN(__builtin_amdgcn_bcnt32_lo, "UiUi", "nc") +BUILTIN(__builtin_amdgcn_bcnt64_lo, "UiWUi", "nc") TARGET_BUILTIN(__builtin_amdgcn_s_memtime, "WUi", "n", "s-memtime-inst") diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index a6ae156b601b1..5fe1b6672d576 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2359,12 +2359,12 @@ def int_amdgcn_mbcnt_hi : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_amdgcn_bcnt032_lo : - ClangBuiltin<"__builtin_amdgcn_bcnt032_lo">, +def int_amdgcn_bcnt32_lo : + ClangBuiltin<"__builtin_amdgcn_bcnt32_lo">, DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; -def int_amdgcn_bcnt064_lo : - ClangBuiltin<"__builtin_amdgcn_bcnt064_lo">, +def int_amdgcn_bcnt64_lo : + ClangBuiltin<"__builtin_amdgcn_bcnt64_lo">, DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>; // llvm.amdgcn.ds.swizzle src offset diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 8f13fa79d3637..169541d9d45f6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -95,7 +95,7 @@ static cl::opt DisableFDivExpand( cl::ReallyHidden, cl::init(false)); -// Disable processing of fdiv so we can better test the backend implementations. +// Disable bitsin(typeof(x)) - popcnt(x) to s_bcnt0(x) transformation. static cl::opt DisableBcnt0("amdgpu-codegenprepare-disable-bcnt0", cl::desc("Prevent transforming bitsin(typeof(x)) - " @@ -2005,8 +2005,8 @@ bool AMDGPUCodeGenPrepareImpl::visitCtpop(IntrinsicInst &I) { IRBuilder<> Builder(MustBeSub); Instruction *TransformedIns = - Builder.CreateIntrinsic(BitWidth > 32 ? Intrinsic::amdgcn_bcnt064_lo - : Intrinsic::amdgcn_bcnt032_lo, + Builder.CreateIntrinsic(BitWidth > 32 ? Intrinsic::amdgcn_bcnt64_lo + : Intrinsic::amdgcn_bcnt32_lo, {}, {I.getArgOperand(0)}); if ((DestinationWidth = MustBeSub->getType()->getIntegerBitWidth()) != diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 4c23f3723d019..e034b1f5333af 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -265,10 +265,10 @@ def S_BREV_B64 : SOP1_64 <"s_brev_b64", let Defs = [SCC] in { def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32", - [(set i32:$sdst, (UniformUnaryFrag i32:$src0))] + [(set i32:$sdst, (UniformUnaryFrag i32:$src0))] >; def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64", - [(set i32:$sdst, (UniformUnaryFrag i64:$src0))] + [(set i32:$sdst, (UniformUnaryFrag i64:$src0))] >; def S_BCNT1_I32_B32 : SOP1_32 <"s_bcnt1_i32_b32", [(set i32:$sdst, (UniformUnaryFrag i32:$src0))] From db5c6f9347912e13c7b5208bb4455f0f14856fed Mon Sep 17 00:00:00 2001 From: Patrick Simmons Date: Fri, 24 Oct 2025 14:12:23 -0500 Subject: [PATCH 05/13] Review changes: - Add tests - Remove builtin (users will need inline assembly if pattern match fails) --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 3 - llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 2 - llvm/test/CodeGen/AMDGPU/s_cmp_0.ll | 109 +++++++++++++++++++ 3 files changed, 109 insertions(+), 5 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index ce53c6e7b0537..36cb527a9c806 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -63,9 +63,6 @@ BUILTIN(__builtin_amdgcn_grid_size_z, "Ui", "nc") BUILTIN(__builtin_amdgcn_mbcnt_hi, "UiUiUi", "nc") BUILTIN(__builtin_amdgcn_mbcnt_lo, "UiUiUi", "nc") -BUILTIN(__builtin_amdgcn_bcnt32_lo, "UiUi", "nc") -BUILTIN(__builtin_amdgcn_bcnt64_lo, "UiWUi", "nc") - TARGET_BUILTIN(__builtin_amdgcn_s_memtime, "WUi", "n", "s-memtime-inst") //===----------------------------------------------------------------------===// diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 5fe1b6672d576..34e23c9c07f3a 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2360,11 +2360,9 @@ def int_amdgcn_mbcnt_hi : [IntrNoMem]>; def int_amdgcn_bcnt32_lo : - ClangBuiltin<"__builtin_amdgcn_bcnt32_lo">, DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; def int_amdgcn_bcnt64_lo : - ClangBuiltin<"__builtin_amdgcn_bcnt64_lo">, DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>; // llvm.amdgcn.ds.swizzle src offset diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll index 7669eae972e49..1a6f32ef7e617 100644 --- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll +++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll @@ -636,3 +636,112 @@ if: endif: ret i32 1 } + +define amdgpu_ps void @bcnt032_not_for_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; CHECK-LABEL: bcnt032_not_for_vregs: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_lshl_b32 s0, s0, 2 +; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; CHECK-NEXT: global_load_dword v2, v[2:3], off glc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_bcnt_u32_b32 v2, v2, 0 +; CHECK-NEXT: v_sub_u32_e32 v3, 32, v2 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use v3 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: global_store_dword v[0:1], v2, off +; CHECK-NEXT: s_endpgm + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid + %val0 = load volatile i32, ptr addrspace(1) %gep + %result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone + %result2 = sub i32 32, %result + call void asm "; use $0", "s"(i32 %result2) + %cmp = icmp ne i32 %result2, 0 + %zext = zext i1 %cmp to i32 + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @bcnt064_not_for_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; CHECK-LABEL: bcnt064_not_for_vregs: +; CHECK: ; %bb.0: +; CHECK-NEXT: b32 s0, s0, 2 +; CHECK-NEXT: o_u32_e32 v2, vcc, s0, v2 +; CHECK-NEXT: co_u32_e32 v3, vcc, 0, v3, vcc +; CHECK-NEXT: load_dwordx2 v[2:3], v[2:3], off glc +; CHECK-NEXT: nt vmcnt(0) +; CHECK-NEXT: 32_e32 v4, 0 +; CHECK-NEXT: u32_b32 v2, v2, 0 +; CHECK-NEXT: u32_b32 v3, v3, v2 +; CHECK-NEXT: o_u32_e32 v5, vcc, 64, v3 +; CHECK-NEXT: co_u32_e64 v6, s[0:1], 0, 0, vcc +; CHECK-NEXT: TART +; CHECK-NEXT: [5:6] +; CHECK-NEXT: ND +; CHECK-NEXT: store_dwordx2 v[0:1], v[3:4], off +; CHECK-NEXT: m + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid + %val0 = load volatile i64, ptr addrspace(1) %gep + %result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone + %result2 = sub i64 64, %result + call void asm "; use $0", "s"(i64 %result2) + %cmp = icmp ne i64 %result2, 0 + %zext = zext i1 %cmp to i32 + store i64 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps i32 @bcnt032_ctpop_multiple_uses(i32 inreg %val0) { +; CHECK-LABEL: bcnt032_ctpop_multiple_uses: +; CHECK: ; %bb.0: +; CHECK-NEXT: _i32_b32 s0, s0 +; CHECK-NEXT: 32 s1, 32, s0 +; CHECK-NEXT: g_u32 s1, 0 +; CHECK-NEXT: TART +; CHECK-NEXT: 0 +; CHECK-NEXT: ND +; CHECK-NEXT: TART +; CHECK-NEXT: 1 +; CHECK-NEXT: ND +; CHECK-NEXT: ct_b64 s[0:1], -1, 0 +; CHECK-NEXT: sk_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: irstlane_b32 s0, v0 +; CHECK-NEXT: n to shader part epilog + %result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone + %result2 = sub i32 32, %result + call void asm "; use $0", "s"(i32 %result) + call void asm "; use $0", "s"(i32 %result2) + %cmp = icmp ne i32 %result2, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @bcnt064_ctpop_multiple_uses(i64 inreg %val0) { +; CHECK-LABEL: bcnt064_ctpop_multiple_uses: +; CHECK: ; %bb.0: +; CHECK-NEXT: _i32_b64 s0, s[0:1] +; CHECK-NEXT: 32 s2, 64, s0 +; CHECK-NEXT: u32 s3, 0, 0 +; CHECK-NEXT: 32 s1, 0 +; CHECK-NEXT: g_u64 s[2:3], 0 +; CHECK-NEXT: TART +; CHECK-NEXT: [0:1] +; CHECK-NEXT: ND +; CHECK-NEXT: ct_b64 s[0:1], -1, 0 +; CHECK-NEXT: sk_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: irstlane_b32 s0, v0 +; CHECK-NEXT: TART +; CHECK-NEXT: [2:3] +; CHECK-NEXT: ND +; CHECK-NEXT: n to shader part epilog + %result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone + %result2 = sub i64 64, %result + call void asm "; use $0", "s"(i64 %result) + call void asm "; use $0", "s"(i64 %result2) + %cmp = icmp ne i64 %result2, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} \ No newline at end of file From b540b4cc12ec2dd2379838785f32405995d4cae8 Mon Sep 17 00:00:00 2001 From: Patrick Simmons Date: Mon, 27 Oct 2025 14:18:45 -0500 Subject: [PATCH 06/13] Reviewer-suggested refactoring --- .../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 26 +++++++++---------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 169541d9d45f6..94dcba7aab3e2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -34,6 +34,7 @@ #include "llvm/Pass.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/KnownFPClass.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/IntegerDivision.h" #include "llvm/Transforms/Utils/Local.h" #include @@ -1990,17 +1991,16 @@ Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder, } bool AMDGPUCodeGenPrepareImpl::visitCtpop(IntrinsicInst &I) { - uint32_t BitWidth, DestinationWidth, IntrinsicWidth; - if (!I.hasOneUse() || !I.getType()->isIntegerTy() || - !ST.hasBCNT(BitWidth = I.getType()->getIntegerBitWidth())) + uint32_t BitWidth, DestinationWidth; + if (!I.hasOneUse() || !I.getType()->isIntegerTy()) return false; - BinaryOperator *MustBeSub = dyn_cast(I.user_back()); - if (!MustBeSub || MustBeSub->getOpcode() != BinaryOperator::Sub) + BitWidth = I.getType()->getIntegerBitWidth(); + if(!ST.hasBCNT(BitWidth)) return false; - ConstantInt *FirstOperand = dyn_cast(MustBeSub->getOperand(0)); - if (!FirstOperand || FirstOperand->getZExtValue() != BitWidth) + Instruction *MustBeSub = I.user_back(); + if (!match(MustBeSub, m_Sub(m_SpecificInt(BitWidth), m_Specific(&I)))) return false; IRBuilder<> Builder(MustBeSub); @@ -2009,14 +2009,12 @@ bool AMDGPUCodeGenPrepareImpl::visitCtpop(IntrinsicInst &I) { : Intrinsic::amdgcn_bcnt32_lo, {}, {I.getArgOperand(0)}); - if ((DestinationWidth = MustBeSub->getType()->getIntegerBitWidth()) != - (IntrinsicWidth = TransformedIns->getType()->getIntegerBitWidth())) - TransformedIns = cast(Builder.CreateZExtOrTrunc( - TransformedIns, Type::getIntNTy(I.getContext(), DestinationWidth))); + DestinationWidth = MustBeSub->getType()->getIntegerBitWidth(); + TransformedIns = cast(Builder.CreateZExtOrTrunc( + TransformedIns, Type::getIntNTy(I.getContext(), DestinationWidth))); - MustBeSub->replaceAllUsesWith(TransformedIns); - TransformedIns->takeName(MustBeSub); - MustBeSub->eraseFromParent(); + BasicBlock::iterator SubIt = MustBeSub->getIterator(); + ReplaceInstWithValue(SubIt,TransformedIns); return true; } From e82de8038828212f6cbebfe7cf5e304803d901bd Mon Sep 17 00:00:00 2001 From: Patrick Simmons Date: Tue, 28 Oct 2025 17:30:52 -0500 Subject: [PATCH 07/13] Revert implementation --- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 6 --- .../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 41 ------------------- 2 files changed, 47 deletions(-) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 34e23c9c07f3a..8e35109061792 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2359,12 +2359,6 @@ def int_amdgcn_mbcnt_hi : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_amdgcn_bcnt32_lo : - DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; - -def int_amdgcn_bcnt64_lo : - DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>; - // llvm.amdgcn.ds.swizzle src offset def int_amdgcn_ds_swizzle : ClangBuiltin<"__builtin_amdgcn_ds_swizzle">, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 94dcba7aab3e2..8e35ba77d69aa 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -26,7 +26,6 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" -#include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/ValueHandle.h" @@ -34,10 +33,8 @@ #include "llvm/Pass.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/KnownFPClass.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/IntegerDivision.h" #include "llvm/Transforms/Utils/Local.h" -#include #define DEBUG_TYPE "amdgpu-codegenprepare" @@ -96,13 +93,6 @@ static cl::opt DisableFDivExpand( cl::ReallyHidden, cl::init(false)); -// Disable bitsin(typeof(x)) - popcnt(x) to s_bcnt0(x) transformation. -static cl::opt - DisableBcnt0("amdgpu-codegenprepare-disable-bcnt0", - cl::desc("Prevent transforming bitsin(typeof(x)) - " - "popcount(x) to bcnt0(x) in AMDGPUCodeGenPrepare"), - cl::ReallyHidden, cl::init(false)); - class AMDGPUCodeGenPrepareImpl : public InstVisitor { public: @@ -268,7 +258,6 @@ class AMDGPUCodeGenPrepareImpl bool visitAddrSpaceCastInst(AddrSpaceCastInst &I); bool visitIntrinsicInst(IntrinsicInst &I); - bool visitCtpop(IntrinsicInst &I); bool visitFMinLike(IntrinsicInst &I); bool visitSqrt(IntrinsicInst &I); bool run(); @@ -1921,8 +1910,6 @@ bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) { return visitFMinLike(I); case Intrinsic::sqrt: return visitSqrt(I); - case Intrinsic::ctpop: - return visitCtpop(I); default: return false; } @@ -1990,34 +1977,6 @@ Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder, return insertValues(Builder, FractArg->getType(), ResultVals); } -bool AMDGPUCodeGenPrepareImpl::visitCtpop(IntrinsicInst &I) { - uint32_t BitWidth, DestinationWidth; - if (!I.hasOneUse() || !I.getType()->isIntegerTy()) - return false; - - BitWidth = I.getType()->getIntegerBitWidth(); - if(!ST.hasBCNT(BitWidth)) - return false; - - Instruction *MustBeSub = I.user_back(); - if (!match(MustBeSub, m_Sub(m_SpecificInt(BitWidth), m_Specific(&I)))) - return false; - - IRBuilder<> Builder(MustBeSub); - Instruction *TransformedIns = - Builder.CreateIntrinsic(BitWidth > 32 ? Intrinsic::amdgcn_bcnt64_lo - : Intrinsic::amdgcn_bcnt32_lo, - {}, {I.getArgOperand(0)}); - - DestinationWidth = MustBeSub->getType()->getIntegerBitWidth(); - TransformedIns = cast(Builder.CreateZExtOrTrunc( - TransformedIns, Type::getIntNTy(I.getContext(), DestinationWidth))); - - BasicBlock::iterator SubIt = MustBeSub->getIterator(); - ReplaceInstWithValue(SubIt,TransformedIns); - return true; -} - bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) { Value *FractArg = matchFractPat(I); if (!FractArg) From 2fe905a2de412cca14352ebcb2e32c7b2b6da355 Mon Sep 17 00:00:00 2001 From: Patrick Simmons Date: Tue, 28 Oct 2025 19:11:33 -0500 Subject: [PATCH 08/13] Use S-expressions instead --- llvm/lib/Target/AMDGPU/SOPInstructions.td | 13 ++++-- llvm/test/CodeGen/AMDGPU/s_cmp_0.ll | 56 +++++++++++------------ 2 files changed, 37 insertions(+), 32 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index e034b1f5333af..bbc278f201141 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -265,11 +265,9 @@ def S_BREV_B64 : SOP1_64 <"s_brev_b64", let Defs = [SCC] in { def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32", - [(set i32:$sdst, (UniformUnaryFrag i32:$src0))] ->; -def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64", - [(set i32:$sdst, (UniformUnaryFrag i64:$src0))] + [(set i32:$sdst, (UniformBinFrag 32, (UniformUnaryFrag i32:$src0)))] >; +def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64">; def S_BCNT1_I32_B32 : SOP1_32 <"s_bcnt1_i32_b32", [(set i32:$sdst, (UniformUnaryFrag i32:$src0))] >; @@ -1889,6 +1887,13 @@ def : GCNPat < (S_MOV_B32 (i32 0)), sub1)) >; +def : GCNPat < + (i64 (UniformBinFrag 64, (UniformUnaryFrag i64:$src))), + (i64 (REG_SEQUENCE SReg_64, + (i32 (COPY_TO_REGCLASS (S_BCNT0_I32_B64 $src), SReg_32)), sub0, + (S_MOV_B32 (i32 0)), sub1)) +>; + def : GCNPat < (i32 (UniformBinFrag i32:$x, (i32 (ineg i32:$x)))), (S_ABS_I32 SReg_32:$x) diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll index 1a6f32ef7e617..8e7b8c530d4f4 100644 --- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll +++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll @@ -480,6 +480,7 @@ define amdgpu_ps i32 @bcnt064(i64 inreg %val0) { ; CHECK: ; %bb.0: ; CHECK-NEXT: s_bcnt0_i32_b64 s0, s[0:1] ; CHECK-NEXT: s_mov_b32 s1, 0 +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:1] ; CHECK-NEXT: ;;#ASMEND @@ -697,19 +698,18 @@ define amdgpu_ps void @bcnt064_not_for_vregs(ptr addrspace(1) %out, ptr addrspac define amdgpu_ps i32 @bcnt032_ctpop_multiple_uses(i32 inreg %val0) { ; CHECK-LABEL: bcnt032_ctpop_multiple_uses: ; CHECK: ; %bb.0: -; CHECK-NEXT: _i32_b32 s0, s0 -; CHECK-NEXT: 32 s1, 32, s0 -; CHECK-NEXT: g_u32 s1, 0 -; CHECK-NEXT: TART -; CHECK-NEXT: 0 -; CHECK-NEXT: ND -; CHECK-NEXT: TART -; CHECK-NEXT: 1 -; CHECK-NEXT: ND -; CHECK-NEXT: ct_b64 s[0:1], -1, 0 -; CHECK-NEXT: sk_b32_e64 v0, 0, 1, s[0:1] -; CHECK-NEXT: irstlane_b32 s0, v0 -; CHECK-NEXT: n to shader part epilog +; CHECK-NEXT: s_bcnt1_i32_b32 s1, s0 +; CHECK-NEXT: s_bcnt0_i32_b32 s0, s0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s1 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s0 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog %result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone %result2 = sub i32 32, %result call void asm "; use $0", "s"(i32 %result) @@ -722,21 +722,21 @@ define amdgpu_ps i32 @bcnt032_ctpop_multiple_uses(i32 inreg %val0) { define amdgpu_ps i32 @bcnt064_ctpop_multiple_uses(i64 inreg %val0) { ; CHECK-LABEL: bcnt064_ctpop_multiple_uses: ; CHECK: ; %bb.0: -; CHECK-NEXT: _i32_b64 s0, s[0:1] -; CHECK-NEXT: 32 s2, 64, s0 -; CHECK-NEXT: u32 s3, 0, 0 -; CHECK-NEXT: 32 s1, 0 -; CHECK-NEXT: g_u64 s[2:3], 0 -; CHECK-NEXT: TART -; CHECK-NEXT: [0:1] -; CHECK-NEXT: ND -; CHECK-NEXT: ct_b64 s[0:1], -1, 0 -; CHECK-NEXT: sk_b32_e64 v0, 0, 1, s[0:1] -; CHECK-NEXT: irstlane_b32 s0, v0 -; CHECK-NEXT: TART -; CHECK-NEXT: [2:3] -; CHECK-NEXT: ND -; CHECK-NEXT: n to shader part epilog +; CHECK-NEXT: s_mov_b32 s3, 0 +; CHECK-NEXT: s_bcnt1_i32_b64 s2, s[0:1] +; CHECK-NEXT: s_bcnt0_i32_b64 s0, s[0:1] +; CHECK-NEXT: s_mov_b32 s1, s3 +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:1] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[2:3] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ; return to shader part epilog %result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone %result2 = sub i64 64, %result call void asm "; use $0", "s"(i64 %result) From 25cc6e3339c34849f0c9d510fc45d2128edc4b21 Mon Sep 17 00:00:00 2001 From: Patrick Simmons Date: Wed, 29 Oct 2025 12:35:46 -0500 Subject: [PATCH 09/13] Review changes --- llvm/lib/Target/AMDGPU/SOPInstructions.td | 4 +- llvm/test/CodeGen/AMDGPU/s_bcnt0.ll | 110 ++++++++++++++++++++++ llvm/test/CodeGen/AMDGPU/s_cmp_0.ll | 108 --------------------- 3 files changed, 112 insertions(+), 110 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/s_bcnt0.ll diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index bbc278f201141..68f20cae50825 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -265,7 +265,7 @@ def S_BREV_B64 : SOP1_64 <"s_brev_b64", let Defs = [SCC] in { def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32", - [(set i32:$sdst, (UniformBinFrag 32, (UniformUnaryFrag i32:$src0)))] + [(set i32:$sdst, (UniformBinFrag 32, (ctpop i32:$src0)))] >; def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64">; def S_BCNT1_I32_B32 : SOP1_32 <"s_bcnt1_i32_b32", @@ -1888,7 +1888,7 @@ def : GCNPat < >; def : GCNPat < - (i64 (UniformBinFrag 64, (UniformUnaryFrag i64:$src))), + (i64 (UniformBinFrag 64, (ctpop i64:$src))), (i64 (REG_SEQUENCE SReg_64, (i32 (COPY_TO_REGCLASS (S_BCNT0_I32_B64 $src), SReg_32)), sub0, (S_MOV_B32 (i32 0)), sub1)) diff --git a/llvm/test/CodeGen/AMDGPU/s_bcnt0.ll b/llvm/test/CodeGen/AMDGPU/s_bcnt0.ll new file mode 100644 index 0000000000000..a73a12ece94f3 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/s_bcnt0.ll @@ -0,0 +1,110 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s + +define amdgpu_ps void @bcnt032_not_for_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; CHECK-LABEL: bcnt032_not_for_vregs: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_lshl_b32 s0, s0, 2 +; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; CHECK-NEXT: global_load_dword v2, v[2:3], off glc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_bcnt_u32_b32 v2, v2, 0 +; CHECK-NEXT: v_sub_u32_e32 v3, 32, v2 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use v3 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: global_store_dword v[0:1], v2, off +; CHECK-NEXT: s_endpgm + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid + %val0 = load volatile i32, ptr addrspace(1) %gep + %result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone + %result2 = sub i32 32, %result + call void asm "; use $0", "s"(i32 %result2) + %cmp = icmp ne i32 %result2, 0 + %zext = zext i1 %cmp to i32 + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @bcnt064_not_for_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; CHECK-LABEL: bcnt064_not_for_vregs: +; CHECK: ; %bb.0: +; CHECK-NEXT: b32 s0, s0, 2 +; CHECK-NEXT: o_u32_e32 v2, vcc, s0, v2 +; CHECK-NEXT: co_u32_e32 v3, vcc, 0, v3, vcc +; CHECK-NEXT: load_dwordx2 v[2:3], v[2:3], off glc +; CHECK-NEXT: nt vmcnt(0) +; CHECK-NEXT: 32_e32 v4, 0 +; CHECK-NEXT: u32_b32 v2, v2, 0 +; CHECK-NEXT: u32_b32 v3, v3, v2 +; CHECK-NEXT: o_u32_e32 v5, vcc, 64, v3 +; CHECK-NEXT: co_u32_e64 v6, s[0:1], 0, 0, vcc +; CHECK-NEXT: TART +; CHECK-NEXT: [5:6] +; CHECK-NEXT: ND +; CHECK-NEXT: store_dwordx2 v[0:1], v[3:4], off +; CHECK-NEXT: m + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid + %val0 = load volatile i64, ptr addrspace(1) %gep + %result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone + %result2 = sub i64 64, %result + call void asm "; use $0", "s"(i64 %result2) + %cmp = icmp ne i64 %result2, 0 + %zext = zext i1 %cmp to i32 + store i64 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps i32 @bcnt032_ctpop_multiple_uses(i32 inreg %val0) { +; CHECK-LABEL: bcnt032_ctpop_multiple_uses: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_bcnt1_i32_b32 s1, s0 +; CHECK-NEXT: s_bcnt0_i32_b32 s0, s0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s1 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s0 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone + %result2 = sub i32 32, %result + call void asm "; use $0", "s"(i32 %result) + call void asm "; use $0", "s"(i32 %result2) + %cmp = icmp ne i32 %result2, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @bcnt064_ctpop_multiple_uses(i64 inreg %val0) { +; CHECK-LABEL: bcnt064_ctpop_multiple_uses: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_mov_b32 s3, 0 +; CHECK-NEXT: s_bcnt1_i32_b64 s2, s[0:1] +; CHECK-NEXT: s_bcnt0_i32_b64 s0, s[0:1] +; CHECK-NEXT: s_mov_b32 s1, s3 +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:1] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[2:3] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ; return to shader part epilog + %result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone + %result2 = sub i64 64, %result + call void asm "; use $0", "s"(i64 %result) + call void asm "; use $0", "s"(i64 %result2) + %cmp = icmp ne i64 %result2, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll index 8e7b8c530d4f4..1e1a2a257cd87 100644 --- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll +++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll @@ -636,112 +636,4 @@ if: endif: ret i32 1 -} - -define amdgpu_ps void @bcnt032_not_for_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) { -; CHECK-LABEL: bcnt032_not_for_vregs: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_lshl_b32 s0, s0, 2 -; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; CHECK-NEXT: global_load_dword v2, v[2:3], off glc -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_bcnt_u32_b32 v2, v2, 0 -; CHECK-NEXT: v_sub_u32_e32 v3, 32, v2 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use v3 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: global_store_dword v[0:1], v2, off -; CHECK-NEXT: s_endpgm - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid - %val0 = load volatile i32, ptr addrspace(1) %gep - %result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone - %result2 = sub i32 32, %result - call void asm "; use $0", "s"(i32 %result2) - %cmp = icmp ne i32 %result2, 0 - %zext = zext i1 %cmp to i32 - store i32 %result, ptr addrspace(1) %out - ret void -} - -define amdgpu_ps void @bcnt064_not_for_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) { -; CHECK-LABEL: bcnt064_not_for_vregs: -; CHECK: ; %bb.0: -; CHECK-NEXT: b32 s0, s0, 2 -; CHECK-NEXT: o_u32_e32 v2, vcc, s0, v2 -; CHECK-NEXT: co_u32_e32 v3, vcc, 0, v3, vcc -; CHECK-NEXT: load_dwordx2 v[2:3], v[2:3], off glc -; CHECK-NEXT: nt vmcnt(0) -; CHECK-NEXT: 32_e32 v4, 0 -; CHECK-NEXT: u32_b32 v2, v2, 0 -; CHECK-NEXT: u32_b32 v3, v3, v2 -; CHECK-NEXT: o_u32_e32 v5, vcc, 64, v3 -; CHECK-NEXT: co_u32_e64 v6, s[0:1], 0, 0, vcc -; CHECK-NEXT: TART -; CHECK-NEXT: [5:6] -; CHECK-NEXT: ND -; CHECK-NEXT: store_dwordx2 v[0:1], v[3:4], off -; CHECK-NEXT: m - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid - %val0 = load volatile i64, ptr addrspace(1) %gep - %result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone - %result2 = sub i64 64, %result - call void asm "; use $0", "s"(i64 %result2) - %cmp = icmp ne i64 %result2, 0 - %zext = zext i1 %cmp to i32 - store i64 %result, ptr addrspace(1) %out - ret void -} - -define amdgpu_ps i32 @bcnt032_ctpop_multiple_uses(i32 inreg %val0) { -; CHECK-LABEL: bcnt032_ctpop_multiple_uses: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_bcnt1_i32_b32 s1, s0 -; CHECK-NEXT: s_bcnt0_i32_b32 s0, s0 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s1 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s0 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; CHECK-NEXT: v_readfirstlane_b32 s0, v0 -; CHECK-NEXT: ; return to shader part epilog - %result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone - %result2 = sub i32 32, %result - call void asm "; use $0", "s"(i32 %result) - call void asm "; use $0", "s"(i32 %result2) - %cmp = icmp ne i32 %result2, 0 - %zext = zext i1 %cmp to i32 - ret i32 %zext -} - -define amdgpu_ps i32 @bcnt064_ctpop_multiple_uses(i64 inreg %val0) { -; CHECK-LABEL: bcnt064_ctpop_multiple_uses: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_mov_b32 s3, 0 -; CHECK-NEXT: s_bcnt1_i32_b64 s2, s[0:1] -; CHECK-NEXT: s_bcnt0_i32_b64 s0, s[0:1] -; CHECK-NEXT: s_mov_b32 s1, s3 -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:1] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; CHECK-NEXT: v_readfirstlane_b32 s0, v0 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[2:3] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ; return to shader part epilog - %result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone - %result2 = sub i64 64, %result - call void asm "; use $0", "s"(i64 %result) - call void asm "; use $0", "s"(i64 %result2) - %cmp = icmp ne i64 %result2, 0 - %zext = zext i1 %cmp to i32 - ret i32 %zext } \ No newline at end of file From 00beb852ad3cec69197856f09f4a1733ba3c3198 Mon Sep 17 00:00:00 2001 From: Patrick Simmons Date: Wed, 29 Oct 2025 12:37:29 -0500 Subject: [PATCH 10/13] Newline --- llvm/test/CodeGen/AMDGPU/s_cmp_0.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll index 1e1a2a257cd87..b3c5042bb98fe 100644 --- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll +++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll @@ -636,4 +636,4 @@ if: endif: ret i32 1 -} \ No newline at end of file +} From 719af875f52bab68ce5987ef0e48bb6cb656d1bb Mon Sep 17 00:00:00 2001 From: Patrick Simmons Date: Fri, 31 Oct 2025 16:10:03 -0500 Subject: [PATCH 11/13] Fix testcases --- llvm/test/CodeGen/AMDGPU/s_bcnt0.ll | 114 ++++++++++++++-------------- llvm/test/CodeGen/AMDGPU/s_cmp_0.ll | 36 ++++----- 2 files changed, 75 insertions(+), 75 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/s_bcnt0.ll b/llvm/test/CodeGen/AMDGPU/s_bcnt0.ll index a73a12ece94f3..d26d12d821026 100644 --- a/llvm/test/CodeGen/AMDGPU/s_bcnt0.ll +++ b/llvm/test/CodeGen/AMDGPU/s_bcnt0.ll @@ -3,19 +3,19 @@ define amdgpu_ps void @bcnt032_not_for_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; CHECK-LABEL: bcnt032_not_for_vregs: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_lshl_b32 s0, s0, 2 -; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; CHECK-NEXT: global_load_dword v2, v[2:3], off glc -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_bcnt_u32_b32 v2, v2, 0 -; CHECK-NEXT: v_sub_u32_e32 v3, 32, v2 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use v3 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: global_store_dword v[0:1], v2, off -; CHECK-NEXT: s_endpgm +; CHECK: ; %bb.0: +; CHECK-NEXT: s_lshl_b32 s0, s0, 2 +; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; CHECK-NEXT: global_load_dword v2, v[2:3], off glc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_bcnt_u32_b32 v2, v2, 0 +; CHECK-NEXT: v_sub_u32_e32 v3, 32, v2 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use v3 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: global_store_dword v[0:1], v2, off +; CHECK-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid %val0 = load volatile i32, ptr addrspace(1) %gep @@ -30,22 +30,22 @@ define amdgpu_ps void @bcnt032_not_for_vregs(ptr addrspace(1) %out, ptr addrspac define amdgpu_ps void @bcnt064_not_for_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; CHECK-LABEL: bcnt064_not_for_vregs: -; CHECK: ; %bb.0: -; CHECK-NEXT: b32 s0, s0, 2 -; CHECK-NEXT: o_u32_e32 v2, vcc, s0, v2 -; CHECK-NEXT: co_u32_e32 v3, vcc, 0, v3, vcc -; CHECK-NEXT: load_dwordx2 v[2:3], v[2:3], off glc -; CHECK-NEXT: nt vmcnt(0) -; CHECK-NEXT: 32_e32 v4, 0 -; CHECK-NEXT: u32_b32 v2, v2, 0 -; CHECK-NEXT: u32_b32 v3, v3, v2 -; CHECK-NEXT: o_u32_e32 v5, vcc, 64, v3 -; CHECK-NEXT: co_u32_e64 v6, s[0:1], 0, 0, vcc -; CHECK-NEXT: TART -; CHECK-NEXT: [5:6] -; CHECK-NEXT: ND -; CHECK-NEXT: store_dwordx2 v[0:1], v[3:4], off -; CHECK-NEXT: m +; CHECK: ; %bb.0: +; CHECK-NEXT: s_lshl_b32 s0, s0, 2 +; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off glc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v4, 0 +; CHECK-NEXT: v_bcnt_u32_b32 v2, v2, 0 +; CHECK-NEXT: v_bcnt_u32_b32 v3, v3, v2 +; CHECK-NEXT: v_sub_co_u32_e32 v5, vcc, 64, v3 +; CHECK-NEXT: v_subb_co_u32_e64 v6, s[0:1], 0, 0, vcc +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use v[5:6] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[3:4], off +; CHECK-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid %val0 = load volatile i64, ptr addrspace(1) %gep @@ -61,18 +61,18 @@ define amdgpu_ps void @bcnt064_not_for_vregs(ptr addrspace(1) %out, ptr addrspac define amdgpu_ps i32 @bcnt032_ctpop_multiple_uses(i32 inreg %val0) { ; CHECK-LABEL: bcnt032_ctpop_multiple_uses: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_bcnt1_i32_b32 s1, s0 -; CHECK-NEXT: s_bcnt0_i32_b32 s0, s0 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s1 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s0 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; CHECK-NEXT: v_readfirstlane_b32 s0, v0 -; CHECK-NEXT: ; return to shader part epilog +; CHECK-NEXT: s_bcnt1_i32_b32 s1, s0 +; CHECK-NEXT: s_bcnt0_i32_b32 s0, s0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s1 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s0 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog %result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone %result2 = sub i32 32, %result call void asm "; use $0", "s"(i32 %result) @@ -85,21 +85,21 @@ define amdgpu_ps i32 @bcnt032_ctpop_multiple_uses(i32 inreg %val0) { define amdgpu_ps i32 @bcnt064_ctpop_multiple_uses(i64 inreg %val0) { ; CHECK-LABEL: bcnt064_ctpop_multiple_uses: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_mov_b32 s3, 0 -; CHECK-NEXT: s_bcnt1_i32_b64 s2, s[0:1] -; CHECK-NEXT: s_bcnt0_i32_b64 s0, s[0:1] -; CHECK-NEXT: s_mov_b32 s1, s3 -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:1] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; CHECK-NEXT: v_readfirstlane_b32 s0, v0 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[2:3] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ; return to shader part epilog +; CHECK-NEXT: s_mov_b32 s3, 0 +; CHECK-NEXT: s_bcnt1_i32_b64 s2, s[0:1] +; CHECK-NEXT: s_bcnt0_i32_b64 s0, s[0:1] +; CHECK-NEXT: s_mov_b32 s1, s3 +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:1] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[2:3] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ; return to shader part epilog %result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone %result2 = sub i64 64, %result call void asm "; use $0", "s"(i64 %result) @@ -107,4 +107,4 @@ define amdgpu_ps i32 @bcnt064_ctpop_multiple_uses(i64 inreg %val0) { %cmp = icmp ne i64 %result2, 0 %zext = zext i1 %cmp to i32 ret i32 %zext -} \ No newline at end of file +} diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll index b3c5042bb98fe..fb48f5706feeb 100644 --- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll +++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll @@ -459,14 +459,14 @@ define amdgpu_ps i32 @bfe_u64(i64 inreg %val0) { define amdgpu_ps i32 @bcnt032(i32 inreg %val0) { ; CHECK-LABEL: bcnt032: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_bcnt0_i32_b32 s0, s0 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s0 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; CHECK-NEXT: v_readfirstlane_b32 s0, v0 -; CHECK-NEXT: ; return to shader part epilog +; CHECK-NEXT: s_bcnt0_i32_b32 s0, s0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s0 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog %result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone %result2 = sub i32 32, %result call void asm "; use $0", "s"(i32 %result2) @@ -478,16 +478,16 @@ define amdgpu_ps i32 @bcnt032(i32 inreg %val0) { define amdgpu_ps i32 @bcnt064(i64 inreg %val0) { ; CHECK-LABEL: bcnt064: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_bcnt0_i32_b64 s0, s[0:1] -; CHECK-NEXT: s_mov_b32 s1, 0 -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:1] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; CHECK-NEXT: v_readfirstlane_b32 s0, v0 -; CHECK-NEXT: ; return to shader part epilog +; CHECK-NEXT: s_bcnt0_i32_b64 s0, s[0:1] +; CHECK-NEXT: s_mov_b32 s1, 0 +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:1] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog %result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone %result2 = sub i64 64, %result call void asm "; use $0", "s"(i64 %result2) From 675b5fbc68621eb7cd0facf26233621a8790dd89 Mon Sep 17 00:00:00 2001 From: Patrick Simmons Date: Tue, 4 Nov 2025 12:06:40 -0500 Subject: [PATCH 12/13] Change testcases --- llvm/test/CodeGen/AMDGPU/s_bcnt0.ll | 64 +++++++++++------------------ 1 file changed, 25 insertions(+), 39 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/s_bcnt0.ll b/llvm/test/CodeGen/AMDGPU/s_bcnt0.ll index d26d12d821026..1ef7648f7da4b 100644 --- a/llvm/test/CodeGen/AMDGPU/s_bcnt0.ll +++ b/llvm/test/CodeGen/AMDGPU/s_bcnt0.ll @@ -1,61 +1,47 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -define amdgpu_ps void @bcnt032_not_for_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) { +define amdgpu_ps i32 @bcnt032_not_for_vregs(i64 %val) { ; CHECK-LABEL: bcnt032_not_for_vregs: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_lshl_b32 s0, s0, 2 -; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; CHECK-NEXT: global_load_dword v2, v[2:3], off glc -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_bcnt_u32_b32 v2, v2, 0 -; CHECK-NEXT: v_sub_u32_e32 v3, 32, v2 +; CHECK-NEXT: v_bcnt_u32_b32 v0, v0, 0 +; CHECK-NEXT: v_sub_u32_e32 v0, 32, v0 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use v3 +; CHECK-NEXT: ; use v0 ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: global_store_dword v[0:1], v2, off -; CHECK-NEXT: s_endpgm - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid - %val0 = load volatile i32, ptr addrspace(1) %gep - %result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %val0 = trunc i64 %val to i32 + %result = call i32 @llvm.ctpop.i32(i32 %val0) %result2 = sub i32 32, %result call void asm "; use $0", "s"(i32 %result2) %cmp = icmp ne i32 %result2, 0 %zext = zext i1 %cmp to i32 - store i32 %result, ptr addrspace(1) %out - ret void + ret i32 %zext } -define amdgpu_ps void @bcnt064_not_for_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) { +define amdgpu_ps i32 @bcnt064_not_for_vregs(i64 %val0) { ; CHECK-LABEL: bcnt064_not_for_vregs: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_lshl_b32 s0, s0, 2 -; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off glc -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v4, 0 -; CHECK-NEXT: v_bcnt_u32_b32 v2, v2, 0 -; CHECK-NEXT: v_bcnt_u32_b32 v3, v3, v2 -; CHECK-NEXT: v_sub_co_u32_e32 v5, vcc, 64, v3 -; CHECK-NEXT: v_subb_co_u32_e64 v6, s[0:1], 0, 0, vcc +; CHECK-NEXT: v_bcnt_u32_b32 v0, v0, 0 +; CHECK-NEXT: v_bcnt_u32_b32 v0, v1, v0 +; CHECK-NEXT: v_sub_co_u32_e32 v0, vcc, 64, v0 +; CHECK-NEXT: v_subb_co_u32_e64 v1, s[0:1], 0, 0, vcc +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use v[5:6] +; CHECK-NEXT: ; use v[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: global_store_dwordx2 v[0:1], v[3:4], off -; CHECK-NEXT: s_endpgm - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid - %val0 = load volatile i64, ptr addrspace(1) %gep - %result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = call i64 @llvm.ctpop.i64(i64 %val0) %result2 = sub i64 64, %result call void asm "; use $0", "s"(i64 %result2) %cmp = icmp ne i64 %result2, 0 %zext = zext i1 %cmp to i32 - store i64 %result, ptr addrspace(1) %out - ret void + ret i32 %zext } define amdgpu_ps i32 @bcnt032_ctpop_multiple_uses(i32 inreg %val0) { @@ -73,7 +59,7 @@ define amdgpu_ps i32 @bcnt032_ctpop_multiple_uses(i32 inreg %val0) { ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 ; CHECK-NEXT: ; return to shader part epilog - %result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone + %result = call i32 @llvm.ctpop.i32(i32 %val0) %result2 = sub i32 32, %result call void asm "; use $0", "s"(i32 %result) call void asm "; use $0", "s"(i32 %result2) @@ -100,7 +86,7 @@ define amdgpu_ps i32 @bcnt064_ctpop_multiple_uses(i64 inreg %val0) { ; CHECK-NEXT: ; use s[2:3] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ; return to shader part epilog - %result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone + %result = call i64 @llvm.ctpop.i64(i64 %val0) %result2 = sub i64 64, %result call void asm "; use $0", "s"(i64 %result) call void asm "; use $0", "s"(i64 %result2) From 71687c2c61699b7825ed2e51efb24dd6d8769567 Mon Sep 17 00:00:00 2001 From: Patrick Simmons Date: Wed, 5 Nov 2025 15:20:21 -0500 Subject: [PATCH 13/13] Testcase updates (maybe revert this) --- llvm/test/CodeGen/AMDGPU/s_bcnt0.ll | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/s_bcnt0.ll b/llvm/test/CodeGen/AMDGPU/s_bcnt0.ll index 1ef7648f7da4b..89bef87f900f2 100644 --- a/llvm/test/CodeGen/AMDGPU/s_bcnt0.ll +++ b/llvm/test/CodeGen/AMDGPU/s_bcnt0.ll @@ -1,44 +1,33 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -define amdgpu_ps i32 @bcnt032_not_for_vregs(i64 %val) { +define i32 @bcnt032_not_for_vregs(i32 %val0) { ; CHECK-LABEL: bcnt032_not_for_vregs: ; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_bcnt_u32_b32 v0, v0, 0 -; CHECK-NEXT: v_sub_u32_e32 v0, 32, v0 -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use v0 -; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CHECK-NEXT: v_readfirstlane_b32 s0, v0 -; CHECK-NEXT: ; return to shader part epilog - %val0 = trunc i64 %val to i32 +; CHECK-NEXT: s_setpc_b64 s[30:31] %result = call i32 @llvm.ctpop.i32(i32 %val0) %result2 = sub i32 32, %result - call void asm "; use $0", "s"(i32 %result2) %cmp = icmp ne i32 %result2, 0 %zext = zext i1 %cmp to i32 ret i32 %zext } -define amdgpu_ps i32 @bcnt064_not_for_vregs(i64 %val0) { +define i32 @bcnt064_not_for_vregs(i64 %val0) { ; CHECK-LABEL: bcnt064_not_for_vregs: ; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_bcnt_u32_b32 v0, v0, 0 ; CHECK-NEXT: v_bcnt_u32_b32 v0, v1, v0 -; CHECK-NEXT: v_sub_co_u32_e32 v0, vcc, 64, v0 -; CHECK-NEXT: v_subb_co_u32_e64 v1, s[0:1], 0, 0, vcc -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use v[0:1] -; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 64, v[0:1] ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CHECK-NEXT: v_readfirstlane_b32 s0, v0 -; CHECK-NEXT: ; return to shader part epilog +; CHECK-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.ctpop.i64(i64 %val0) %result2 = sub i64 64, %result - call void asm "; use $0", "s"(i64 %result2) %cmp = icmp ne i64 %result2, 0 %zext = zext i1 %cmp to i32 ret i32 %zext