Skip to content

Commit 165f82d

Browse files
committed
Review changes:
- Add tests
- Remove builtin (users will need inline assembly if pattern match fails)
1 parent 1030ef3 commit 165f82d

File tree

3 files changed

+109
-5
lines changed

3 files changed

+109
-5
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,9 +63,6 @@ BUILTIN(__builtin_amdgcn_grid_size_z, "Ui", "nc")
6363
BUILTIN(__builtin_amdgcn_mbcnt_hi, "UiUiUi", "nc")
6464
BUILTIN(__builtin_amdgcn_mbcnt_lo, "UiUiUi", "nc")
6565

66-
BUILTIN(__builtin_amdgcn_bcnt32_lo, "UiUi", "nc")
67-
BUILTIN(__builtin_amdgcn_bcnt64_lo, "UiWUi", "nc")
68-
6966
TARGET_BUILTIN(__builtin_amdgcn_s_memtime, "WUi", "n", "s-memtime-inst")
7067

7168
//===----------------------------------------------------------------------===//

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2360,11 +2360,9 @@ def int_amdgcn_mbcnt_hi :
23602360
[IntrNoMem]>;
23612361

23622362
def int_amdgcn_bcnt32_lo :
2363-
ClangBuiltin<"__builtin_amdgcn_bcnt32_lo">,
23642363
DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
23652364

23662365
def int_amdgcn_bcnt64_lo :
2367-
ClangBuiltin<"__builtin_amdgcn_bcnt64_lo">,
23682366
DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>;
23692367

23702368
// llvm.amdgcn.ds.swizzle src offset

llvm/test/CodeGen/AMDGPU/s_cmp_0.ll

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -621,3 +621,112 @@ if:
621621
endif:
622622
ret i32 1
623623
}
624+
625+
define amdgpu_ps void @bcnt032_not_for_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) {
626+
; CHECK-LABEL: bcnt032_not_for_vregs:
627+
; CHECK: ; %bb.0:
628+
; CHECK-NEXT: s_lshl_b32 s0, s0, 2
629+
; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
630+
; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
631+
; CHECK-NEXT: global_load_dword v2, v[2:3], off glc
632+
; CHECK-NEXT: s_waitcnt vmcnt(0)
633+
; CHECK-NEXT: v_bcnt_u32_b32 v2, v2, 0
634+
; CHECK-NEXT: v_sub_u32_e32 v3, 32, v2
635+
; CHECK-NEXT: ;;#ASMSTART
636+
; CHECK-NEXT: ; use v3
637+
; CHECK-NEXT: ;;#ASMEND
638+
; CHECK-NEXT: global_store_dword v[0:1], v2, off
639+
; CHECK-NEXT: s_endpgm
640+
%tid = call i32 @llvm.amdgcn.workitem.id.x()
641+
%gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid
642+
%val0 = load volatile i32, ptr addrspace(1) %gep
643+
%result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
644+
%result2 = sub i32 32, %result
645+
call void asm "; use $0", "s"(i32 %result2)
646+
%cmp = icmp ne i32 %result2, 0
647+
%zext = zext i1 %cmp to i32
648+
store i32 %result, ptr addrspace(1) %out
649+
ret void
650+
}
651+
652+
define amdgpu_ps void @bcnt064_not_for_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) {
653+
; CHECK-LABEL: bcnt064_not_for_vregs:
654+
; CHECK: ; %bb.0:
655+
; CHECK-NEXT: s_lshl_b32 s0, s0, 2
656+
; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
657+
; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
658+
; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off glc
659+
; CHECK-NEXT: s_waitcnt vmcnt(0)
660+
; CHECK-NEXT: v_mov_b32_e32 v4, 0
661+
; CHECK-NEXT: v_bcnt_u32_b32 v2, v2, 0
662+
; CHECK-NEXT: v_bcnt_u32_b32 v3, v3, v2
663+
; CHECK-NEXT: v_sub_co_u32_e32 v5, vcc, 64, v3
664+
; CHECK-NEXT: v_subb_co_u32_e64 v6, s[0:1], 0, 0, vcc
665+
; CHECK-NEXT: ;;#ASMSTART
666+
; CHECK-NEXT: ; use v[5:6]
667+
; CHECK-NEXT: ;;#ASMEND
668+
; CHECK-NEXT: global_store_dwordx2 v[0:1], v[3:4], off
669+
; CHECK-NEXT: s_endpgm
670+
%tid = call i32 @llvm.amdgcn.workitem.id.x()
671+
%gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid
672+
%val0 = load volatile i64, ptr addrspace(1) %gep
673+
%result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone
674+
%result2 = sub i64 64, %result
675+
call void asm "; use $0", "s"(i64 %result2)
676+
%cmp = icmp ne i64 %result2, 0
677+
%zext = zext i1 %cmp to i32
678+
store i64 %result, ptr addrspace(1) %out
679+
ret void
680+
}
681+
682+
define amdgpu_ps i32 @bcnt032_ctpop_multiple_uses(i32 inreg %val0) {
683+
; CHECK-LABEL: bcnt032_ctpop_multiple_uses:
684+
; CHECK: ; %bb.0:
685+
; CHECK-NEXT: s_bcnt1_i32_b32 s0, s0
686+
; CHECK-NEXT: s_sub_i32 s1, 32, s0
687+
; CHECK-NEXT: s_cmp_lg_u32 s1, 0
688+
; CHECK-NEXT: ;;#ASMSTART
689+
; CHECK-NEXT: ; use s0
690+
; CHECK-NEXT: ;;#ASMEND
691+
; CHECK-NEXT: ;;#ASMSTART
692+
; CHECK-NEXT: ; use s1
693+
; CHECK-NEXT: ;;#ASMEND
694+
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
695+
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
696+
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
697+
; CHECK-NEXT: ; return to shader part epilog
698+
%result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
699+
%result2 = sub i32 32, %result
700+
call void asm "; use $0", "s"(i32 %result)
701+
call void asm "; use $0", "s"(i32 %result2)
702+
%cmp = icmp ne i32 %result2, 0
703+
%zext = zext i1 %cmp to i32
704+
ret i32 %zext
705+
}
706+
707+
define amdgpu_ps i32 @bcnt064_ctpop_multiple_uses(i64 inreg %val0) {
708+
; CHECK-LABEL: bcnt064_ctpop_multiple_uses:
709+
; CHECK: ; %bb.0:
710+
; CHECK-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
711+
; CHECK-NEXT: s_sub_u32 s2, 64, s0
712+
; CHECK-NEXT: s_subb_u32 s3, 0, 0
713+
; CHECK-NEXT: s_mov_b32 s1, 0
714+
; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
715+
; CHECK-NEXT: ;;#ASMSTART
716+
; CHECK-NEXT: ; use s[0:1]
717+
; CHECK-NEXT: ;;#ASMEND
718+
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
719+
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
720+
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
721+
; CHECK-NEXT: ;;#ASMSTART
722+
; CHECK-NEXT: ; use s[2:3]
723+
; CHECK-NEXT: ;;#ASMEND
724+
; CHECK-NEXT: ; return to shader part epilog
725+
%result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone
726+
%result2 = sub i64 64, %result
727+
call void asm "; use $0", "s"(i64 %result)
728+
call void asm "; use $0", "s"(i64 %result2)
729+
%cmp = icmp ne i64 %result2, 0
730+
%zext = zext i1 %cmp to i32
731+
ret i32 %zext
732+
}

0 commit comments

Comments
 (0)