-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[AMDGPU] Match bitsin(typeof(x)) - popcnt(x) to s_bcnt0_i32 #164847
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
9682587
5d475c6
1396fb7
11918fd
db5c6f9
b540b4c
e82de80
2fe905a
25cc6e3
00beb85
719af87
675b5fb
71687c2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,85 @@ | ||
| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 | ||
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Add a GlobalISel run line to check that the patterns work there too?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @jayfoad, the negative tests crash GlobalISel, so I can't add a check unless I break out the positive tests into a separate file. I'll do that if you like, but I think a better approach would be to file a JIRA issue to look into that.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'm surprised anything here breaks GlobalISel. I'd expect this of all things to work better. What is the crash?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
It requires the test WITHOUT your suggested changes to cause the crash. I updated them per your comments, but please let me know if you'd like me to revert that commit to preserve the crashing testcase. (I can add an XFAIL line.)
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is failing on the broken v-to-s copy for the asm input. I think this is the GlobalISel-flavored version of the case the DAG path miscompiles into ignoring the SGPR constraint. |
||
|
|
||
| ; Negative test (32-bit): %val0 is a plain, non-inreg argument of a default calling-convention function. | ||
| ; The body computes (32 - ctpop(%val0)) != 0 and returns it zero-extended to i32. | ||
| ; The expected output uses only vector instructions (v_bcnt_u32_b32 followed by a compare against 32); | ||
| ; no s_bcnt0_i32_b32 is expected here. | ||
| define i32 @bcnt032_not_for_vregs(i32 %val0) { | ||
| ; CHECK-LABEL: bcnt032_not_for_vregs: | ||
| ; CHECK: ; %bb.0: | ||
| ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | ||
| ; CHECK-NEXT: v_bcnt_u32_b32 v0, v0, 0 | ||
| ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0 | ||
| ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc | ||
| ; CHECK-NEXT: s_setpc_b64 s[30:31] | ||
| %result = call i32 @llvm.ctpop.i32(i32 %val0) | ||
| %result2 = sub i32 32, %result | ||
| %cmp = icmp ne i32 %result2, 0 | ||
| %zext = zext i1 %cmp to i32 | ||
| ret i32 %zext | ||
| } | ||
|
|
||
| ; Negative test (64-bit): same shape as the 32-bit variant, but on an i64 non-inreg argument. | ||
| ; The body computes (64 - ctpop(%val0)) != 0 and returns it zero-extended to i32. | ||
| ; The expected output counts bits with two chained v_bcnt_u32_b32 instructions (the second one | ||
| ; takes the first count as its addend operand), zeroes the high half, and compares the 64-bit | ||
| ; pair against 64; no s_bcnt0_i32_b64 is expected here. | ||
| define i32 @bcnt064_not_for_vregs(i64 %val0) { | ||
| ; CHECK-LABEL: bcnt064_not_for_vregs: | ||
| ; CHECK: ; %bb.0: | ||
| ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | ||
| ; CHECK-NEXT: v_bcnt_u32_b32 v0, v0, 0 | ||
| ; CHECK-NEXT: v_bcnt_u32_b32 v0, v1, v0 | ||
| ; CHECK-NEXT: v_mov_b32_e32 v1, 0 | ||
| ; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 64, v[0:1] | ||
| ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc | ||
| ; CHECK-NEXT: s_setpc_b64 s[30:31] | ||
| %result = call i64 @llvm.ctpop.i64(i64 %val0) | ||
| %result2 = sub i64 64, %result | ||
| %cmp = icmp ne i64 %result2, 0 | ||
| %zext = zext i1 %cmp to i32 | ||
| ret i32 %zext | ||
| } | ||
|
|
||
| ; Multiple-use test (32-bit): %val0 is an inreg argument of an amdgpu_ps function. | ||
| ; The ctpop result feeds both the subtraction from 32 and an inline-asm operand with an "s" | ||
| ; constraint; the subtraction result is likewise passed to inline asm and compared against 0. | ||
| ; The expected output keeps s_bcnt1_i32_b32 for the popcount and also emits s_bcnt0_i32_b32 | ||
| ; for the subtracted value. | ||
| define amdgpu_ps i32 @bcnt032_ctpop_multiple_uses(i32 inreg %val0) { | ||
| ; CHECK-LABEL: bcnt032_ctpop_multiple_uses: | ||
| ; CHECK: ; %bb.0: | ||
| ; CHECK-NEXT: s_bcnt1_i32_b32 s1, s0 | ||
| ; CHECK-NEXT: s_bcnt0_i32_b32 s0, s0 | ||
|
Comment on lines
+39
to
+40
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure we should do this in the multiple use case. It's not worse, but it's trading for an equivalently good instruction
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @arsenm I thought so, too, but they're actually not equivalent because of
|
||
| ; CHECK-NEXT: ;;#ASMSTART | ||
| ; CHECK-NEXT: ; use s1 | ||
| ; CHECK-NEXT: ;;#ASMEND | ||
| ; CHECK-NEXT: ;;#ASMSTART | ||
| ; CHECK-NEXT: ; use s0 | ||
| ; CHECK-NEXT: ;;#ASMEND | ||
| ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 | ||
| ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] | ||
| ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 | ||
| ; CHECK-NEXT: ; return to shader part epilog | ||
| %result = call i32 @llvm.ctpop.i32(i32 %val0) | ||
| %result2 = sub i32 32, %result | ||
| call void asm "; use $0", "s"(i32 %result) | ||
| call void asm "; use $0", "s"(i32 %result2) | ||
| %cmp = icmp ne i32 %result2, 0 | ||
| %zext = zext i1 %cmp to i32 | ||
| ret i32 %zext | ||
| } | ||
|
|
||
| ; Multiple-use test (64-bit): i64 inreg counterpart of the 32-bit multiple-use test. | ||
| ; The ctpop result feeds both the subtraction from 64 and an inline-asm operand with an "s" | ||
| ; constraint; the subtraction result is likewise passed to inline asm and compared against 0. | ||
| ; The expected output emits both s_bcnt1_i32_b64 and s_bcnt0_i32_b64 on the same source pair, | ||
| ; with the high halves of the two 64-bit results supplied from a zeroed register (s3). | ||
| define amdgpu_ps i32 @bcnt064_ctpop_multiple_uses(i64 inreg %val0) { | ||
| ; CHECK-LABEL: bcnt064_ctpop_multiple_uses: | ||
| ; CHECK: ; %bb.0: | ||
| ; CHECK-NEXT: s_mov_b32 s3, 0 | ||
| ; CHECK-NEXT: s_bcnt1_i32_b64 s2, s[0:1] | ||
| ; CHECK-NEXT: s_bcnt0_i32_b64 s0, s[0:1] | ||
| ; CHECK-NEXT: s_mov_b32 s1, s3 | ||
| ; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 | ||
| ; CHECK-NEXT: ;;#ASMSTART | ||
| ; CHECK-NEXT: ; use s[0:1] | ||
| ; CHECK-NEXT: ;;#ASMEND | ||
| ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 | ||
| ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] | ||
| ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 | ||
| ; CHECK-NEXT: ;;#ASMSTART | ||
| ; CHECK-NEXT: ; use s[2:3] | ||
| ; CHECK-NEXT: ;;#ASMEND | ||
| ; CHECK-NEXT: ; return to shader part epilog | ||
| %result = call i64 @llvm.ctpop.i64(i64 %val0) | ||
| %result2 = sub i64 64, %result | ||
| call void asm "; use $0", "s"(i64 %result) | ||
| call void asm "; use $0", "s"(i64 %result2) | ||
| %cmp = icmp ne i64 %result2, 0 | ||
| %zext = zext i1 %cmp to i32 | ||
| ret i32 %zext | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Will need to put this in AMDGPUInstructions.td:
def ctpop_oneuse : HasOneUseUnaryOp<ctpop>;
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@arsenm see #164847 (comment)