-
Notifications
You must be signed in to change notification settings - Fork 15.1k
[AMDGPU] Try to reuse in v_cndmask register with constant from compare. #131146
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
1f8eabd
ffad027
c5881f2
51827a6
ed642aa
1ba6191
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,171 @@ | ||
| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 | ||
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX9 | ||
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX10 | ||
|
|
||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What about with an inline immediate? It shouldn't be any better? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. In those cases the immediate has already been inlined into v_cndmask where that was possible. E.g., in this test the immediates are successfully inlined for gfx1030 before the patch gets to process v_cndmask; in such cases the patch simply skips the instruction. |
||
| ; f32 `fcmp oeq` with the constant as the second compare operand; the select | ||
| ; yields that same constant when the compare is true and %arg1 otherwise. | ||
| define float @f32_oeq_v_i(float %arg, float %arg1) { | ||
| ; GFX9-LABEL: f32_oeq_v_i: | ||
| ; GFX9: ; %bb.0: ; %bb | ||
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | ||
| ; GFX9-NEXT: s_mov_b32 s4, 0x3e7ae148 | ||
| ; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, s4, v0 | ||
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc | ||
| ; GFX9-NEXT: s_setpc_b64 s[30:31] | ||
| ; | ||
| ; GFX10-LABEL: f32_oeq_v_i: | ||
| ; GFX10: ; %bb.0: ; %bb | ||
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | ||
| ; GFX10-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0x3e7ae148, v0 | ||
| ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x3e7ae148, v1, vcc_lo | ||
| ; GFX10-NEXT: s_setpc_b64 s[30:31] | ||
| bb: | ||
| %fcmp = fcmp oeq float %arg, 0x3FCF5C2900000000 | ||
| %select = select i1 %fcmp, float 0x3FCF5C2900000000, float %arg1 | ||
| ret float %select | ||
| } | ||
|
|
||
| ; f32 `fcmp oeq` with the constant as the first compare operand; the select | ||
| ; yields that same constant when the compare is true and %arg1 otherwise. | ||
| define float @f32_oeq_i_v(float %arg, float %arg1) { | ||
| ; GFX9-LABEL: f32_oeq_i_v: | ||
| ; GFX9: ; %bb.0: ; %bb | ||
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | ||
| ; GFX9-NEXT: s_mov_b32 s4, 0x3e7ae148 | ||
| ; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, s4, v0 | ||
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc | ||
| ; GFX9-NEXT: s_setpc_b64 s[30:31] | ||
| ; | ||
| ; GFX10-LABEL: f32_oeq_i_v: | ||
| ; GFX10: ; %bb.0: ; %bb | ||
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | ||
| ; GFX10-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0x3e7ae148, v0 | ||
| ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x3e7ae148, v1, vcc_lo | ||
| ; GFX10-NEXT: s_setpc_b64 s[30:31] | ||
| bb: | ||
| %fcmp = fcmp oeq float 0x3FCF5C2900000000, %arg | ||
| %select = select i1 %fcmp, float 0x3FCF5C2900000000, float %arg1 | ||
| ret float %select | ||
| } | ||
|
|
||
| ; f32 `fcmp one` with the constant as the second compare operand; the select | ||
| ; yields %arg1 when the compare is true and the compared constant otherwise. | ||
| define float @f32_one_v_i(float %arg, float %arg1) { | ||
| ; GFX9-LABEL: f32_one_v_i: | ||
| ; GFX9: ; %bb.0: ; %bb | ||
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | ||
| ; GFX9-NEXT: s_mov_b32 s4, 0x3e7ae148 | ||
| ; GFX9-NEXT: v_cmp_lg_f32_e32 vcc, s4, v0 | ||
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc | ||
| ; GFX9-NEXT: s_setpc_b64 s[30:31] | ||
| ; | ||
| ; GFX10-LABEL: f32_one_v_i: | ||
| ; GFX10: ; %bb.0: ; %bb | ||
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | ||
| ; GFX10-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0x3e7ae148, v0 | ||
| ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x3e7ae148, v1, vcc_lo | ||
| ; GFX10-NEXT: s_setpc_b64 s[30:31] | ||
| bb: | ||
| %fcmp = fcmp one float %arg, 0x3FCF5C2900000000 | ||
| %select = select i1 %fcmp, float %arg1, float 0x3FCF5C2900000000 | ||
| ret float %select | ||
| } | ||
|
|
||
| ; f32 `fcmp one` with the constant as the first compare operand (the commuted | ||
| ; form of f32_one_v_i); the select yields %arg1 when the compare is true and | ||
| ; the compared constant otherwise. | ||
| define float @f32_one_i_v(float %arg, float %arg1) { | ||
| ; GFX9-LABEL: f32_one_i_v: | ||
| ; GFX9: ; %bb.0: ; %bb | ||
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | ||
| ; GFX9-NEXT: s_mov_b32 s4, 0x3e7ae148 | ||
| ; GFX9-NEXT: v_cmp_lg_f32_e32 vcc, s4, v0 | ||
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc | ||
| ; GFX9-NEXT: s_setpc_b64 s[30:31] | ||
| ; | ||
| ; GFX10-LABEL: f32_one_i_v: | ||
| ; GFX10: ; %bb.0: ; %bb | ||
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | ||
| ; GFX10-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0x3e7ae148, v0 | ||
| ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x3e7ae148, v1, vcc_lo | ||
| ; GFX10-NEXT: s_setpc_b64 s[30:31] | ||
| bb: | ||
| %fcmp = fcmp one float 0x3FCF5C2900000000, %arg | ||
| %select = select i1 %fcmp, float %arg1, float 0x3FCF5C2900000000 | ||
| ret float %select | ||
| } | ||
|
|
||
| ; i32 `icmp eq` with the constant as the second compare operand; the select | ||
| ; yields that same constant when the compare is true and %arg1 otherwise. | ||
| define i32 @i32_eq_v_i(i32 %arg, i32 %arg1) { | ||
| ; GFX9-LABEL: i32_eq_v_i: | ||
| ; GFX9: ; %bb.0: ; %bb | ||
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | ||
| ; GFX9-NEXT: s_mov_b32 s4, 0x67932 | ||
| ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0 | ||
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc | ||
| ; GFX9-NEXT: s_setpc_b64 s[30:31] | ||
| ; | ||
| ; GFX10-LABEL: i32_eq_v_i: | ||
| ; GFX10: ; %bb.0: ; %bb | ||
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | ||
| ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0x67932, v0 | ||
| ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x67932, v1, vcc_lo | ||
| ; GFX10-NEXT: s_setpc_b64 s[30:31] | ||
| bb: | ||
| %icmp = icmp eq i32 %arg, 424242 | ||
| %select = select i1 %icmp, i32 424242, i32 %arg1 | ||
| ret i32 %select | ||
| } | ||
|
|
||
| ; i32 `icmp eq` with the constant as the first compare operand; the select | ||
| ; yields that same constant when the compare is true and %arg1 otherwise. | ||
| define i32 @i32_eq_i_v(i32 %arg, i32 %arg1) { | ||
| ; GFX9-LABEL: i32_eq_i_v: | ||
| ; GFX9: ; %bb.0: ; %bb | ||
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | ||
| ; GFX9-NEXT: s_mov_b32 s4, 0x67932 | ||
| ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0 | ||
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc | ||
| ; GFX9-NEXT: s_setpc_b64 s[30:31] | ||
| ; | ||
| ; GFX10-LABEL: i32_eq_i_v: | ||
| ; GFX10: ; %bb.0: ; %bb | ||
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | ||
| ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0x67932, v0 | ||
| ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x67932, v1, vcc_lo | ||
| ; GFX10-NEXT: s_setpc_b64 s[30:31] | ||
| bb: | ||
| %icmp = icmp eq i32 424242, %arg | ||
| %select = select i1 %icmp, i32 424242, i32 %arg1 | ||
| ret i32 %select | ||
| } | ||
|
|
||
| ; i32 `icmp ne` with the constant as the second compare operand; the select | ||
| ; yields %arg1 when the compare is true and the compared constant otherwise. | ||
| define i32 @i32_ne_v_i(i32 %arg, i32 %arg1) { | ||
| ; GFX9-LABEL: i32_ne_v_i: | ||
| ; GFX9: ; %bb.0: ; %bb | ||
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | ||
| ; GFX9-NEXT: s_mov_b32 s4, 0x67932 | ||
| ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0 | ||
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc | ||
| ; GFX9-NEXT: s_setpc_b64 s[30:31] | ||
| ; | ||
| ; GFX10-LABEL: i32_ne_v_i: | ||
| ; GFX10: ; %bb.0: ; %bb | ||
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | ||
| ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0x67932, v0 | ||
| ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x67932, v1, vcc_lo | ||
| ; GFX10-NEXT: s_setpc_b64 s[30:31] | ||
| bb: | ||
| %icmp = icmp ne i32 %arg, 424242 | ||
| %select = select i1 %icmp, i32 %arg1, i32 424242 | ||
| ret i32 %select | ||
| } | ||
|
|
||
| ; i32 `icmp ne` with the constant as the first compare operand; the select | ||
| ; yields %arg1 when the compare is true and the compared constant otherwise. | ||
| define i32 @i32_ne_i_v(i32 %arg, i32 %arg1) { | ||
| ; GFX9-LABEL: i32_ne_i_v: | ||
| ; GFX9: ; %bb.0: ; %bb | ||
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | ||
| ; GFX9-NEXT: s_mov_b32 s4, 0x67932 | ||
| ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0 | ||
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc | ||
| ; GFX9-NEXT: s_setpc_b64 s[30:31] | ||
| ; | ||
| ; GFX10-LABEL: i32_ne_i_v: | ||
| ; GFX10: ; %bb.0: ; %bb | ||
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | ||
| ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0x67932, v0 | ||
| ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x67932, v1, vcc_lo | ||
| ; GFX10-NEXT: s_setpc_b64 s[30:31] | ||
| bb: | ||
| %icmp = icmp ne i32 424242, %arg | ||
| %select = select i1 %icmp, i32 %arg1, i32 424242 | ||
| ret i32 %select | ||
| } | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Test half, i16, and bfloat cases. Plus 64-bit. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
I added the half and i16 types. For the bfloat case we get no advantage here, since the immediate is stored in two registers in any case: one (shifted left) for the compare and a second (the original value) for the cndmask. For the 64-bit types this folding doesn't work yet, since they are lowered into a different cmp/cndmask pattern that uses pairs of registers (with REG_SEQUENCEs). I would implement that incrementally in an additional patch. |
||
Uh oh!
There was an error while loading. Please reload this page.